From fce09ea314505a52f2436397608fa0a5d0934fb1 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Sat, 2 Sep 2023 08:48:38 +0800 Subject: apparmor: Fix null pointer deref when receiving skb during sock creation The panic below is observed when receiving ICMP packets with secmark set while an ICMP raw socket is being created. SK_CTX(sk)->label is updated in apparmor_socket_post_create(), but the packet is delivered to the socket before that, causing the null pointer dereference. Drop the packet if label context is not set. BUG: kernel NULL pointer dereference, address: 000000000000004c #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 407 Comm: a.out Not tainted 6.4.12-arch1-1 #1 3e6fa2753a2d75925c34ecb78e22e85a65d083df Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/28/2020 RIP: 0010:aa_label_next_confined+0xb/0x40 Code: 00 00 48 89 ef e8 d5 25 0c 00 e9 66 ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 0f 1f 44 00 00 89 f0 <8b> 77 4c 39 c6 7e 1f 48 63 d0 48 8d 14 d7 eb 0b 83 c0 01 48 83 c2 RSP: 0018:ffffa92940003b08 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000000000e RDX: ffffa92940003be8 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff8b57471e7800 R08: ffff8b574c642400 R09: 0000000000000002 R10: ffffffffbd820eeb R11: ffffffffbeb7ff00 R12: ffff8b574c642400 R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000 FS: 00007fb092ea7640(0000) GS:ffff8b577bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000004c CR3: 00000001020f2005 CR4: 00000000007706f0 PKRU: 55555554 Call Trace: ? __die+0x23/0x70 ? page_fault_oops+0x171/0x4e0 ? exc_page_fault+0x7f/0x180 ? asm_exc_page_fault+0x26/0x30 ? aa_label_next_confined+0xb/0x40 apparmor_secmark_check+0xec/0x330 security_sock_rcv_skb+0x35/0x50 sk_filter_trim_cap+0x47/0x250 sock_queue_rcv_skb_reason+0x20/0x60 raw_rcv+0x13c/0x210 raw_local_deliver+0x1f3/0x250 ip_protocol_deliver_rcu+0x4f/0x2f0 ip_local_deliver_finish+0x76/0xa0 __netif_receive_skb_one_core+0x89/0xa0 netif_receive_skb+0x119/0x170 ? __netdev_alloc_skb+0x3d/0x140 vmxnet3_rq_rx_complete+0xb23/0x1010 [vmxnet3 56a84f9c97178c57a43a24ec073b45a9d6f01f3a] vmxnet3_poll_rx_only+0x36/0xb0 [vmxnet3 56a84f9c97178c57a43a24ec073b45a9d6f01f3a] __napi_poll+0x28/0x1b0 net_rx_action+0x2a4/0x380 __do_softirq+0xd1/0x2c8 __irq_exit_rcu+0xbb/0xf0 common_interrupt+0x86/0xa0 asm_common_interrupt+0x26/0x40 RIP: 0010:apparmor_socket_post_create+0xb/0x200 Code: 08 48 85 ff 75 a1 eb b1 0f 1f 80 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 41 54 <55> 48 89 fd 53 45 85 c0 0f 84 b2 00 00 00 48 8b 1d 80 56 3f 02 48 RSP: 0018:ffffa92940ce7e50 EFLAGS: 00000286 RAX: ffffffffbc756440 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 0000000000000003 RSI: 0000000000000002 RDI: ffff8b574eaab740 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 R10: ffff8b57444cec70 R11: 0000000000000000 R12: 0000000000000003 R13: 0000000000000002 R14: ffff8b574eaab740 R15: ffffffffbd8e4748 ? __pfx_apparmor_socket_post_create+0x10/0x10 security_socket_post_create+0x4b/0x80 __sock_create+0x176/0x1f0 __sys_socket+0x89/0x100 __x64_sys_socket+0x17/0x20 do_syscall_64+0x5d/0x90 ? do_syscall_64+0x6c/0x90 ? do_syscall_64+0x6c/0x90 ? 
do_syscall_64+0x6c/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc Fixes: ab9f2115081a ("apparmor: Allow filtering based on secmark policy") Signed-off-by: Xiao Liang Signed-off-by: John Johansen --- security/apparmor/lsm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index cef8c466af80..c2f73d281724 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1304,6 +1304,13 @@ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) if (!skb->secmark) return 0; + /* + * If reach here before socket_post_create hook is called, in which + * case label is null, drop the packet. + */ + if (!ctx->label) + return -EACCES; + return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE, skb->secmark, sk); } -- cgit From 2bc73505a5cd2a18a7a542022722f136c19e3b87 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Thu, 1 Feb 2024 17:24:48 +0300 Subject: apparmor: use kvfree_sensitive to free data->data Inside unpack_profile() data->data is allocated using kvmemdup() so it should be freed with the corresponding kvfree_sensitive(). Also add missing data->data release for rhashtable insertion failure path in unpack_profile(). Found by Linux Verification Center (linuxtesting.org). Fixes: e025be0f26d5 ("apparmor: support querying extended trusted helper extra data") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Signed-off-by: John Johansen --- security/apparmor/policy.c | 2 +- security/apparmor/policy_unpack.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 957654d253dd..14df15e35695 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -225,7 +225,7 @@ static void aa_free_data(void *ptr, void *arg) { struct aa_data *data = ptr; - kfree_sensitive(data->data); + kvfree_sensitive(data->data, data->size); kfree_sensitive(data->key); kfree_sensitive(data); } diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 5e578ef0ddff..75452acd0e35 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -1071,6 +1071,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (rhashtable_insert_fast(profile->data, &data->head, profile->data->p)) { + kvfree_sensitive(data->data, data->size); kfree_sensitive(data->key); kfree_sensitive(data); info = "failed to insert data to table"; -- cgit From 4a8db3678403e34c31df6b99a26414d5e183538c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 4 Mar 2024 16:36:55 +0000 Subject: apparmor: remove useless static inline function is_deleted The inlined function is_deleted is redundant, it is not called at all from any function in security/apparmor/file.c and so it can be removed. 
Cleans up clang scan build warning: security/apparmor/file.c:153:20: warning: unused function 'is_deleted' [-Wunused-function] Signed-off-by: Colin Ian King Signed-off-by: John Johansen --- security/apparmor/file.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/security/apparmor/file.c b/security/apparmor/file.c index c03eb7c19f16..d52a5b14dad4 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -144,19 +144,6 @@ int aa_audit_file(const struct cred *subj_cred, return aa_audit(type, profile, &ad, file_audit_cb); } -/** - * is_deleted - test if a file has been completely unlinked - * @dentry: dentry of file to test for deletion (NOT NULL) - * - * Returns: true if deleted else false - */ -static inline bool is_deleted(struct dentry *dentry) -{ - if (d_unlinked(dentry) && d_backing_inode(dentry)->i_nlink == 0) - return true; - return false; -} - static int path_name(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, char *buffer, -- cgit From b2c858148acf96290b9a9af259a04e080a169f51 Mon Sep 17 00:00:00 2001 From: Christian Göttsche Date: Fri, 15 Mar 2024 13:54:09 +0100 Subject: apparmor: fix typo in kernel doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the typo in the function documentation to please kernel doc warnings. Signed-off-by: Christian Göttsche Reviewed-by: Kees Cook Signed-off-by: John Johansen --- security/apparmor/lsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index c2f73d281724..717204fba938 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1124,7 +1124,7 @@ static int apparmor_socket_create(int family, int type, int protocol, int kern) * @sock: socket that is being setup * @family: family of socket being created * @type: type of the socket - * @ptotocol: protocol of the socket + * @protocol: protocol of the socket * @kern: socket is a special kernel socket * * Note: -- cgit From 3dd384108d53834002be5630132ad5c3f32166ad Mon Sep 17 00:00:00 2001 From: Leesoo Ahn Date: Wed, 8 May 2024 01:12:29 +0900 Subject: apparmor: fix possible NULL pointer dereference profile->parent->dents[AAFS_PROF_DIR] could be NULL only if its parent is created by __create_missing_ancestors(..) and 'ent->old' is NULL in aa_replace_profiles(..). In that case, it must return an error code, and -ENOENT indicates that the parent's path does not exist yet.
BUG: kernel NULL pointer dereference, address: 0000000000000030 PGD 0 P4D 0 PREEMPT SMP PTI CPU: 4 PID: 3362 Comm: apparmor_parser Not tainted 6.8.0-24-generic #24 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 RIP: 0010:aafs_create.constprop.0+0x7f/0x130 Code: 4c 63 e0 48 83 c4 18 4c 89 e0 5b 41 5c 41 5d 41 5e 41 5f 5d 31 d2 31 c9 31 f6 31 ff 45 31 c0 45 31 c9 45 31 d2 c3 cc cc cc cc <4d> 8b 55 30 4d 8d ba a0 00 00 00 4c 89 55 c0 4c 89 ff e8 7a 6a ae RSP: 0018:ffffc9000b2c7c98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 00000000000041ed RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffc9000b2c7cd8 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82baac10 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007be9f22cf740(0000) GS:ffff88817bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000030 CR3: 0000000134b08000 CR4: 00000000000006f0 Call Trace: ? show_regs+0x6d/0x80 ? __die+0x24/0x80 ? page_fault_oops+0x99/0x1b0 ? kernelmode_fixup_or_oops+0xb2/0x140 ? __bad_area_nosemaphore+0x1a5/0x2c0 ? find_vma+0x34/0x60 ? bad_area_nosemaphore+0x16/0x30 ? do_user_addr_fault+0x2a2/0x6b0 ? exc_page_fault+0x83/0x1b0 ? asm_exc_page_fault+0x27/0x30 ? aafs_create.constprop.0+0x7f/0x130 ? aafs_create.constprop.0+0x51/0x130 __aafs_profile_mkdir+0x3d6/0x480 aa_replace_profiles+0x83f/0x1270 policy_update+0xe3/0x180 profile_load+0xbc/0x150 ? rw_verify_area+0x47/0x140 vfs_write+0x100/0x480 ? __x64_sys_openat+0x55/0xa0 ? syscall_exit_to_user_mode+0x86/0x260 ksys_write+0x73/0x100 __x64_sys_write+0x19/0x30 x64_sys_call+0x7e/0x25c0 do_syscall_64+0x7f/0x180 entry_SYSCALL_64_after_hwframe+0x78/0x80 RIP: 0033:0x7be9f211c574 Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d d5 ea 0e 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89 RSP: 002b:00007ffd26f2b8c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00005d504415e200 RCX: 00007be9f211c574 RDX: 0000000000001fc1 RSI: 00005d504418bc80 RDI: 0000000000000004 RBP: 0000000000001fc1 R08: 0000000000001fc1 R09: 0000000080000000 R10: 0000000000000000 R11: 0000000000000202 R12: 00005d504418bc80 R13: 0000000000000004 R14: 00007ffd26f2b9b0 R15: 00007ffd26f2ba30 Modules linked in: snd_seq_dummy snd_hrtimer qrtr snd_hda_codec_generic snd_hda_intel snd_intel_dspcfg snd_intel_sdw_acpi snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq snd_seq_device i2c_i801 snd_timer i2c_smbus qxl snd soundcore drm_ttm_helper lpc_ich ttm joydev input_leds serio_raw mac_hid binfmt_misc msr parport_pc ppdev lp parport efi_pstore nfnetlink dmi_sysfs qemu_fw_cfg ip_tables x_tables autofs4 hid_generic usbhid hid ahci libahci psmouse virtio_rng xhci_pci xhci_pci_renesas CR2: 0000000000000030 ---[ end trace 0000000000000000 ]--- RIP: 0010:aafs_create.constprop.0+0x7f/0x130 Code: 4c 63 e0 48 83 c4 18 4c 89 e0 5b 41 5c 41 5d 41 5e 41 5f 5d 31 d2 31 c9 31 f6 31 ff 45 31 c0 45 31 c9 45 31 d2 c3 cc cc cc cc <4d> 8b 55 30 4d 8d ba a0 00 00 00 4c 89 55 c0 4c 89 ff e8 7a 6a ae RSP: 0018:ffffc9000b2c7c98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 00000000000041ed RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffc9000b2c7cd8 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 
R12: ffffffff82baac10 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007be9f22cf740(0000) GS:ffff88817bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000030 CR3: 0000000134b08000 CR4: 00000000000006f0 Signed-off-by: Leesoo Ahn Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index bcfea073e3f2..01b923d97a44 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1692,6 +1692,10 @@ int __aafs_profile_mkdir(struct aa_profile *profile, struct dentry *parent) struct aa_profile *p; p = aa_deref_parent(profile); dent = prof_dir(p); + if (!dent) { + error = -ENOENT; + goto fail2; + } /* adding to parent that previously didn't have children */ dent = aafs_create_dir("profiles", dent); if (IS_ERR(dent)) -- cgit From 1bbf5a2156fa246ab54a15412116458006462038 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 9 Apr 2024 19:03:14 +0300 Subject: auxdisplay: charlcd: Provide a forward declaration While there is no compilation error, strictly speaking the compiler should know about used types beforehand. Provide a forward declaration for struct charlcd_ops before using it in struct charlcd. Signed-off-by: Andy Shevchenko --- drivers/auxdisplay/charlcd.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/auxdisplay/charlcd.h b/drivers/auxdisplay/charlcd.h index eed80063a6d2..4d4287209d04 100644 --- a/drivers/auxdisplay/charlcd.h +++ b/drivers/auxdisplay/charlcd.h @@ -36,6 +36,8 @@ enum charlcd_lines { CHARLCD_LINES_2, }; +struct charlcd_ops; + struct charlcd { const struct charlcd_ops *ops; const unsigned char *char_conv; /* Optional */ -- cgit From b4cf5fc01ce83e5c0bcf3dbb9f929428646b9098 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 30 May 2024 23:54:55 -0400 Subject: powerpc: fix a file leak in kvm_vcpu_ioctl_enable_cap() missing fdput() on one of the failure exits Fixes: eacc56bb9de3e # v5.2 Signed-off-by: Al Viro --- arch/powerpc/kvm/powerpc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index d32abe7fe6ab..d11767208bfc 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1984,8 +1984,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, break; r = -ENXIO; - if (!xive_enabled()) + if (!xive_enabled()) { + fdput(f); break; + } r = -EPERM; dev = kvm_device_from_filp(f.file); -- cgit From bba1f6758a9ec90c1adac5dcf78f8a15f1bad65b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 30 May 2024 23:58:26 -0400 Subject: lirc: rc_dev_get_from_fd(): fix file leak missing fdput() on a failure exit Fixes: 6a9d552483d50 "media: rc: bpf attach/detach requires write permission" # v6.9 Signed-off-by: Al Viro --- drivers/media/rc/lirc_dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 52aea4167718..717c441b4a86 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -828,8 +828,10 @@ struct rc_dev *rc_dev_get_from_fd(int fd, bool write) return ERR_PTR(-EINVAL); } - if (write && !(f.file->f_mode & FMODE_WRITE)) + if (write && !(f.file->f_mode & FMODE_WRITE)) { + fdput(f); return ERR_PTR(-EPERM); + } fh = f.file->private_data; dev = fh->rc; -- cgit From c8ffef985af564c1b4fc09fdb5a5b336bfc0c33c Mon Sep 17 00:00:00 2001 From: Chris Packham Date: 
Fri, 31 May 2024 11:20:54 +1200 Subject: auxdisplay: linedisp: Support configuring the boot message Like we do for charlcd, allow the configuration of the initial message on line-display devices. Signed-off-by: Chris Packham Reviewed-by: Geert Uytterhoeven Signed-off-by: Andy Shevchenko --- drivers/auxdisplay/Kconfig | 2 +- drivers/auxdisplay/line-display.c | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig index 69d2138d7efb..21545ffba065 100644 --- a/drivers/auxdisplay/Kconfig +++ b/drivers/auxdisplay/Kconfig @@ -316,7 +316,7 @@ endif # PARPORT_PANEL config PANEL_CHANGE_MESSAGE bool "Change LCD initialization message ?" - depends on CHARLCD + depends on CHARLCD || LINEDISP help This allows you to replace the boot message indicating the kernel version and the driver version with a custom message. This is useful on appliances diff --git a/drivers/auxdisplay/line-display.c b/drivers/auxdisplay/line-display.c index e2b546210f8d..978dc457ca5f 100644 --- a/drivers/auxdisplay/line-display.c +++ b/drivers/auxdisplay/line-display.c @@ -8,7 +8,9 @@ * Copyright (C) 2021 Glider bv */ +#ifndef CONFIG_PANEL_BOOT_MESSAGE #include +#endif #include #include @@ -312,6 +314,12 @@ static int linedisp_init_map(struct linedisp *linedisp) return 0; } +#ifdef CONFIG_PANEL_BOOT_MESSAGE +#define LINEDISP_INIT_TEXT CONFIG_PANEL_BOOT_MESSAGE +#else +#define LINEDISP_INIT_TEXT "Linux " UTS_RELEASE " " +#endif + /** * linedisp_register - register a character line display * @linedisp: pointer to character line display structure @@ -359,7 +367,7 @@ int linedisp_register(struct linedisp *linedisp, struct device *parent, goto out_del_timer; /* display a default message */ - err = linedisp_display(linedisp, "Linux " UTS_RELEASE " ", -1); + err = linedisp_display(linedisp, LINEDISP_INIT_TEXT, -1); if (err) goto out_del_dev; -- cgit From 4adbf7086dfdd6101508fd070cb1c0e0adc9513e Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 20:02:01 -0700 Subject: auxdisplay: linedisp: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/auxdisplay/line-display.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Reviewed-by: Geert Uytterhoeven Signed-off-by: Andy Shevchenko --- drivers/auxdisplay/line-display.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/auxdisplay/line-display.c b/drivers/auxdisplay/line-display.c index 978dc457ca5f..731ffdfafc4e 100644 --- a/drivers/auxdisplay/line-display.c +++ b/drivers/auxdisplay/line-display.c @@ -396,4 +396,5 @@ void linedisp_unregister(struct linedisp *linedisp) } EXPORT_SYMBOL_NS_GPL(linedisp_unregister, LINEDISP); +MODULE_DESCRIPTION("Character line display core support"); MODULE_LICENSE("GPL"); -- cgit From c56a45064e989998a616aadf88023cc9611416b7 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 19:50:15 -0700 Subject: auxdisplay: hd44780: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/auxdisplay/hd44780_common.o Add the missing invocation of the MODULE_DESCRIPTION() macro. 
Signed-off-by: Jeff Johnson Reviewed-by: Geert Uytterhoeven Signed-off-by: Andy Shevchenko --- drivers/auxdisplay/hd44780_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/auxdisplay/hd44780_common.c b/drivers/auxdisplay/hd44780_common.c index 7cbf375b0fa5..4ef87c3118c0 100644 --- a/drivers/auxdisplay/hd44780_common.c +++ b/drivers/auxdisplay/hd44780_common.c @@ -366,4 +366,5 @@ struct hd44780_common *hd44780_common_alloc(void) } EXPORT_SYMBOL_GPL(hd44780_common_alloc); +MODULE_DESCRIPTION("Common functions for HD44780 (and compatibles) LCD displays"); MODULE_LICENSE("GPL"); -- cgit From 632691ad83197227adc902793a2a3e10c04fe9b8 Mon Sep 17 00:00:00 2001 From: Erick Archer Date: Sun, 2 Jun 2024 10:49:22 +0200 Subject: auxdisplay: Use sizeof(*pointer) instead of sizeof(type) It is preferred to use sizeof(*pointer) instead of sizeof(type) due to the type of the variable can change and one needs not change the former (unlike the latter). This patch has no effect on runtime behavior. Signed-off-by: Erick Archer Reviewed-by: Geert Uytterhoeven Signed-off-by: Andy Shevchenko --- drivers/auxdisplay/arm-charlcd.c | 2 +- drivers/auxdisplay/hd44780.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/auxdisplay/arm-charlcd.c b/drivers/auxdisplay/arm-charlcd.c index 0b1c99cca733..a7eae99a48f7 100644 --- a/drivers/auxdisplay/arm-charlcd.c +++ b/drivers/auxdisplay/arm-charlcd.c @@ -270,7 +270,7 @@ static int __init charlcd_probe(struct platform_device *pdev) struct charlcd *lcd; struct resource *res; - lcd = kzalloc(sizeof(struct charlcd), GFP_KERNEL); + lcd = kzalloc(sizeof(*lcd), GFP_KERNEL); if (!lcd) return -ENOMEM; diff --git a/drivers/auxdisplay/hd44780.c b/drivers/auxdisplay/hd44780.c index 7ac0b1b1d548..025dc6855cb2 100644 --- a/drivers/auxdisplay/hd44780.c +++ b/drivers/auxdisplay/hd44780.c @@ -230,7 +230,7 @@ static int hd44780_probe(struct platform_device *pdev) if (!lcd) goto fail1; - hd = kzalloc(sizeof(struct hd44780), GFP_KERNEL); + hd = kzalloc(sizeof(*hd), GFP_KERNEL); if (!hd) goto fail2; -- cgit From 9b69b52cdde74a38c5ab4d89405b2bd384ec0155 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 20 May 2024 23:30:05 +0100 Subject: ARM: 9400/1: Remove unused struct 'mod_unwind_map' I think this has been unused since Commit b6f21d14f1ac ("ARM: 9204/2: module: Add all unwind tables when load module") Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/module.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 677f218f7e84..da488d92e7a0 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -395,11 +395,6 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, return 0; } -struct mod_unwind_map { - const Elf_Shdr *unw_sec; - const Elf_Shdr *txt_sec; -}; - static const Elf_Shdr *find_mod_section(const Elf32_Ehdr *hdr, const Elf_Shdr *sechdrs, const char *name) { -- cgit From 8ede71e1202011d8bfceeab4737e6d52d88688ab Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 29 May 2024 13:07:42 +0100 Subject: ARM: 9402/1: Kconfig: Spelling s/Cortex A-/Cortex-A/ Fix a misspelling of "Cortex-A9", to make it easier to find which errata are applicable to Cortex-A9 CPU cores. 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Russell King (Oracle) --- arch/arm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ee5115252aac..9e0749b22446 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -735,7 +735,7 @@ config ARM_ERRATA_764319 bool "ARM errata: Read to DBGPRSR and DBGOSLSR may generate Undefined instruction" depends on CPU_V7 help - This option enables the workaround for the 764319 Cortex A-9 erratum. + This option enables the workaround for the 764319 Cortex-A9 erratum. CP14 read accesses to the DBGPRSR and DBGOSLSR registers generate an unexpected Undefined Instruction exception when the DBGSWENABLE external pin is set to 0, even when the CP14 accesses are performed -- cgit From 9ac7ba12d37b4c24f6db675ff7edb7e54f597a59 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 29 May 2024 13:09:24 +0100 Subject: ARM: 9403/1: Alpine: Spelling s/initialiing/initializing/ Fix a misspelling of "initializing". Signed-off-by: Geert Uytterhoeven Signed-off-by: Russell King (Oracle) --- arch/arm/mach-alpine/alpine_cpu_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-alpine/alpine_cpu_pm.c b/arch/arm/mach-alpine/alpine_cpu_pm.c index 13ae8412e9ce..b48da6f12b6c 100644 --- a/arch/arm/mach-alpine/alpine_cpu_pm.c +++ b/arch/arm/mach-alpine/alpine_cpu_pm.c @@ -29,7 +29,7 @@ int alpine_cpu_wakeup(unsigned int phys_cpu, uint32_t phys_resume_addr) /* * Set CPU resume address - * secure firmware running on boot will jump to this address - * after setting proper CPU mode, and initialiing e.g. secure + * after setting proper CPU mode, and initializing e.g. secure * regs (the same mode all CPUs are booted to - usually HYP) */ writel(phys_resume_addr, -- cgit From ed0f941022515ff40473ea5335769a5dc2524a3f Mon Sep 17 00:00:00 2001 From: Yuntao Liu Date: Mon, 3 Jun 2024 16:37:50 +0100 Subject: ARM: 9404/1: arm32: enable HAVE_LD_DEAD_CODE_DATA_ELIMINATION The current arm32 architecture does not yet support the HAVE_LD_DEAD_CODE_DATA_ELIMINATION feature. arm32 is widely used in embedded scenarios, and enabling this feature would be beneficial for reducing the size of the kernel image. In order to make this work, we keep the necessary tables by annotating them with KEEP; it also requires further changes to the linker script to KEEP some tables and to wildcard compiler-generated sections into the right place. When using ld.lld for linking, KEEP is not recognized within the OVERLAY command, and Ard proposed a concise method to solve this problem. It boots normally with defconfig, vexpress_defconfig and tinyconfig. The size comparison of zImage is as follows:

  defconfig   vexpress_defconfig   tinyconfig
  5137712     5138024              424192       no dce
  5032560     4997824              298384       dce
  2.0%        2.7%                 29.7%        shrink

When using a smaller config file, there is a significant reduction in the size of the zImage. We also tested this patch on a commercially available single-board computer, and the comparison is as follows:

  a15eb_config
  2161384   no dce
  2092240   dce
  3.2%      shrink

The zImage size has been reduced by approximately 3.2%, which is 70KB out of 2.1M.
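The tables this patch wraps in KEEP() (.proc.info.init, .arch.info.init, __ex_table, .pv_table) are reached only through linker-provided boundary symbols, so no code references them by name and --gc-sections would otherwise be free to drop them. The userspace sketch below illustrates that pattern, not the kernel's actual machinery; the section name "proc_info", the entries, and the printed strings are invented for the example, while the __start_/__stop_ symbols are the ones GNU ld generates for sections whose names are valid C identifiers.

#include <stdio.h>

struct proc_info {
	const char *name;
};

/*
 * Emit each entry into a dedicated, identifier-named section. The "used"
 * attribute stops the compiler from discarding the otherwise unreferenced
 * objects; with --gc-sections, a linker script would still need
 * KEEP(*(proc_info)) to be sure the table survives.
 */
#define PROC_INFO_ENTRY(n)						\
	static const struct proc_info proc_info_##n			\
	__attribute__((section("proc_info"), used)) = { .name = #n }

PROC_INFO_ENTRY(arm920);
PROC_INFO_ENTRY(arm926);

/* Boundary symbols GNU ld provides for identifier-named sections. */
extern const struct proc_info __start_proc_info[];
extern const struct proc_info __stop_proc_info[];

int main(void)
{
	const struct proc_info *p;

	/* The table is walked purely through the boundary symbols. */
	for (p = __start_proc_info; p < __stop_proc_info; p++)
		printf("entry: %s\n", p->name);
	return 0;
}

With dead code elimination enabled, a table that is reachable only through such boundary symbols is exactly the kind of data the linker script has to KEEP() explicitly, which is what the vmlinux.lds.S changes in this patch do.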
Signed-off-by: Yuntao Liu Tested-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Reviewed-by: Linus Walleij Signed-off-by: Russell King (Oracle) --- arch/arm/Kconfig | 1 + arch/arm/boot/compressed/vmlinux.lds.S | 2 +- arch/arm/include/asm/vmlinux.lds.h | 2 +- arch/arm/kernel/entry-armv.S | 3 +++ arch/arm/kernel/vmlinux-xip.lds.S | 4 ++-- arch/arm/kernel/vmlinux.lds.S | 6 +++--- drivers/firmware/efi/libstub/Makefile | 4 ++++ 7 files changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9e0749b22446..08572703d94d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -115,6 +115,7 @@ config ARM select HAVE_KERNEL_XZ select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M select HAVE_KRETPROBES if HAVE_KPROBES + select HAVE_LD_DEAD_CODE_DATA_ELIMINATION select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPTPROBES if !THUMB2_KERNEL diff --git a/arch/arm/boot/compressed/vmlinux.lds.S b/arch/arm/boot/compressed/vmlinux.lds.S index 3fcb3e62dc56..d411abd4310e 100644 --- a/arch/arm/boot/compressed/vmlinux.lds.S +++ b/arch/arm/boot/compressed/vmlinux.lds.S @@ -125,7 +125,7 @@ SECTIONS . = BSS_START; __bss_start = .; - .bss : { *(.bss) } + .bss : { *(.bss .bss.*) } _end = .; . = ALIGN(8); /* the stack must be 64-bit aligned */ diff --git a/arch/arm/include/asm/vmlinux.lds.h b/arch/arm/include/asm/vmlinux.lds.h index 4c8632d5c432..d60f6e83a9f7 100644 --- a/arch/arm/include/asm/vmlinux.lds.h +++ b/arch/arm/include/asm/vmlinux.lds.h @@ -42,7 +42,7 @@ #define PROC_INFO \ . = ALIGN(4); \ __proc_info_begin = .; \ - *(.proc.info.init) \ + KEEP(*(.proc.info.init)) \ __proc_info_end = .; #define IDMAP_TEXT \ diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 6150a716828c..f01d23a220e6 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -1065,6 +1065,7 @@ vector_addrexcptn: .globl vector_fiq .section .vectors, "ax", %progbits + .reloc .text, R_ARM_NONE, . W(b) vector_rst W(b) vector_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_swi ) @@ -1078,6 +1079,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_swi ) #ifdef CONFIG_HARDEN_BRANCH_HISTORY .section .vectors.bhb.loop8, "ax", %progbits + .reloc .text, R_ARM_NONE, . W(b) vector_rst W(b) vector_bhb_loop8_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_loop8_swi ) @@ -1090,6 +1092,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_bhb_loop8_swi ) W(b) vector_bhb_loop8_fiq .section .vectors.bhb.bpiall, "ax", %progbits + .reloc .text, R_ARM_NONE, . W(b) vector_rst W(b) vector_bhb_bpiall_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_bpiall_swi ) diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S index c16d196b5aad..5eddb75a7174 100644 --- a/arch/arm/kernel/vmlinux-xip.lds.S +++ b/arch/arm/kernel/vmlinux-xip.lds.S @@ -63,7 +63,7 @@ SECTIONS . = ALIGN(4); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { __start___ex_table = .; - ARM_MMU_KEEP(*(__ex_table)) + ARM_MMU_KEEP(KEEP(*(__ex_table))) __stop___ex_table = .; } @@ -83,7 +83,7 @@ SECTIONS } .init.arch.info : { __arch_info_begin = .; - *(.arch.info.init) + KEEP(*(.arch.info.init)) __arch_info_end = .; } .init.tagtable : { diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index bd9127c4b451..de373c6c2ae8 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -74,7 +74,7 @@ SECTIONS . 
= ALIGN(4); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { __start___ex_table = .; - ARM_MMU_KEEP(*(__ex_table)) + ARM_MMU_KEEP(KEEP(*(__ex_table))) __stop___ex_table = .; } @@ -99,7 +99,7 @@ SECTIONS } .init.arch.info : { __arch_info_begin = .; - *(.arch.info.init) + KEEP(*(.arch.info.init)) __arch_info_end = .; } .init.tagtable : { @@ -116,7 +116,7 @@ SECTIONS #endif .init.pv_table : { __pv_table_begin = .; - *(.pv_table) + KEEP(*(.pv_table)) __pv_table_end = .; } diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 06f0428a723c..5a8a9b3429b5 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -56,6 +56,10 @@ KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) # disable LTO KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS)) +# The .data section would be renamed to .data.efistub, therefore, remove +# `-fdata-sections` flag from KBUILD_CFLAGS_KERNEL +KBUILD_CFLAGS_KERNEL := $(filter-out -fdata-sections, $(KBUILD_CFLAGS_KERNEL)) + lib-y := efi-stub-helper.o gop.o secureboot.o tpm.o \ file.o mem.o random.o randomalloc.o pci.o \ skip_spaces.o lib-cmdline.o lib-ctype.o \ -- cgit From 2ccfe94bc3ac980d2d1df9f7a0b2c6d2137abe55 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 4 Jun 2024 17:02:15 +0200 Subject: auxdisplay: ht16k33: Drop reference after LED registration The reference count is bumped by device_get_named_child_node() and never dropped. Since LED APIs do not require it to be bumped by the user, drop the reference after LED registration. [andy: rewritten the commit message and amended the change] Fixes: c223d9c636ed ("auxdisplay: ht16k33: Add LED support") Signed-off-by: Markus Elfring Signed-off-by: Andy Shevchenko --- drivers/auxdisplay/ht16k33.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/auxdisplay/ht16k33.c b/drivers/auxdisplay/ht16k33.c index ce987944662c..8a7034b41d50 100644 --- a/drivers/auxdisplay/ht16k33.c +++ b/drivers/auxdisplay/ht16k33.c @@ -483,6 +483,7 @@ static int ht16k33_led_probe(struct device *dev, struct led_classdev *led, led->max_brightness = MAX_BRIGHTNESS; err = devm_led_classdev_register_ext(dev, led, &init_data); + fwnode_handle_put(init_data.fwnode); if (err) dev_err(dev, "Failed to register LED\n"); -- cgit From 5c563ee90a22d3295bcd6217e3ecd7bf9f4d9d48 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 24 May 2024 11:58:28 -0700 Subject: cpumask: introduce assign_cpu() macro Now that assign_bit() is a thin macro wrapper around set_bit() and clear_bit(), we can use it in cpumask API and drop duplicating implementations of set_cpu_xxx() helpers with no additional overhead. 
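The win here is that assign_bit() folds the usual "if (value) set_bit() else clear_bit()" pair into a single call, so every set_cpu_xxx() helper becomes a one-line macro. A minimal userspace sketch of the pattern follows; these set_bit()/clear_bit()/assign_bit() stand-ins are simplified and non-atomic, unlike the kernel versions, and the cpu_possible_mask here is a plain unsigned long rather than a struct cpumask.

#include <stdbool.h>
#include <stdio.h>

static unsigned long cpu_possible_mask;

static void set_bit(unsigned int nr, unsigned long *addr)
{
	*addr |= 1UL << nr;
}

static void clear_bit(unsigned int nr, unsigned long *addr)
{
	*addr &= ~(1UL << nr);
}

/* One helper replaces the if/else block duplicated in every set_cpu_*() variant. */
static void assign_bit(unsigned int nr, unsigned long *addr, bool value)
{
	if (value)
		set_bit(nr, addr);
	else
		clear_bit(nr, addr);
}

/* The per-mask helpers then collapse to one-liners, mirroring the cpumask.h hunk further down. */
#define set_cpu_possible(cpu, possible) assign_bit((cpu), &cpu_possible_mask, (possible))

int main(void)
{
	set_cpu_possible(3, true);
	printf("after set:   %#lx\n", cpu_possible_mask);
	set_cpu_possible(3, false);
	printf("after clear: %#lx\n", cpu_possible_mask);
	return 0;
}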
Bloat-o-meter reports almost 2k less of generated code for allyesconfig, mostly in kernel/cpu.c: add/remove: 2/4 grow/shrink: 3/4 up/down: 498/-2228 (-1730) Reviewed-by: Alexander Lobakin Signed-off-by: Yury Norov --- include/linux/cpumask.h | 40 ++++++---------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 23686bed441d..18410acdbc9e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1083,44 +1083,16 @@ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); -static inline void -set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, &__cpu_possible_mask); - else - cpumask_clear_cpu(cpu, &__cpu_possible_mask); -} +#define assign_cpu(cpu, mask, val) \ + assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) -static inline void -set_cpu_present(unsigned int cpu, bool present) -{ - if (present) - cpumask_set_cpu(cpu, &__cpu_present_mask); - else - cpumask_clear_cpu(cpu, &__cpu_present_mask); -} +#define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) +#define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) +#define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) +#define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) void set_cpu_online(unsigned int cpu, bool online); -static inline void -set_cpu_active(unsigned int cpu, bool active) -{ - if (active) - cpumask_set_cpu(cpu, &__cpu_active_mask); - else - cpumask_clear_cpu(cpu, &__cpu_active_mask); -} - -static inline void -set_cpu_dying(unsigned int cpu, bool dying) -{ - if (dying) - cpumask_set_cpu(cpu, &__cpu_dying_mask); - else - cpumask_clear_cpu(cpu, &__cpu_dying_mask); -} - /** * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap -- cgit From e334771d83ec14f755a554394162198a955e3faa Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Fri, 31 May 2024 09:03:11 -0700 Subject: lib: bitmap: add missing MODULE_DESCRIPTION() macros make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in lib/find_bit_benchmark.o WARNING: modpost: missing MODULE_DESCRIPTION() in lib/cpumask_kunit.o WARNING: modpost: missing MODULE_DESCRIPTION() in lib/test_bitmap.o Add the missing invocations of the MODULE_DESCRIPTION() macro. 
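For context, the modpost warning quoted above is emitted under "make W=1" when a module lacks a MODULE_DESCRIPTION() tag. A minimal module skeleton with the full metadata block might look like the sketch below; the "demo" name and messages are placeholders, not taken from any patch in this series.

#include <linux/init.h>
#include <linux/module.h>

static int __init demo_init(void)
{
	pr_info("demo: loaded\n");
	return 0;
}

static void __exit demo_exit(void)
{
	pr_info("demo: unloaded\n");
}

module_init(demo_init);
module_exit(demo_exit);

/* MODULE_DESCRIPTION() is the tag whose absence triggers the W=1 modpost warning. */
MODULE_AUTHOR("Example Author");
MODULE_DESCRIPTION("Minimal example module with complete metadata");
MODULE_LICENSE("GPL");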
Signed-off-by: Jeff Johnson Signed-off-by: Yury Norov --- lib/cpumask_kunit.c | 1 + lib/find_bit_benchmark.c | 1 + lib/test_bitmap.c | 1 + 3 files changed, 3 insertions(+) diff --git a/lib/cpumask_kunit.c b/lib/cpumask_kunit.c index a105e6369efc..6b62a6bdd50e 100644 --- a/lib/cpumask_kunit.c +++ b/lib/cpumask_kunit.c @@ -152,4 +152,5 @@ static struct kunit_suite test_cpumask_suite = { }; kunit_test_suite(test_cpumask_suite); +MODULE_DESCRIPTION("KUnit tests for cpumask"); MODULE_LICENSE("GPL"); diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c index d3fb09e6eff1..402e160e7186 100644 --- a/lib/find_bit_benchmark.c +++ b/lib/find_bit_benchmark.c @@ -194,4 +194,5 @@ static int __init find_bit_test(void) } module_init(find_bit_test); +MODULE_DESCRIPTION("Test for find_*_bit functions"); MODULE_LICENSE("GPL"); diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 6dfb8d46a4ff..65a75d58ed9e 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -1486,4 +1486,5 @@ static void __init selftest(void) KSTM_MODULE_LOADERS(test_bitmap); MODULE_AUTHOR("david decotigny "); +MODULE_DESCRIPTION("Test cases for bitmap API"); MODULE_LICENSE("GPL"); -- cgit From e0eeb938adb0367de4b0946125a06142d8de7d37 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 11 Jun 2024 15:38:12 +0300 Subject: bitops: Add a comment explaining the double underscore macros Linus Walleij pointed out that a new comer might be confused about the difference between set_bit() and __set_bit(). Add a comment explaining the difference. Link: https://lore.kernel.org/all/CACRpkdZFPG_YLici-BmYfk9HZ36f4WavCN3JNotkk8cPgCODCg@mail.gmail.com/ Signed-off-by: Dan Carpenter Reviewed-by: Linus Walleij Signed-off-by: Yury Norov --- include/linux/bitops.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 46d4bdc634c0..ba35bbf07798 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -47,12 +47,17 @@ extern unsigned long __sw_hweight64(__u64 w); __builtin_constant_p(*(const unsigned long *)(addr))) ? \ const##op(nr, addr) : op(nr, addr)) +/* + * The following macros are non-atomic versions of their non-underscored + * counterparts. + */ #define __set_bit(nr, addr) bitop(___set_bit, nr, addr) #define __clear_bit(nr, addr) bitop(___clear_bit, nr, addr) #define __change_bit(nr, addr) bitop(___change_bit, nr, addr) #define __test_and_set_bit(nr, addr) bitop(___test_and_set_bit, nr, addr) #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) + #define test_bit(nr, addr) bitop(_test_bit, nr, addr) #define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) -- cgit From 7bd3d76a1f9fb38c8234cfadb7131e0b26deb919 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 8 Mar 2024 18:32:15 +0000 Subject: unicode: make utf8 test count static The variables failed_tests and total_tests are not used outside of the utf8-selftest.c file so make them static to avoid the following warnings: fs/unicode/utf8-selftest.c:17:14: warning: symbol 'failed_tests' was not declared. Should it be static? fs/unicode/utf8-selftest.c:18:14: warning: symbol 'total_tests' was not declared. Should it be static? 
Signed-off-by: Ben Dooks Link: https://lore.kernel.org/r/20240308183215.1924331-1-ben.dooks@codethink.co.uk Signed-off-by: Gabriel Krisman Bertazi --- fs/unicode/utf8-selftest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index eb2bbdd688d7..c928e6007356 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -14,8 +14,8 @@ #include "utf8n.h" -unsigned int failed_tests; -unsigned int total_tests; +static unsigned int failed_tests; +static unsigned int total_tests; /* Tests will be based on this version. */ #define UTF8_LATEST UNICODE_AGE(12, 1, 0) -- cgit From 68318904a7758e11f16fa9d202a6df60f896e71a Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Fri, 24 May 2024 11:48:09 -0700 Subject: unicode: add MODULE_DESCRIPTION() macros Currently 'make W=1' reports: WARNING: modpost: missing MODULE_DESCRIPTION() in fs/unicode/utf8data.o WARNING: modpost: missing MODULE_DESCRIPTION() in fs/unicode/utf8-selftest.o Add a MODULE_DESCRIPTION() to utf8-selftest.c and utf8data.c_shipped, and update mkutf8data.c to add a MODULE_DESCRIPTION() to any future generated utf8data file. Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240524-md-unicode-v1-1-e2727ce8574d@quicinc.com Signed-off-by: Gabriel Krisman Bertazi --- fs/unicode/mkutf8data.c | 1 + fs/unicode/utf8-selftest.c | 1 + fs/unicode/utf8data.c_shipped | 1 + 3 files changed, 3 insertions(+) diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c index bc1a7c8b5c8d..77b685db8275 100644 --- a/fs/unicode/mkutf8data.c +++ b/fs/unicode/mkutf8data.c @@ -3352,6 +3352,7 @@ static void write_file(void) fprintf(file, "};\n"); fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);"); fprintf(file, "\n"); + fprintf(file, "MODULE_DESCRIPTION(\"UTF8 data table\");\n"); fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n"); fclose(file); } diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index c928e6007356..600e15efe9ed 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -307,4 +307,5 @@ module_init(init_test_ucd); module_exit(exit_test_ucd); MODULE_AUTHOR("Gabriel Krisman Bertazi "); +MODULE_DESCRIPTION("Kernel module for testing utf-8 support"); MODULE_LICENSE("GPL"); diff --git a/fs/unicode/utf8data.c_shipped b/fs/unicode/utf8data.c_shipped index d9b62901aa96..dafa5fed761d 100644 --- a/fs/unicode/utf8data.c_shipped +++ b/fs/unicode/utf8data.c_shipped @@ -4120,4 +4120,5 @@ struct utf8data_table utf8_data_table = { .utf8data = utf8data, }; EXPORT_SYMBOL_GPL(utf8_data_table); +MODULE_DESCRIPTION("UTF8 data table"); MODULE_LICENSE("GPL v2"); -- cgit From 1b3bf0747d4f1a963e59c26e602868bdce195318 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 11 Jun 2024 10:19:50 -0700 Subject: tools/power/turbostat: Switch to new Intel CPU model defines New CPU #defines encode vendor and family as well as model. N.B. Copied VFM_*() defines here from to avoid an application picking a second internal kernel header file. Signed-off-by: Tony Luck Acked-by: Rafael J. 
Wysocki Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 165 +++++++++++++++++++--------------- 1 file changed, 95 insertions(+), 70 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9f5d053d4bc6..02312e62c857 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9,6 +9,30 @@ #define _GNU_SOURCE #include MSRHEADER + +// copied from arch/x86/include/asm/cpu_device_id.h +#define VFM_MODEL_BIT 0 +#define VFM_FAMILY_BIT 8 +#define VFM_VENDOR_BIT 16 +#define VFM_RSVD_BIT 24 + +#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT) +#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT) +#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT) + +#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT) +#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT) +#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT) + +#define VFM_MAKE(_vendor, _family, _model) ( \ + ((_model) << VFM_MODEL_BIT) | \ + ((_family) << VFM_FAMILY_BIT) | \ + ((_vendor) << VFM_VENDOR_BIT) \ +) +// end copied section + +#define X86_VENDOR_INTEL 0 + #include INTEL_FAMILY_HEADER #include BUILD_BUG_HEADER #include @@ -367,7 +391,7 @@ struct platform_features { }; struct platform_data { - unsigned int model; + unsigned int vfm; const struct platform_features *features; }; @@ -910,75 +934,75 @@ static const struct platform_features amd_features_with_rapl = { }; static const struct platform_data turbostat_pdata[] = { - { INTEL_FAM6_NEHALEM, &nhm_features }, - { INTEL_FAM6_NEHALEM_G, &nhm_features }, - { INTEL_FAM6_NEHALEM_EP, &nhm_features }, - { INTEL_FAM6_NEHALEM_EX, &nhx_features }, - { INTEL_FAM6_WESTMERE, &nhm_features }, - { INTEL_FAM6_WESTMERE_EP, &nhm_features }, - { INTEL_FAM6_WESTMERE_EX, &nhx_features }, - { INTEL_FAM6_SANDYBRIDGE, &snb_features }, - { INTEL_FAM6_SANDYBRIDGE_X, &snx_features }, - { INTEL_FAM6_IVYBRIDGE, &ivb_features }, - { INTEL_FAM6_IVYBRIDGE_X, &ivx_features }, - { INTEL_FAM6_HASWELL, &hsw_features }, - { INTEL_FAM6_HASWELL_X, &hsx_features }, - { INTEL_FAM6_HASWELL_L, &hswl_features }, - { INTEL_FAM6_HASWELL_G, &hswg_features }, - { INTEL_FAM6_BROADWELL, &bdw_features }, - { INTEL_FAM6_BROADWELL_G, &bdwg_features }, - { INTEL_FAM6_BROADWELL_X, &bdx_features }, - { INTEL_FAM6_BROADWELL_D, &bdx_features }, - { INTEL_FAM6_SKYLAKE_L, &skl_features }, - { INTEL_FAM6_SKYLAKE, &skl_features }, - { INTEL_FAM6_SKYLAKE_X, &skx_features }, - { INTEL_FAM6_KABYLAKE_L, &skl_features }, - { INTEL_FAM6_KABYLAKE, &skl_features }, - { INTEL_FAM6_COMETLAKE, &skl_features }, - { INTEL_FAM6_COMETLAKE_L, &skl_features }, - { INTEL_FAM6_CANNONLAKE_L, &cnl_features }, - { INTEL_FAM6_ICELAKE_X, &icx_features }, - { INTEL_FAM6_ICELAKE_D, &icx_features }, - { INTEL_FAM6_ICELAKE_L, &cnl_features }, - { INTEL_FAM6_ICELAKE_NNPI, &cnl_features }, - { INTEL_FAM6_ROCKETLAKE, &cnl_features }, - { INTEL_FAM6_TIGERLAKE_L, &cnl_features }, - { INTEL_FAM6_TIGERLAKE, &cnl_features }, - { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, - { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, - { INTEL_FAM6_GRANITERAPIDS_X, &spr_features }, - { INTEL_FAM6_LAKEFIELD, &cnl_features }, - { INTEL_FAM6_ALDERLAKE, &adl_features }, - { INTEL_FAM6_ALDERLAKE_L, &adl_features }, - { INTEL_FAM6_RAPTORLAKE, &adl_features }, - { INTEL_FAM6_RAPTORLAKE_P, &adl_features }, - { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, - { INTEL_FAM6_METEORLAKE, &cnl_features }, - { 
INTEL_FAM6_METEORLAKE_L, &cnl_features }, - { INTEL_FAM6_ARROWLAKE_H, &arl_features }, - { INTEL_FAM6_ARROWLAKE_U, &arl_features }, - { INTEL_FAM6_ARROWLAKE, &arl_features }, - { INTEL_FAM6_LUNARLAKE_M, &arl_features }, - { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, - { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, - { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, - { INTEL_FAM6_ATOM_GOLDMONT, &gmt_features }, - { INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features }, - { INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features }, - { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features }, - { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, - { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, - { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, - { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features }, - { INTEL_FAM6_ATOM_CRESTMONT, &grr_features }, - { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, - { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, + { INTEL_NEHALEM, &nhm_features }, + { INTEL_NEHALEM_G, &nhm_features }, + { INTEL_NEHALEM_EP, &nhm_features }, + { INTEL_NEHALEM_EX, &nhx_features }, + { INTEL_WESTMERE, &nhm_features }, + { INTEL_WESTMERE_EP, &nhm_features }, + { INTEL_WESTMERE_EX, &nhx_features }, + { INTEL_SANDYBRIDGE, &snb_features }, + { INTEL_SANDYBRIDGE_X, &snx_features }, + { INTEL_IVYBRIDGE, &ivb_features }, + { INTEL_IVYBRIDGE_X, &ivx_features }, + { INTEL_HASWELL, &hsw_features }, + { INTEL_HASWELL_X, &hsx_features }, + { INTEL_HASWELL_L, &hswl_features }, + { INTEL_HASWELL_G, &hswg_features }, + { INTEL_BROADWELL, &bdw_features }, + { INTEL_BROADWELL_G, &bdwg_features }, + { INTEL_BROADWELL_X, &bdx_features }, + { INTEL_BROADWELL_D, &bdx_features }, + { INTEL_SKYLAKE_L, &skl_features }, + { INTEL_SKYLAKE, &skl_features }, + { INTEL_SKYLAKE_X, &skx_features }, + { INTEL_KABYLAKE_L, &skl_features }, + { INTEL_KABYLAKE, &skl_features }, + { INTEL_COMETLAKE, &skl_features }, + { INTEL_COMETLAKE_L, &skl_features }, + { INTEL_CANNONLAKE_L, &cnl_features }, + { INTEL_ICELAKE_X, &icx_features }, + { INTEL_ICELAKE_D, &icx_features }, + { INTEL_ICELAKE_L, &cnl_features }, + { INTEL_ICELAKE_NNPI, &cnl_features }, + { INTEL_ROCKETLAKE, &cnl_features }, + { INTEL_TIGERLAKE_L, &cnl_features }, + { INTEL_TIGERLAKE, &cnl_features }, + { INTEL_SAPPHIRERAPIDS_X, &spr_features }, + { INTEL_EMERALDRAPIDS_X, &spr_features }, + { INTEL_GRANITERAPIDS_X, &spr_features }, + { INTEL_LAKEFIELD, &cnl_features }, + { INTEL_ALDERLAKE, &adl_features }, + { INTEL_ALDERLAKE_L, &adl_features }, + { INTEL_RAPTORLAKE, &adl_features }, + { INTEL_RAPTORLAKE_P, &adl_features }, + { INTEL_RAPTORLAKE_S, &adl_features }, + { INTEL_METEORLAKE, &cnl_features }, + { INTEL_METEORLAKE_L, &cnl_features }, + { INTEL_ARROWLAKE_H, &arl_features }, + { INTEL_ARROWLAKE_U, &arl_features }, + { INTEL_ARROWLAKE, &arl_features }, + { INTEL_LUNARLAKE_M, &arl_features }, + { INTEL_ATOM_SILVERMONT, &slv_features }, + { INTEL_ATOM_SILVERMONT_D, &slvd_features }, + { INTEL_ATOM_AIRMONT, &amt_features }, + { INTEL_ATOM_GOLDMONT, &gmt_features }, + { INTEL_ATOM_GOLDMONT_D, &gmtd_features }, + { INTEL_ATOM_GOLDMONT_PLUS, &gmtp_features }, + { INTEL_ATOM_TREMONT_D, &tmtd_features }, + { INTEL_ATOM_TREMONT, &tmt_features }, + { INTEL_ATOM_TREMONT_L, &tmt_features }, + { INTEL_ATOM_GRACEMONT, &adl_features }, + { INTEL_ATOM_CRESTMONT_X, &srf_features }, + { INTEL_ATOM_CRESTMONT, &grr_features }, + { INTEL_XEON_PHI_KNL, &knl_features }, + { INTEL_XEON_PHI_KNM, &knl_features }, /* * Missing support for - * INTEL_FAM6_ICELAKE - * INTEL_FAM6_ATOM_SILVERMONT_MID - * INTEL_FAM6_ATOM_AIRMONT_MID - * 
INTEL_FAM6_ATOM_AIRMONT_NP + * INTEL_ICELAKE + * INTEL_ATOM_SILVERMONT_MID + * INTEL_ATOM_AIRMONT_MID + * INTEL_ATOM_AIRMONT_NP */ { 0, NULL }, }; @@ -1003,11 +1027,12 @@ void probe_platform_features(unsigned int family, unsigned int model) return; } - if (!genuine_intel || family != 6) + if (!genuine_intel) return; for (i = 0; turbostat_pdata[i].features; i++) { - if (turbostat_pdata[i].model == model) { + if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && + VFM_MODEL(turbostat_pdata[i].vfm) == model) { platform = turbostat_pdata[i].features; return; } -- cgit From c81c8ee445e84703cd90171365ed596ef94399cc Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 24 May 2024 12:06:39 +0200 Subject: tools/power turbostat: Remove anonymous union from rapl_counter_info_t fd_perf field used to be part of the union, but later moved out of it, because we test it with fd_perf != -1 to determine if any perf counter is opened, making the union unused. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 02312e62c857..1ac74cd49cc9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1085,15 +1085,9 @@ struct rapl_counter_info_t { unsigned long long flags[NUM_RAPL_COUNTERS]; double scale[NUM_RAPL_COUNTERS]; enum rapl_unit unit[NUM_RAPL_COUNTERS]; - - union { - /* Active when source == RAPL_SOURCE_MSR */ - struct { - unsigned long long msr[NUM_RAPL_COUNTERS]; - unsigned long long msr_mask[NUM_RAPL_COUNTERS]; - int msr_shift[NUM_RAPL_COUNTERS]; - }; - }; + unsigned long long msr[NUM_RAPL_COUNTERS]; + unsigned long long msr_mask[NUM_RAPL_COUNTERS]; + int msr_shift[NUM_RAPL_COUNTERS]; int fd_perf; }; -- cgit From 73ed3c941a76ff3348022477975c84b8d183599a Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 24 May 2024 12:22:57 +0200 Subject: tools/power turbostat: Replace enum rapl_source and cstate_source with counter_source Reuse the enum. It means the same thing in both cases. 
Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 39 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1ac74cd49cc9..0fc6c107e371 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -86,8 +86,7 @@ enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; -enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; -enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR }; +enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR }; struct sysfs_path { char path[PATH_BYTES]; @@ -1081,7 +1080,7 @@ enum rapl_unit { struct rapl_counter_info_t { unsigned long long data[NUM_RAPL_COUNTERS]; - enum rapl_source source[NUM_RAPL_COUNTERS]; + enum counter_source source[NUM_RAPL_COUNTERS]; unsigned long long flags[NUM_RAPL_COUNTERS]; double scale[NUM_RAPL_COUNTERS]; enum rapl_unit unit[NUM_RAPL_COUNTERS]; @@ -1243,7 +1242,7 @@ enum ccstate_rci_index { struct cstate_counter_info_t { unsigned long long data[NUM_CSTATE_COUNTERS]; - enum cstate_source source[NUM_CSTATE_COUNTERS]; + enum counter_source source[NUM_CSTATE_COUNTERS]; unsigned long long msr[NUM_CSTATE_COUNTERS]; int fd_perf_core; int fd_perf_pkg; @@ -3601,7 +3600,7 @@ size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) size_t ret = 0; for (int i = 0; i < NUM_RAPL_COUNTERS; ++i) - if (rci->source[i] == RAPL_SOURCE_PERF) + if (rci->source[i] == COUNTER_SOURCE_PERF) ++ret; return ret; @@ -3612,7 +3611,7 @@ static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t size_t ret = 0; for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) - if (cci->source[i] == CSTATE_SOURCE_PERF) + if (cci->source[i] == COUNTER_SOURCE_PERF) ++ret; return ret; @@ -3653,10 +3652,10 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { switch (rci->source[i]) { - case RAPL_SOURCE_NONE: + case COUNTER_SOURCE_NONE: break; - case RAPL_SOURCE_PERF: + case COUNTER_SOURCE_PERF: assert(pi < ARRAY_SIZE(perf_data)); assert(rci->fd_perf != -1); @@ -3669,7 +3668,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct ++pi; break; - case RAPL_SOURCE_MSR: + case COUNTER_SOURCE_MSR: if (debug) fprintf(stderr, "Reading rapl counter via msr at %u\n", i); @@ -3791,10 +3790,10 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) { switch (cci->source[i]) { - case CSTATE_SOURCE_NONE: + case COUNTER_SOURCE_NONE: break; - case CSTATE_SOURCE_PERF: + case COUNTER_SOURCE_PERF: assert(pi < ARRAY_SIZE(perf_data)); assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); @@ -3807,7 +3806,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat ++pi; break; - case CSTATE_SOURCE_MSR: + case COUNTER_SOURCE_MSR: assert(!no_msr); if (get_msr(cpu, cci->msr[i], &cci->data[i])) return -13 - i; @@ -3828,7 +3827,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat * when 
invoked for the thread sibling. */ #define PERF_COUNTER_WRITE_DATA(out_counter, index) do { \ - if (cci->source[index] != CSTATE_SOURCE_NONE) \ + if (cci->source[index] != COUNTER_SOURCE_NONE) \ out_counter = cci->data[index]; \ } while (0) @@ -6894,7 +6893,7 @@ void rapl_perf_init(void) rci->fd_perf = -1; for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) { rci->data[i] = 0; - rci->source[i] = RAPL_SOURCE_NONE; + rci->source[i] = COUNTER_SOURCE_NONE; } } @@ -6936,14 +6935,14 @@ void rapl_perf_init(void) /* Use perf API for this counter */ if (!no_perf && cai->perf_name && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) { - rci->source[cai->rci_index] = RAPL_SOURCE_PERF; + rci->source[cai->rci_index] = COUNTER_SOURCE_PERF; rci->scale[cai->rci_index] = scale * cai->compat_scale; rci->unit[cai->rci_index] = unit; rci->flags[cai->rci_index] = cai->flags; /* Use MSR for this counter */ } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { - rci->source[cai->rci_index] = RAPL_SOURCE_MSR; + rci->source[cai->rci_index] = COUNTER_SOURCE_MSR; rci->msr[cai->rci_index] = cai->msr; rci->msr_mask[cai->rci_index] = cai->msr_mask; rci->msr_shift[cai->rci_index] = cai->msr_shift; @@ -6953,7 +6952,7 @@ void rapl_perf_init(void) } } - if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE) + if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE) has_counter = 1; } @@ -7146,17 +7145,17 @@ void cstate_perf_init_(bool soft_c1) /* Use perf API for this counter */ if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) { - cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; + cci->source[cai->rci_index] = COUNTER_SOURCE_PERF; /* User MSR for this counter */ } else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit && probe_msr(cpu, cai->msr) == 0) { - cci->source[cai->rci_index] = CSTATE_SOURCE_MSR; + cci->source[cai->rci_index] = COUNTER_SOURCE_MSR; cci->msr[cai->rci_index] = cai->msr; } } - if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) { + if (cci->source[cai->rci_index] != COUNTER_SOURCE_NONE) { has_counter = true; cores_visited[core_id] = true; pkg_visited[pkg_id] = true; -- cgit From 25e713c6b54a899b5a8b5e09b58f4b87a5b3664b Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 24 May 2024 13:04:33 +0200 Subject: tools/power turbostat: Add ZERO_ARRAY for zero initializing builtin array It makes it harder to shoot yourself in the foot, by using additional __must_be_array() check. 
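The foot-gun being guarded against is that memset(arr, 0, sizeof(arr)) only zeroes the whole buffer while arr really is an array; once it decays to a pointer (for example after a refactor that passes the buffer into a helper), sizeof() silently becomes the pointer size. Below is a userspace approximation of such a guard; it uses _Static_assert() with GCC's __builtin_types_compatible_p() rather than the kernel's __must_be_array(), and the perf_data array is only an illustrative stand-in.

#include <stdio.h>
#include <string.h>

/*
 * The static assertion turns the array-decayed-to-pointer mistake into a
 * compile-time error (typeof and __builtin_types_compatible_p are GCC
 * extensions).
 */
#define ZERO_ARRAY(arr) do {						\
	_Static_assert(!__builtin_types_compatible_p(typeof(arr),	\
						     typeof(&(arr)[0])),\
		       "ZERO_ARRAY() needs an array, not a pointer");	\
	memset((arr), 0, sizeof(arr));					\
} while (0)

int main(void)
{
	unsigned long long perf_data[4] = { 1, 2, 3, 4 };

	ZERO_ARRAY(perf_data);	/* zeroes all four elements */
	printf("%llu %llu\n", perf_data[0], perf_data[3]);

	/*
	 * unsigned long long *p = perf_data;
	 * ZERO_ARRAY(p);	// would fail to compile: p is a pointer
	 */
	return 0;
}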
Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 0fc6c107e371..29751b41ea0d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1060,6 +1060,8 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi #define MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 +#define ZERO_ARRAY(arr) (memset(arr, 0, sizeof(arr)) + __must_be_array(arr)) + /* Indexes used to map data read from perf and MSRs into global variables */ enum rapl_rci_index { RAPL_RCI_INDEX_ENERGY_PKG = 0, @@ -3733,9 +3735,9 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat assert(ccstate_counter_info); assert(cpu <= ccstate_counter_info_size); - memset(perf_data, 0, sizeof(perf_data)); - memset(perf_data_core, 0, sizeof(perf_data_core)); - memset(perf_data_pkg, 0, sizeof(perf_data_pkg)); + ZERO_ARRAY(perf_data); + ZERO_ARRAY(perf_data_core); + ZERO_ARRAY(perf_data_pkg); cci = &ccstate_counter_info[cpu]; -- cgit From 4e7b4ff2dcaed228cb2fb7bfe720262c98ec1bb9 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Thu, 27 Jun 2024 08:29:59 +0100 Subject: ARM: 9406/1: Fix callchain_trace() return value perf_callchain_store() returns 0 on success and -1 otherwise, so fix callchain_trace() to return the correct bool value, giving walk_stackframe() a chance to stop walking the stack early. Fixes: 70ccc7c0667b ("ARM: 9258/1: stacktrace: Make stack walk callback consistent with generic code") Signed-off-by: Jinjie Ruan Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/perf_callchain.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 7147edbe56c6..1d230ac9d0eb 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -85,8 +85,7 @@ static bool callchain_trace(void *data, unsigned long pc) { struct perf_callchain_entry_ctx *entry = data; - perf_callchain_store(entry, pc); - return true; + return perf_callchain_store(entry, pc) == 0; } void -- cgit From 2335c9cb831faba1a4efcc612886073b6f175fe4 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Thu, 27 Jun 2024 08:38:44 +0100 Subject: ARM: 9407/1: Add support for STACKLEAK gcc plugin Add the STACKLEAK gcc plugin to arm32 by adding the helper used by stackleak common code: on_thread_stack(). It initializes the stack with the poison value before returning from system calls, which improves kernel security. Additionally, this disables the plugin in EFI stub code and decompress code, which are out of scope for the protection.
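The on_thread_stack() helper introduced in the stacktrace.h hunk further down decides stack membership with a single XOR: since the thread stack is THREAD_SIZE bytes and THREAD_SIZE-aligned, two addresses lie on the same stack exactly when their XOR is smaller than THREAD_SIZE. A userspace sketch of that check follows; the 8 KiB THREAD_SIZE and the addresses are assumptions made up for the example.

#include <stdbool.h>
#include <stdio.h>

#define THREAD_SIZE 8192UL	/* assumed 8 KiB stacks, purely for illustration */

/*
 * stack_base is THREAD_SIZE-aligned, so an address on that stack differs from
 * it only in the low bits; the XOR therefore stays below THREAD_SIZE exactly
 * when the address is inside the stack.
 */
static bool on_same_stack(unsigned long sp, unsigned long stack_base)
{
	return (sp ^ stack_base) < THREAD_SIZE;
}

int main(void)
{
	unsigned long base = 0xc1a54000UL;	/* made-up, THREAD_SIZE-aligned */

	printf("%d\n", on_same_stack(base + 0x1f00, base));	/* 1: inside  */
	printf("%d\n", on_same_stack(base + 0x2100, base));	/* 0: outside */
	return 0;
}

The real helper in the diff applies the same comparison to current_stack_pointer and current->stack.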
Before the test on Qemu versatilepb board: # echo STACKLEAK_ERASING > /sys/kernel/debug/provoke-crash/DIRECT lkdtm: Performing direct entry STACKLEAK_ERASING lkdtm: XFAIL: stackleak is not supported on this arch (HAVE_ARCH_STACKLEAK=n) After: # echo STACKLEAK_ERASING > /sys/kernel/debug/provoke-crash/DIRECT lkdtm: Performing direct entry STACKLEAK_ERASING lkdtm: stackleak stack usage: high offset: 80 bytes current: 280 bytes lowest: 696 bytes tracked: 696 bytes untracked: 192 bytes poisoned: 7220 bytes low offset: 4 bytes lkdtm: OK: the rest of the thread stack is properly erased Signed-off-by: Jinjie Ruan Acked-by: Ard Biesheuvel Reviewed-by: Linus Walleij Signed-off-by: Russell King (Oracle) --- arch/arm/Kconfig | 1 + arch/arm/boot/compressed/Makefile | 1 + arch/arm/include/asm/stacktrace.h | 7 +++++++ arch/arm/kernel/entry-common.S | 3 +++ drivers/firmware/efi/libstub/Makefile | 3 ++- 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 08572703d94d..26a01ad4087c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -86,6 +86,7 @@ config ARM select HAVE_ARCH_PFN_VALID select HAVE_ARCH_SECCOMP select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT + select HAVE_ARCH_STACKLEAK select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 6bca03c0c7f0..945b5975fce2 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -9,6 +9,7 @@ OBJS = HEAD = head.o OBJS += misc.o decompress.o +CFLAGS_decompress.o += $(DISABLE_STACKLEAK_PLUGIN) ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y) OBJS += debug.o AFLAGS_head.o += -DDEBUG diff --git a/arch/arm/include/asm/stacktrace.h b/arch/arm/include/asm/stacktrace.h index 360f0d2406bf..f80a85b091d6 100644 --- a/arch/arm/include/asm/stacktrace.h +++ b/arch/arm/include/asm/stacktrace.h @@ -26,6 +26,13 @@ struct stackframe { #endif }; +static inline bool on_thread_stack(void) +{ + unsigned long delta = current_stack_pointer ^ (unsigned long)current->stack; + + return delta < THREAD_SIZE; +} + static __always_inline void arm_get_current_stackframe(struct pt_regs *regs, struct stackframe *frame) { diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 5c31e9de7a60..f379c852dcb7 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -119,6 +119,9 @@ no_work_pending: ct_user_enter save = 0 +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + bl stackleak_erase_on_task_stack +#endif restore_user_regs fast = 0, offset = 0 ENDPROC(ret_to_user_from_irq) ENDPROC(ret_to_user) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 5a8a9b3429b5..59d3dccaa8e5 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -27,7 +27,8 @@ cflags-$(CONFIG_ARM64) += -fpie $(DISABLE_STACKLEAK_PLUGIN) \ cflags-$(CONFIG_ARM) += -DEFI_HAVE_STRLEN -DEFI_HAVE_STRNLEN \ -DEFI_HAVE_MEMCHR -DEFI_HAVE_STRRCHR \ -DEFI_HAVE_STRCMP -fno-builtin -fpic \ - $(call cc-option,-mno-single-pic-base) + $(call cc-option,-mno-single-pic-base) \ + $(DISABLE_STACKLEAK_PLUGIN) cflags-$(CONFIG_RISCV) += -fpic -DNO_ALTERNATIVE -mno-relax cflags-$(CONFIG_LOONGARCH) += -fpie -- cgit From 657a292d679ae3a6c733ab0e939e24ae44b20faf Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 27 Jun 2024 09:22:09 +0100 Subject: ARM: 9408/1: mm: CFI: Fix some erroneous reset 
prototypes I somehow got a few cpu_nn_reset() signatures wrong in my patch. Fix it up. Closes: https://lore.kernel.org/oe-kbuild-all/202406260432.6WGV2jCk-lkp@intel.com/ Fixes: 393999fa9627 ("ARM: 9389/2: mm: Define prototypes for all per-processor calls") Reported-by: kernel test robot Reported-by: Nathan Chancellor Reviewed-by: Sami Tolvanen Signed-off-by: Linus Walleij Signed-off-by: Russell King (Oracle) --- arch/arm/mm/proc.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm/mm/proc.c b/arch/arm/mm/proc.c index bdbbf65d1b36..2027845efefb 100644 --- a/arch/arm/mm/proc.c +++ b/arch/arm/mm/proc.c @@ -17,7 +17,7 @@ void cpu_arm7tdmi_proc_init(void); __ADDRESSABLE(cpu_arm7tdmi_proc_init); void cpu_arm7tdmi_proc_fin(void); __ADDRESSABLE(cpu_arm7tdmi_proc_fin); -void cpu_arm7tdmi_reset(void); +void cpu_arm7tdmi_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm7tdmi_reset); int cpu_arm7tdmi_do_idle(void); __ADDRESSABLE(cpu_arm7tdmi_do_idle); @@ -32,7 +32,7 @@ void cpu_arm720_proc_init(void); __ADDRESSABLE(cpu_arm720_proc_init); void cpu_arm720_proc_fin(void); __ADDRESSABLE(cpu_arm720_proc_fin); -void cpu_arm720_reset(void); +void cpu_arm720_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm720_reset); int cpu_arm720_do_idle(void); __ADDRESSABLE(cpu_arm720_do_idle); @@ -49,7 +49,7 @@ void cpu_arm740_proc_init(void); __ADDRESSABLE(cpu_arm740_proc_init); void cpu_arm740_proc_fin(void); __ADDRESSABLE(cpu_arm740_proc_fin); -void cpu_arm740_reset(void); +void cpu_arm740_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm740_reset); int cpu_arm740_do_idle(void); __ADDRESSABLE(cpu_arm740_do_idle); @@ -64,7 +64,7 @@ void cpu_arm9tdmi_proc_init(void); __ADDRESSABLE(cpu_arm9tdmi_proc_init); void cpu_arm9tdmi_proc_fin(void); __ADDRESSABLE(cpu_arm9tdmi_proc_fin); -void cpu_arm9tdmi_reset(void); +void cpu_arm9tdmi_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm9tdmi_reset); int cpu_arm9tdmi_do_idle(void); __ADDRESSABLE(cpu_arm9tdmi_do_idle); @@ -79,7 +79,7 @@ void cpu_arm920_proc_init(void); __ADDRESSABLE(cpu_arm920_proc_init); void cpu_arm920_proc_fin(void); __ADDRESSABLE(cpu_arm920_proc_fin); -void cpu_arm920_reset(void); +void cpu_arm920_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm920_reset); int cpu_arm920_do_idle(void); __ADDRESSABLE(cpu_arm920_do_idle); @@ -102,7 +102,7 @@ void cpu_arm922_proc_init(void); __ADDRESSABLE(cpu_arm922_proc_init); void cpu_arm922_proc_fin(void); __ADDRESSABLE(cpu_arm922_proc_fin); -void cpu_arm922_reset(void); +void cpu_arm922_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm922_reset); int cpu_arm922_do_idle(void); __ADDRESSABLE(cpu_arm922_do_idle); @@ -119,7 +119,7 @@ void cpu_arm925_proc_init(void); __ADDRESSABLE(cpu_arm925_proc_init); void cpu_arm925_proc_fin(void); __ADDRESSABLE(cpu_arm925_proc_fin); -void cpu_arm925_reset(void); +void cpu_arm925_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm925_reset); int cpu_arm925_do_idle(void); __ADDRESSABLE(cpu_arm925_do_idle); @@ -159,7 +159,7 @@ void cpu_arm940_proc_init(void); __ADDRESSABLE(cpu_arm940_proc_init); void cpu_arm940_proc_fin(void); __ADDRESSABLE(cpu_arm940_proc_fin); -void cpu_arm940_reset(void); +void cpu_arm940_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm940_reset); int cpu_arm940_do_idle(void); __ADDRESSABLE(cpu_arm940_do_idle); @@ -174,7 +174,7 @@ void cpu_arm946_proc_init(void); __ADDRESSABLE(cpu_arm946_proc_init); void cpu_arm946_proc_fin(void); 
__ADDRESSABLE(cpu_arm946_proc_fin); -void cpu_arm946_reset(void); +void cpu_arm946_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_arm946_reset); int cpu_arm946_do_idle(void); __ADDRESSABLE(cpu_arm946_do_idle); @@ -429,7 +429,7 @@ void cpu_v7_proc_init(void); __ADDRESSABLE(cpu_v7_proc_init); void cpu_v7_proc_fin(void); __ADDRESSABLE(cpu_v7_proc_fin); -void cpu_v7_reset(void); +void cpu_v7_reset(unsigned long addr, bool hvc); __ADDRESSABLE(cpu_v7_reset); int cpu_v7_do_idle(void); __ADDRESSABLE(cpu_v7_do_idle); -- cgit From 675e979db473d08be346a3190c5f0db095a57153 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Fri, 7 Jun 2024 16:43:58 +0200 Subject: cxl/events: Use a common struct for DRAM and General Media events cxl_event_common was an unfortunate naming choice and caused confusion with the existing Common Event Record. Furthermore, its fields didn't map all the common information between DRAM and General Media Events. Remove cxl_event_common and introduce cxl_event_media_hdr to record common information between DRAM and General Media events. cxl_event_media_hdr, which is embedded in both cxl_event_gen_media and cxl_event_dram, leverages the commonalities between the two events to simplify their respective handling. Suggested-by: Dan Williams Reviewed-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240607144423.48681-1-fabio.m.de.francesco@linux.intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 2 +- drivers/cxl/core/trace.h | 32 +++++++++++++------------- include/linux/cxl-event.h | 45 ++++++++++++++++-------------------- tools/testing/cxl/test/mem.c | 54 ++++++++++++++++++++++++-------------------- 4 files changed, 65 insertions(+), 68 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 2626f3fff201..a08f050cc1ca 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -875,7 +875,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, guard(rwsem_read)(&cxl_region_rwsem); guard(rwsem_read)(&cxl_dpa_rwsem); - dpa = le64_to_cpu(evt->common.phys_addr) & CXL_DPA_MASK; + dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK; cxlr = cxl_dpa_to_region(cxlmd, dpa); if (cxlr) hpa = cxl_trace_hpa(cxlr, cxlmd, dpa); diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index ee5cd4eb2f16..6d8b71d8f6c4 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -340,23 +340,23 @@ TRACE_EVENT(cxl_general_media, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr); __entry->hdr_uuid = CXL_EVENT_GEN_MEDIA_UUID; /* General Media */ - __entry->dpa = le64_to_cpu(rec->phys_addr); + __entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr); __entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK; /* Mask after flags have been parsed */ __entry->dpa &= CXL_DPA_MASK; - __entry->descriptor = rec->descriptor; - __entry->type = rec->type; - __entry->transaction_type = rec->transaction_type; - __entry->channel = rec->channel; - __entry->rank = rec->rank; + __entry->descriptor = rec->media_hdr.descriptor; + __entry->type = rec->media_hdr.type; + __entry->transaction_type = rec->media_hdr.transaction_type; + __entry->channel = rec->media_hdr.channel; + __entry->rank = rec->media_hdr.rank; __entry->device = get_unaligned_le24(rec->device); memcpy(__entry->comp_id, &rec->component_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE); 
- __entry->validity_flags = get_unaligned_le16(&rec->validity_flags); + __entry->validity_flags = get_unaligned_le16(&rec->media_hdr.validity_flags); __entry->hpa = hpa; if (cxlr) { __assign_str(region_name); @@ -440,19 +440,19 @@ TRACE_EVENT(cxl_dram, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr); __entry->hdr_uuid = CXL_EVENT_DRAM_UUID; /* DRAM */ - __entry->dpa = le64_to_cpu(rec->phys_addr); + __entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr); __entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK; __entry->dpa &= CXL_DPA_MASK; - __entry->descriptor = rec->descriptor; - __entry->type = rec->type; - __entry->transaction_type = rec->transaction_type; - __entry->validity_flags = get_unaligned_le16(rec->validity_flags); - __entry->channel = rec->channel; - __entry->rank = rec->rank; + __entry->descriptor = rec->media_hdr.descriptor; + __entry->type = rec->media_hdr.type; + __entry->transaction_type = rec->media_hdr.transaction_type; + __entry->validity_flags = get_unaligned_le16(rec->media_hdr.validity_flags); + __entry->channel = rec->media_hdr.channel; + __entry->rank = rec->media_hdr.rank; __entry->nibble_mask = get_unaligned_le24(rec->nibble_mask); __entry->bank_group = rec->bank_group; __entry->bank = rec->bank; diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 60b25020281f..0bea1afbd747 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -21,6 +21,21 @@ struct cxl_event_record_hdr { u8 reserved[15]; } __packed; +struct cxl_event_media_hdr { + struct cxl_event_record_hdr hdr; + __le64 phys_addr; + u8 descriptor; + u8 type; + u8 transaction_type; + /* + * The meaning of Validity Flags from bit 2 is + * different across DRAM and General Media records + */ + u8 validity_flags[2]; + u8 channel; + u8 rank; +} __packed; + #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 struct cxl_event_generic { struct cxl_event_record_hdr hdr; @@ -33,14 +48,7 @@ struct cxl_event_generic { */ #define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 struct cxl_event_gen_media { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; - u8 descriptor; - u8 type; - u8 transaction_type; - u8 validity_flags[2]; - u8 channel; - u8 rank; + struct cxl_event_media_hdr media_hdr; u8 device[3]; u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE]; u8 reserved[46]; @@ -52,14 +60,7 @@ struct cxl_event_gen_media { */ #define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 struct cxl_event_dram { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; - u8 descriptor; - u8 type; - u8 transaction_type; - u8 validity_flags[2]; - u8 channel; - u8 rank; + struct cxl_event_media_hdr media_hdr; u8 nibble_mask[3]; u8 bank_group; u8 bank; @@ -95,21 +96,13 @@ struct cxl_event_mem_module { u8 reserved[0x3d]; } __packed; -/* - * General Media or DRAM Event Common Fields - * - provides common access to phys_addr - */ -struct cxl_event_common { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; -} __packed; - union cxl_event { struct cxl_event_generic generic; struct cxl_event_gen_media gen_media; struct cxl_event_dram dram; struct cxl_event_mem_module mem_module; - struct cxl_event_common common; + /* dram & gen_media event header */ + struct cxl_event_media_hdr media_hdr; } __packed; /* diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index eaf091a3d331..94de2cf60f4f 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -385,19 +385,21 @@ struct cxl_test_gen_media { struct cxl_test_gen_media 
gen_media = { .id = CXL_EVENT_GEN_MEDIA_UUID, .rec = { - .hdr = { - .length = sizeof(struct cxl_test_gen_media), - .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0), + .media_hdr = { + .hdr = { + .length = sizeof(struct cxl_test_gen_media), + .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0), + }, + .phys_addr = cpu_to_le64(0x2000), + .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, + .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, + .transaction_type = CXL_GMER_TRANS_HOST_WRITE, + /* .validity_flags = */ + .channel = 1, + .rank = 30, }, - .phys_addr = cpu_to_le64(0x2000), - .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, - .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, - .transaction_type = CXL_GMER_TRANS_HOST_WRITE, - /* .validity_flags = */ - .channel = 1, - .rank = 30 }, }; @@ -409,18 +411,20 @@ struct cxl_test_dram { struct cxl_test_dram dram = { .id = CXL_EVENT_DRAM_UUID, .rec = { - .hdr = { - .length = sizeof(struct cxl_test_dram), - .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0), + .media_hdr = { + .hdr = { + .length = sizeof(struct cxl_test_dram), + .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0), + }, + .phys_addr = cpu_to_le64(0x8000), + .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, + .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR, + .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, + /* .validity_flags = */ + .channel = 1, }, - .phys_addr = cpu_to_le64(0x8000), - .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, - .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR, - .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, - /* .validity_flags = */ - .channel = 1, .bank_group = 5, .bank = 2, .column = {0xDE, 0xAD}, @@ -474,11 +478,11 @@ static int mock_set_timestamp(struct cxl_dev_state *cxlds, static void cxl_mock_add_event_logs(struct mock_event_store *mes) { put_unaligned_le16(CXL_GMER_VALID_CHANNEL | CXL_GMER_VALID_RANK, - &gen_media.rec.validity_flags); + &gen_media.rec.media_hdr.validity_flags); put_unaligned_le16(CXL_DER_VALID_CHANNEL | CXL_DER_VALID_BANK_GROUP | CXL_DER_VALID_BANK | CXL_DER_VALID_COLUMN, - &dram.rec.validity_flags); + &dram.rec.media_hdr.validity_flags); mes_add_event(mes, CXL_EVENT_TYPE_INFO, &maint_needed); mes_add_event(mes, CXL_EVENT_TYPE_INFO, -- cgit From a0caa19711ceb54c34368f66a746844fb03fde6c Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Fri, 7 Jun 2024 06:57:15 -0700 Subject: cxl: add missing MODULE_DESCRIPTION() macros make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/core/cxl_core.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_pci.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_mem.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_acpi.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_pmem.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_port.o Add the missing invocations of the MODULE_DESCRIPTION() macro. 
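MODULE_DESCRIPTION() simply embeds a description string in the module's .modinfo section, which modinfo prints and which modpost now warns about when it is absent. A minimal, hypothetical module showing where the invocation goes (all names and strings here are made up, not taken from the CXL drivers):

#include <linux/module.h>

static int __init demo_init(void)
{
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);

MODULE_DESCRIPTION("Demo: one-line summary shown by modinfo");
MODULE_LICENSE("GPL");
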
Signed-off-by: Jeff Johnson Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240607-md-drivers-cxl-v2-1-0c61d95ee7a7@quicinc.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 1 + drivers/cxl/core/port.c | 1 + drivers/cxl/mem.c | 1 + drivers/cxl/pci.c | 1 + drivers/cxl/pmem.c | 1 + drivers/cxl/port.c | 1 + 6 files changed, 6 insertions(+) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 571069863c62..e51315ea4a6a 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -921,6 +921,7 @@ static void __exit cxl_acpi_exit(void) /* load before dax_hmem sees 'Soft Reserved' CXL ranges */ subsys_initcall(cxl_acpi_init); module_exit(cxl_acpi_exit); +MODULE_DESCRIPTION("CXL ACPI: Platform Support"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); MODULE_IMPORT_NS(ACPI); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 887ed6e358fb..e31c5fcd9bf8 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2356,5 +2356,6 @@ static void cxl_core_exit(void) subsys_initcall(cxl_core_init); module_exit(cxl_core_exit); +MODULE_DESCRIPTION("CXL: Core Compute Express Link support"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 0c79d9ce877c..1afb0e78082b 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -252,6 +252,7 @@ static struct cxl_driver cxl_mem_driver = { module_cxl_driver(cxl_mem_driver); +MODULE_DESCRIPTION("CXL: Memory Expansion"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); MODULE_ALIAS_CXL(CXL_DEVICE_MEMORY_EXPANDER); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index e53646e9f2fb..4be35dc22202 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1066,5 +1066,6 @@ static void __exit cxl_pci_driver_exit(void) module_init(cxl_pci_driver_init); module_exit(cxl_pci_driver_exit); +MODULE_DESCRIPTION("CXL: PCI manageability"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index 2ecdaee63021..4ef93da22335 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -453,6 +453,7 @@ static __exit void cxl_pmem_exit(void) cxl_driver_unregister(&cxl_nvdimm_bridge_driver); } +MODULE_DESCRIPTION("CXL PMEM: Persistent Memory Support"); MODULE_LICENSE("GPL v2"); module_init(cxl_pmem_init); module_exit(cxl_pmem_exit); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 97c21566677a..d7d5d982ce69 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -209,6 +209,7 @@ static struct cxl_driver cxl_port_driver = { }; module_cxl_driver(cxl_port_driver); +MODULE_DESCRIPTION("CXL: Port enumeration and services"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); MODULE_ALIAS_CXL(CXL_DEVICE_PORT); -- cgit From a3483ee7e6a7f2d12b5950246f4e0ef94f4a5df0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 18 Jun 2024 16:46:37 +0800 Subject: cxl/region: Fix a race condition in memory hotplug notifier In the memory hotplug notifier function of the CXL region, cxl_region_perf_attrs_callback(), the node ID is obtained by checking the host address range of the region. However, the address range information is not available when the region is registered in devm_cxl_add_region(). Additionally, this information may be removed or added under the protection of cxl_region_rwsem during runtime. If the memory notifier is called for nodes other than that backed by the region, a race condition may occur, potentially leading to a NULL dereference or an invalid address range. 
The race condition is addressed by checking the availability of the address range information under the protection of cxl_region_rwsem. To enhance code readability and use guard(), the relevant code has been moved into a newly added function: cxl_region_nid(). Fixes: 067353a46d8c ("cxl/region: Add memory hotplug notifier for cxl region") Signed-off-by: Huang, Ying Cc: Dan Williams Cc: Alison Schofield Cc: Andrew Morton Cc: Jonathan Cameron Cc: Dave Jiang Cc: Bharata B Rao Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Davidlohr Bueso Cc: Vishal Verma Cc: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240618084639.1419629-2-ying.huang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 3c2b6144be23..51aeef2c012c 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2304,14 +2304,25 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid) return true; } +static int cxl_region_nid(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + struct cxl_decoder *cxld; + + guard(rwsem_read)(&cxl_region_rwsem); + cxled = p->targets[0]; + if (!cxled) + return NUMA_NO_NODE; + cxld = &cxled->cxld; + return phys_to_target_node(cxld->hpa_range.start); +} + static int cxl_region_perf_attrs_callback(struct notifier_block *nb, unsigned long action, void *arg) { struct cxl_region *cxlr = container_of(nb, struct cxl_region, memory_notifier); - struct cxl_region_params *p = &cxlr->params; - struct cxl_endpoint_decoder *cxled = p->targets[0]; - struct cxl_decoder *cxld = &cxled->cxld; struct memory_notify *mnb = arg; int nid = mnb->status_change_nid; int region_nid; @@ -2319,7 +2330,7 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, if (nid == NUMA_NO_NODE || action != MEM_ONLINE) return NOTIFY_DONE; - region_nid = phys_to_target_node(cxld->hpa_range.start); + region_nid = cxl_region_nid(cxlr); if (nid != region_nid) return NOTIFY_DONE; -- cgit From 643e8e3e65290fdfe507bb23fa524c77e1345af3 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 18 Jun 2024 16:46:38 +0800 Subject: cxl/region: Support to calculate memory tier abstract distance An abstract distance value must be assigned by the driver that makes the memory available to the system. It reflects relative performance and is used to place memory nodes backed by CXL regions in the appropriate memory tiers allowing promotion/demotion within the existing memory tiering mechanism. The abstract distance is calculated based on the memory access latency and bandwidth of CXL regions. 
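The memory-tiering hook used here has a small driver-side surface: register a notifier, and when asked about a node the device backs, convert its access coordinates into an abstract distance and stop the chain. A hypothetical sketch of that shape (only the mt_*() helpers, struct access_coordinate and the notifier API are real; everything prefixed demo_ is made up):

#include <linux/memory-tiers.h>
#include <linux/node.h>
#include <linux/notifier.h>

static int demo_node;				/* NUMA node backed by this device */
static struct access_coordinate demo_perf;	/* latency/bandwidth measured by the driver */

static int demo_calc_adistance(struct notifier_block *nb, unsigned long nid,
			       void *data)
{
	int *adist = data;

	if (nid != demo_node)
		return NOTIFY_OK;	/* not our node: let other algorithms answer */

	if (mt_perf_to_adistance(&demo_perf, adist))
		return NOTIFY_OK;	/* conversion failed: keep the default */

	return NOTIFY_STOP;		/* *adist now reflects this device's performance */
}

static struct notifier_block demo_adist_nb = {
	.notifier_call	= demo_calc_adistance,
	.priority	= 100,
};

static int __init demo_adist_init(void)
{
	return register_mt_adistance_algorithm(&demo_adist_nb);
}
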
Signed-off-by: Huang, Ying Acked-by: Dan Williams Cc: Alison Schofield Cc: Andrew Morton Cc: Jonathan Cameron Cc: Dave Jiang Cc: Bharata B Rao Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Davidlohr Bueso Cc: Vishal Verma Cc: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240618084639.1419629-3-ying.huang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 27 +++++++++++++++++++++++++++ drivers/cxl/cxl.h | 2 ++ 2 files changed, 29 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 51aeef2c012c..dc15ceba7ab7 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include "core.h" @@ -2228,6 +2229,7 @@ static void unregister_region(void *_cxlr) int i; unregister_memory_notifier(&cxlr->memory_notifier); + unregister_mt_adistance_algorithm(&cxlr->adist_notifier); device_del(&cxlr->dev); /* @@ -2340,6 +2342,27 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, return NOTIFY_OK; } +static int cxl_region_calculate_adistance(struct notifier_block *nb, + unsigned long nid, void *data) +{ + struct cxl_region *cxlr = container_of(nb, struct cxl_region, + adist_notifier); + struct access_coordinate *perf; + int *adist = data; + int region_nid; + + region_nid = cxl_region_nid(cxlr); + if (nid != region_nid) + return NOTIFY_OK; + + perf = &cxlr->coord[ACCESS_COORDINATE_CPU]; + + if (mt_perf_to_adistance(perf, adist)) + return NOTIFY_OK; + + return NOTIFY_STOP; +} + /** * devm_cxl_add_region - Adds a region to a decoder * @cxlrd: root decoder @@ -2382,6 +2405,10 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, cxlr->memory_notifier.priority = CXL_CALLBACK_PRI; register_memory_notifier(&cxlr->memory_notifier); + cxlr->adist_notifier.notifier_call = cxl_region_calculate_adistance; + cxlr->adist_notifier.priority = 100; + register_mt_adistance_algorithm(&cxlr->adist_notifier); + rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); if (rc) return ERR_PTR(rc); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 603c0120cff8..f46252373159 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -522,6 +522,7 @@ struct cxl_region_params { * @params: active + config params for the region * @coord: QoS access coordinates for the region * @memory_notifier: notifier for setting the access coordinates to node + * @adist_notifier: notifier for calculating the abstract distance of node */ struct cxl_region { struct device dev; @@ -534,6 +535,7 @@ struct cxl_region { struct cxl_region_params params; struct access_coordinate coord[ACCESS_COORDINATE_MAX]; struct notifier_block memory_notifier; + struct notifier_block adist_notifier; }; struct cxl_nvdimm_bridge { -- cgit From f3d70720e92c24c22e0e63c2588636edba33eb1a Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 18 Jun 2024 16:46:39 +0800 Subject: cxl/region: Simplify cxl_region_nid() The node ID of the region can be gotten via resource start address directly. This simplifies the implementation of cxl_region_nid(). 
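Both the original and the simplified cxl_region_nid() take cxl_region_rwsem through the scope-based guard() helper rather than explicit down_read()/up_read() pairs, so every early return drops the lock automatically. A small, hypothetical sketch of that pattern (the rwsem and data here are made up):

#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/rwsem.h>

static DECLARE_RWSEM(demo_rwsem);
static int demo_value = -1;		/* only written with demo_rwsem held for write */

static int demo_read_value(void)
{
	guard(rwsem_read)(&demo_rwsem);	/* released automatically on every return path */

	if (demo_value < 0)
		return -ENOENT;
	return demo_value;
}
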
Signed-off-by: Huang Ying Suggested-by: Alison Schofield Cc: Dan Williams Cc: Andrew Morton Cc: Jonathan Cameron Cc: Dave Jiang Cc: Bharata B Rao Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Davidlohr Bueso Cc: Vishal Verma Cc: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240618084639.1419629-4-ying.huang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index dc15ceba7ab7..32473fddce03 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2309,15 +2309,13 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid) static int cxl_region_nid(struct cxl_region *cxlr) { struct cxl_region_params *p = &cxlr->params; - struct cxl_endpoint_decoder *cxled; - struct cxl_decoder *cxld; + struct resource *res; guard(rwsem_read)(&cxl_region_rwsem); - cxled = p->targets[0]; - if (!cxled) + res = p->res; + if (!res) return NUMA_NO_NODE; - cxld = &cxled->cxld; - return phys_to_target_node(cxld->hpa_range.start); + return phys_to_target_node(res->start); } static int cxl_region_perf_attrs_callback(struct notifier_block *nb, -- cgit From 67f9c312b0a7f4bc869376d2a68308e673235954 Mon Sep 17 00:00:00 2001 From: Aswin Unnikrishnan Date: Sun, 12 May 2024 11:23:20 +0000 Subject: rust: add example for `alias` argument in `module` macro documentation Add example for `alias` argument supported by `module` macro. `alias` accepts an array of alternate names for the module as string. Reviewed-by: Alice Ryhl Signed-off-by: Aswin Unnikrishnan Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20240512112324.8514-1-aswinunni01@gmail.com Signed-off-by: Miguel Ojeda --- rust/macros/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs index 520eae5fd792..aa89b41fa10e 100644 --- a/rust/macros/lib.rs +++ b/rust/macros/lib.rs @@ -35,6 +35,7 @@ use proc_macro::TokenStream; /// author: "Rust for Linux Contributors", /// description: "My very own kernel module!", /// license: "GPL", +/// alias: ["alternate_module_name"], /// } /// /// struct MyModule; -- cgit From 63249a070eb5187d5caec995d171b53e374a0741 Mon Sep 17 00:00:00 2001 From: Aswin Unnikrishnan Date: Sun, 12 May 2024 11:23:21 +0000 Subject: rust: fix datatype in docs for `module` macro arguments Remove the mention of byte array as datatype for `module` macro arguments since the arguments are defined as string, and `alias` is a string array. Signed-off-by: Aswin Unnikrishnan Reviewed-by: Vincenzo Palazzo Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20240512112324.8514-2-aswinunni01@gmail.com Signed-off-by: Miguel Ojeda --- rust/macros/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs index aa89b41fa10e..5214e07367c5 100644 --- a/rust/macros/lib.rs +++ b/rust/macros/lib.rs @@ -58,11 +58,11 @@ use proc_macro::TokenStream; /// /// # Supported argument types /// - `type`: type which implements the [`Module`] trait (required). -/// - `name`: byte array of the name of the kernel module (required). -/// - `author`: byte array of the author of the kernel module. -/// - `description`: byte array of the description of the kernel module. -/// - `license`: byte array of the license of the kernel module (required). -/// - `alias`: byte array of alias name of the kernel module. 
+/// - `name`: ASCII string literal of the name of the kernel module (required). +/// - `author`: string literal of the author of the kernel module. +/// - `description`: string literal of the description of the kernel module. +/// - `license`: ASCII string literal of the license of the kernel module (required). +/// - `alias`: array of ASCII string literals of the alias names of the kernel module. #[proc_macro] pub fn module(ts: TokenStream) -> TokenStream { module::module(ts) -- cgit From 549d3c2ffbea44fe123a67983fd8b15ab6989d8d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 1 May 2024 21:35:48 +0900 Subject: rust: add 'firmware' field support to module! macro This adds 'firmware' field support to module! macro, corresponds to MODULE_FIRMWARE macro. You can specify the file names of binary firmware that the kernel module requires. The information is embedded in the modinfo section of the kernel module. For example, a tool to build an initramfs uses this information to put the firmware files into the initramfs image. Signed-off-by: FUJITA Tomonori Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20240501123548.51769-1-fujita.tomonori@gmail.com Signed-off-by: Miguel Ojeda --- rust/macros/lib.rs | 32 ++++++++++++++++++++++++++++++++ rust/macros/module.rs | 18 ++++++++++++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs index 5214e07367c5..6dcc72aff111 100644 --- a/rust/macros/lib.rs +++ b/rust/macros/lib.rs @@ -56,6 +56,36 @@ use proc_macro::TokenStream; /// } /// ``` /// +/// ## Firmware +/// +/// The following example shows how to declare a kernel module that needs +/// to load binary firmware files. You need to specify the file names of +/// the firmware in the `firmware` field. The information is embedded +/// in the `modinfo` section of the kernel module. For example, a tool to +/// build an initramfs uses this information to put the firmware files into +/// the initramfs image. +/// +/// ```ignore +/// use kernel::prelude::*; +/// +/// module!{ +/// type: MyDeviceDriverModule, +/// name: "my_device_driver_module", +/// author: "Rust for Linux Contributors", +/// description: "My device driver requires firmware", +/// license: "GPL", +/// firmware: ["my_device_firmware1.bin", "my_device_firmware2.bin"], +/// } +/// +/// struct MyDeviceDriverModule; +/// +/// impl kernel::Module for MyDeviceDriverModule { +/// fn init() -> Result { +/// Ok(Self) +/// } +/// } +/// ``` +/// /// # Supported argument types /// - `type`: type which implements the [`Module`] trait (required). /// - `name`: ASCII string literal of the name of the kernel module (required). @@ -63,6 +93,8 @@ use proc_macro::TokenStream; /// - `description`: string literal of the description of the kernel module. /// - `license`: ASCII string literal of the license of the kernel module (required). /// - `alias`: array of ASCII string literals of the alias names of the kernel module. +/// - `firmware`: array of ASCII string literals of the firmware files of +/// the kernel module. 
#[proc_macro] pub fn module(ts: TokenStream) -> TokenStream { module::module(ts) diff --git a/rust/macros/module.rs b/rust/macros/module.rs index acd0393b5095..411dc103d82e 100644 --- a/rust/macros/module.rs +++ b/rust/macros/module.rs @@ -97,14 +97,22 @@ struct ModuleInfo { author: Option, description: Option, alias: Option>, + firmware: Option>, } impl ModuleInfo { fn parse(it: &mut token_stream::IntoIter) -> Self { let mut info = ModuleInfo::default(); - const EXPECTED_KEYS: &[&str] = - &["type", "name", "author", "description", "license", "alias"]; + const EXPECTED_KEYS: &[&str] = &[ + "type", + "name", + "author", + "description", + "license", + "alias", + "firmware", + ]; const REQUIRED_KEYS: &[&str] = &["type", "name", "license"]; let mut seen_keys = Vec::new(); @@ -131,6 +139,7 @@ impl ModuleInfo { "description" => info.description = Some(expect_string(it)), "license" => info.license = expect_string_ascii(it), "alias" => info.alias = Some(expect_string_array(it)), + "firmware" => info.firmware = Some(expect_string_array(it)), _ => panic!( "Unknown key \"{}\". Valid keys are: {:?}.", key, EXPECTED_KEYS @@ -186,6 +195,11 @@ pub(crate) fn module(ts: TokenStream) -> TokenStream { modinfo.emit("alias", &alias); } } + if let Some(firmware) = info.firmware { + for fw in firmware { + modinfo.emit("firmware", &fw); + } + } // Built-in modules also export the `file` modinfo string. let file = -- cgit From 526c539452cec6e7e65776d5807e6c66dd65d636 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 17 May 2024 19:06:15 +0200 Subject: docs: rust: introduce the new kernel.org LLVM+Rust toolchains These combined LLVM+Rust toolchains are now available, thanks to Nathan Chancellor (ClangBuiltLinux). Thus introduce them in the Rust Quick Start guide. Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240517170615.377786-1-ojeda@kernel.org Co-developed-by: Miguel Ojeda Signed-off-by: Miguel Ojeda --- Documentation/rust/quick-start.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index cc3f11e0d441..6fe69a601134 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -16,6 +16,13 @@ under names like ``rustc``, ``rust-src``, ``rust-bindgen``, etc. However, at the time of writing, they are likely not to be recent enough unless the distribution tracks the latest releases. +Prebuilt stable versions of LLVM+Rust are provided on `kernel.org +`_. These are the same slim and fast +LLVM toolchains from :ref:`Getting LLVM ` with versions of Rust +added to them that Rust for Linux supports, depending on the Linux version. Two +sets are provided: the "latest LLVM" and "matching LLVM" (please see the link +for more information). + To easily check whether the requirements are met, the following target can be used:: -- cgit From fe7d9d804337180d2377f0654537970c6cd863f7 Mon Sep 17 00:00:00 2001 From: Roland Xu Date: Thu, 23 May 2024 00:08:46 +0800 Subject: rust: kernel: make impl_has_work compatible with more generics Make the impl_has_work macro compatible with more complex generics such as lifetimes and const generic arguments. Signed-off-by: Roland Xu Link: https://lore.kernel.org/r/ME0P282MB4890A180B99490CC65EF64FDCCEB2@ME0P282MB4890.AUSP282.PROD.OUTLOOK.COM Suggested-by: Benno Lossin Link: https://github.com/Rust-for-Linux/linux/issues/1077 [ Wrapped message to 72 columns. 
- Miguel ] Signed-off-by: Miguel Ojeda --- rust/kernel/workqueue.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs index 1cec63a2aea8..553a5cba2adc 100644 --- a/rust/kernel/workqueue.rs +++ b/rust/kernel/workqueue.rs @@ -482,24 +482,26 @@ pub unsafe trait HasWork { /// use kernel::sync::Arc; /// use kernel::workqueue::{self, impl_has_work, Work}; /// -/// struct MyStruct { -/// work_field: Work, +/// struct MyStruct<'a, T, const N: usize> { +/// work_field: Work, 17>, +/// f: fn(&'a [T; N]), /// } /// /// impl_has_work! { -/// impl HasWork for MyStruct { self.work_field } +/// impl{'a, T, const N: usize} HasWork, 17> +/// for MyStruct<'a, T, N> { self.work_field } /// } /// ``` #[macro_export] macro_rules! impl_has_work { - ($(impl$(<$($implarg:ident),*>)? + ($(impl$({$($generics:tt)*})? HasWork<$work_type:ty $(, $id:tt)?> - for $self:ident $(<$($selfarg:ident),*>)? + for $self:ty { self.$field:ident } )*) => {$( // SAFETY: The implementation of `raw_get_work` only compiles if the field has the right // type. - unsafe impl$(<$($implarg),*>)? $crate::workqueue::HasWork<$work_type $(, $id)?> for $self $(<$($selfarg),*>)? { + unsafe impl$(<$($generics)+>)? $crate::workqueue::HasWork<$work_type $(, $id)?> for $self { const OFFSET: usize = ::core::mem::offset_of!(Self, $field) as usize; #[inline] @@ -515,7 +517,7 @@ macro_rules! impl_has_work { pub use impl_has_work; impl_has_work! { - impl HasWork for ClosureWork { self.work } + impl{T} HasWork for ClosureWork { self.work } } unsafe impl WorkItemPointer for Arc -- cgit From 9ffc80c819739ab60c42223c46b7351cec6a0e97 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 28 May 2024 18:35:02 +0200 Subject: kbuild: rust: remove now-unneeded `rusttest` custom sysroot handling Since we dropped our custom `alloc` in commit 9d0441bab775 ("rust: alloc: remove our fork of the `alloc` crate"), there is no need anymore to keep the custom sysroot hack. Thus delete it, which makes the target way simpler and faster too. This also means we are not using Cargo for anything at the moment, and that no download is required anymore, so update the main `Makefile` and the documentation accordingly. Link: https://lore.kernel.org/r/20240528163502.411600-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Documentation/rust/quick-start.rst | 14 -------- Documentation/rust/testing.rst | 5 ++- Makefile | 3 +- rust/Makefile | 70 +++++--------------------------------- 4 files changed, 11 insertions(+), 81 deletions(-) diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index 6fe69a601134..ac2f16288458 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -171,20 +171,6 @@ can be installed manually:: The standalone installers also come with ``clippy``. -cargo -***** - -``cargo`` is the Rust native build system. It is currently required to run -the tests since it is used to build a custom standard library that contains -the facilities provided by the custom ``alloc`` in the kernel. The tests can -be run using the ``rusttest`` Make target. - -If ``rustup`` is being used, all the profiles already install the tool, -thus nothing needs to be done. - -The standalone installers also come with ``cargo``. 
- - rustdoc ******* diff --git a/Documentation/rust/testing.rst b/Documentation/rust/testing.rst index acfd0c2be48d..568b71b415a4 100644 --- a/Documentation/rust/testing.rst +++ b/Documentation/rust/testing.rst @@ -131,9 +131,8 @@ Additionally, there are the ``#[test]`` tests. These can be run using the make LLVM=1 rusttest -This requires the kernel ``.config`` and downloads external repositories. It -runs the ``#[test]`` tests on the host (currently) and thus is fairly limited in -what these tests can test. +This requires the kernel ``.config``. It runs the ``#[test]`` tests on the host +(currently) and thus is fairly limited in what these tests can test. The Kselftests -------------- diff --git a/Makefile b/Makefile index b25b5b44af10..d4134a9f32b6 100644 --- a/Makefile +++ b/Makefile @@ -507,7 +507,6 @@ RUSTDOC = rustdoc RUSTFMT = rustfmt CLIPPY_DRIVER = clippy-driver BINDGEN = bindgen -CARGO = cargo PAHOLE = pahole RESOLVE_BTFIDS = $(objtree)/tools/bpf/resolve_btfids/resolve_btfids LEX = flex @@ -601,7 +600,7 @@ endif export RUSTC_BOOTSTRAP := 1 export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC HOSTPKG_CONFIG -export RUSTC RUSTDOC RUSTFMT RUSTC_OR_CLIPPY_QUIET RUSTC_OR_CLIPPY BINDGEN CARGO +export RUSTC RUSTDOC RUSTFMT RUSTC_OR_CLIPPY_QUIET RUSTC_OR_CLIPPY BINDGEN export HOSTRUSTC KBUILD_HOSTRUSTFLAGS export CPP AR NM STRIP OBJCOPY OBJDUMP READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX diff --git a/rust/Makefile b/rust/Makefile index f70d5e244fee..385378311322 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -44,17 +44,10 @@ rustc_sysroot := $(shell MAKEFLAGS= $(RUSTC) $(rust_flags) --print sysroot) rustc_host_target := $(shell $(RUSTC) --version --verbose | grep -F 'host: ' | cut -d' ' -f2) RUST_LIB_SRC ?= $(rustc_sysroot)/lib/rustlib/src/rust/library -ifeq ($(quiet),silent_) -cargo_quiet=-q +ifneq ($(quiet),) rust_test_quiet=-q rustdoc_test_quiet=--test-args -q rustdoc_test_kernel_quiet=>/dev/null -else ifeq ($(quiet),quiet_) -rust_test_quiet=-q -rustdoc_test_quiet=--test-args -q -rustdoc_test_kernel_quiet=>/dev/null -else -cargo_quiet=--verbose endif core-cfgs = \ @@ -135,22 +128,21 @@ quiet_cmd_rustc_test_library = RUSTC TL $< @$(objtree)/include/generated/rustc_cfg $(rustc_target_flags) \ --crate-type $(if $(rustc_test_library_proc),proc-macro,rlib) \ --out-dir $(objtree)/$(obj)/test --cfg testlib \ - --sysroot $(objtree)/$(obj)/test/sysroot \ -L$(objtree)/$(obj)/test \ --crate-name $(subst rusttest-,,$(subst rusttestlib-,,$@)) $< -rusttestlib-build_error: $(src)/build_error.rs rusttest-prepare FORCE +rusttestlib-build_error: $(src)/build_error.rs FORCE +$(call if_changed,rustc_test_library) rusttestlib-macros: private rustc_target_flags = --extern proc_macro rusttestlib-macros: private rustc_test_library_proc = yes -rusttestlib-macros: $(src)/macros/lib.rs rusttest-prepare FORCE +rusttestlib-macros: $(src)/macros/lib.rs FORCE +$(call if_changed,rustc_test_library) -rusttestlib-bindings: $(src)/bindings/lib.rs rusttest-prepare FORCE +rusttestlib-bindings: $(src)/bindings/lib.rs FORCE +$(call if_changed,rustc_test_library) -rusttestlib-uapi: $(src)/uapi/lib.rs rusttest-prepare FORCE +rusttestlib-uapi: $(src)/uapi/lib.rs FORCE +$(call if_changed,rustc_test_library) quiet_cmd_rustdoc_test = RUSTDOC T $< @@ -159,7 +151,7 @@ quiet_cmd_rustdoc_test = RUSTDOC T $< $(RUSTDOC) --test $(rust_common_flags) \ @$(objtree)/include/generated/rustc_cfg \ $(rustc_target_flags) 
$(rustdoc_test_target_flags) \ - --sysroot $(objtree)/$(obj)/test/sysroot $(rustdoc_test_quiet) \ + $(rustdoc_test_quiet) \ -L$(objtree)/$(obj)/test --output $(rustdoc_output) \ --crate-name $(subst rusttest-,,$@) $< @@ -192,7 +184,6 @@ quiet_cmd_rustc_test = RUSTC T $< $(RUSTC) --test $(rust_common_flags) \ @$(objtree)/include/generated/rustc_cfg \ $(rustc_target_flags) --out-dir $(objtree)/$(obj)/test \ - --sysroot $(objtree)/$(obj)/test/sysroot \ -L$(objtree)/$(obj)/test \ --crate-name $(subst rusttest-,,$@) $<; \ $(objtree)/$(obj)/test/$(subst rusttest-,,$@) $(rust_test_quiet) \ @@ -200,60 +191,15 @@ quiet_cmd_rustc_test = RUSTC T $< rusttest: rusttest-macros rusttest-kernel -# This prepares a custom sysroot with our custom `alloc` instead of -# the standard one. -# -# This requires several hacks: -# - Unlike `core` and `alloc`, `std` depends on more than a dozen crates, -# including third-party crates that need to be downloaded, plus custom -# `build.rs` steps. Thus hardcoding things here is not maintainable. -# - `cargo` knows how to build the standard library, but it is an unstable -# feature so far (`-Zbuild-std`). -# - `cargo` only considers the use case of building the standard library -# to use it in a given package. Thus we need to create a dummy package -# and pick the generated libraries from there. -# - The usual ways of modifying the dependency graph in `cargo` do not seem -# to apply for the `-Zbuild-std` steps, thus we have to mislead it -# by modifying the sources in the sysroot. -# - To avoid messing with the user's Rust installation, we create a clone -# of the sysroot. However, `cargo` ignores `RUSTFLAGS` in the `-Zbuild-std` -# steps, thus we use a wrapper binary passed via `RUSTC` to pass the flag. -# -# In the future, we hope to avoid the whole ordeal by either: -# - Making the `test` crate not depend on `std` (either improving upstream -# or having our own custom crate). -# - Making the tests run in kernel space (requires the previous point). -# - Making `std` and friends be more like a "normal" crate, so that -# `-Zbuild-std` and related hacks are not needed. 
-quiet_cmd_rustsysroot = RUSTSYSROOT - cmd_rustsysroot = \ - rm -rf $(objtree)/$(obj)/test; \ - mkdir -p $(objtree)/$(obj)/test; \ - cp -a $(rustc_sysroot) $(objtree)/$(obj)/test/sysroot; \ - echo '\#!/bin/sh' > $(objtree)/$(obj)/test/rustc_sysroot; \ - echo "$(RUSTC) --sysroot=$(abspath $(objtree)/$(obj)/test/sysroot) \"\$$@\"" \ - >> $(objtree)/$(obj)/test/rustc_sysroot; \ - chmod u+x $(objtree)/$(obj)/test/rustc_sysroot; \ - $(CARGO) -q new $(objtree)/$(obj)/test/dummy; \ - RUSTC=$(objtree)/$(obj)/test/rustc_sysroot $(CARGO) $(cargo_quiet) \ - test -Zbuild-std --target $(rustc_host_target) \ - --manifest-path $(objtree)/$(obj)/test/dummy/Cargo.toml; \ - rm $(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib/*; \ - cp $(objtree)/$(obj)/test/dummy/target/$(rustc_host_target)/debug/deps/* \ - $(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib - -rusttest-prepare: FORCE - +$(call if_changed,rustsysroot) - rusttest-macros: private rustc_target_flags = --extern proc_macro rusttest-macros: private rustdoc_test_target_flags = --crate-type proc-macro -rusttest-macros: $(src)/macros/lib.rs rusttest-prepare FORCE +rusttest-macros: $(src)/macros/lib.rs FORCE +$(call if_changed,rustc_test) +$(call if_changed,rustdoc_test) rusttest-kernel: private rustc_target_flags = --extern alloc \ --extern build_error --extern macros --extern bindings --extern uapi -rusttest-kernel: $(src)/kernel/lib.rs rusttest-prepare \ +rusttest-kernel: $(src)/kernel/lib.rs \ rusttestlib-build_error rusttestlib-macros rusttestlib-bindings \ rusttestlib-uapi FORCE +$(call if_changed,rustc_test) -- cgit From d3ee24cce4e558041cbf21a5bc9cd9a4b1b3c7e7 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Wed, 29 May 2024 08:34:52 +0000 Subject: rust: alloc: fix typo in docs for GFP_NOWAIT Fix a typo in alloc.rs by replacing Ror with For. Signed-off-by: Alice Ryhl Reviewed-by: Vincenzo Palazzo Link: https://lore.kernel.org/r/20240529083452.779865-1-aliceryhl@google.com Signed-off-by: Miguel Ojeda --- rust/kernel/alloc.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs index 531b5e471cb1..c44995914f75 100644 --- a/rust/kernel/alloc.rs +++ b/rust/kernel/alloc.rs @@ -66,7 +66,7 @@ pub mod flags { /// The same as [`GFP_KERNEL`], except the allocation is accounted to kmemcg. pub const GFP_KERNEL_ACCOUNT: Flags = Flags(bindings::GFP_KERNEL_ACCOUNT); - /// Ror kernel allocations that should not stall for direct reclaim, start physical IO or + /// For kernel allocations that should not stall for direct reclaim, start physical IO or /// use any filesystem callback. It is very likely to fail to allocate memory, even for very /// small allocations. pub const GFP_NOWAIT: Flags = Flags(bindings::GFP_NOWAIT); -- cgit From ab44079e2869c9f1743d167d5f9b5befe375b6d9 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Fri, 7 Jun 2024 08:23:41 +0000 Subject: rust: alloc: add __GFP_HIGHMEM flag Make it possible to allocate memory that doesn't need to mapped into the kernel's address space. This flag is useful together with Page::alloc_page [1]. Rust Binder needs this for the memory that holds incoming transactions for each process. Each process will have a few megabytes of memory allocated with this flag, which is mapped into the process using vm_insert_page. When the kernel copies data for an incoming transaction into a process's memory region, it will use kmap_local_page to temporarily map pages that are being modified. 
There is no need for them to take up address space in the kernel when the kernel is not writing an incoming transaction into the page. Link: https://lore.kernel.org/all/20240528-alice-mm-v7-4-78222c31b8f4@google.com/ [1] Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20240607-highmem-v1-1-d18c5ca4072f@google.com Signed-off-by: Miguel Ojeda --- rust/bindings/bindings_helper.h | 1 + rust/kernel/alloc.rs | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index ddb5644d4fd9..52a1412338ef 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -25,3 +25,4 @@ const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL; const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT; const gfp_t RUST_CONST_HELPER_GFP_NOWAIT = GFP_NOWAIT; const gfp_t RUST_CONST_HELPER___GFP_ZERO = __GFP_ZERO; +const gfp_t RUST_CONST_HELPER___GFP_HIGHMEM = ___GFP_HIGHMEM; diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs index c44995914f75..396fe5a85a8f 100644 --- a/rust/kernel/alloc.rs +++ b/rust/kernel/alloc.rs @@ -52,6 +52,14 @@ pub mod flags { /// This is normally or'd with other flags. pub const __GFP_ZERO: Flags = Flags(bindings::__GFP_ZERO); + /// Allow the allocation to be in high memory. + /// + /// Allocations in high memory may not be mapped into the kernel's address space, so this can't + /// be used with `kmalloc` and other similar methods. + /// + /// This is normally or'd with other flags. + pub const __GFP_HIGHMEM: Flags = Flags(bindings::__GFP_HIGHMEM); + /// Users can not sleep and need the allocation to succeed. /// /// A lower watermark is applied to allow access to "atomic reserves". The current -- cgit From b63c455d38be5f62a0665f3080c67334db5b4c41 Mon Sep 17 00:00:00 2001 From: Dirk Behme Date: Mon, 10 Jun 2024 14:23:32 +0200 Subject: docs: rust: no_std is used Using the #![no_std] attribute in the Rust kernel support is different to the default Rust usage. Mention this in the Documentation. Signed-off-by: Dirk Behme Reviewed-by: Trevor Gross Link: https://lore.kernel.org/r/20240610122332.3858571-1-dirk.behme@de.bosch.com [ Avoided breaking links in two lines. - Miguel ] Signed-off-by: Miguel Ojeda --- Documentation/rust/general-information.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/rust/general-information.rst b/Documentation/rust/general-information.rst index 4bb6ac12d482..e3f388ef4ee4 100644 --- a/Documentation/rust/general-information.rst +++ b/Documentation/rust/general-information.rst @@ -7,6 +7,14 @@ This document contains useful information to know when working with the Rust support in the kernel. +``no_std`` +---------- + +The Rust support in the kernel can link only `core `_, +but not `std `_. Crates for use in the +kernel must opt into this behavior using the ``#![no_std]`` attribute. + + Code documentation ------------------ -- cgit From 6dc9d9ca9a728c8b30976d3ca353563ba0bfb949 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 27 Jun 2024 17:43:55 -0700 Subject: kbuild: rust-analyzer: better error handling 1) Provide a better error message for the "Rust not available" case. Without this patch, one gets various misleading messages, such as: "No rule to make target 'rust-analyzer'" Instead, run scripts/rust_is_available.sh directly, as a prerequisite, and let that script report the cause of any problems, as well as providing a link to the documentation. 
Thanks to Miguel Ojeda for the idea of just letting rust_is_available.sh report its results directly. The new output in the failure case looks like this: $ make rust-analyzer *** *** Rust compiler 'rustc' could not be found. *** *** *** Please see Documentation/rust/quick-start.rst for details *** on how to set up the Rust support. *** make[1]: *** [/kernel_work/linux-github/Makefile:1975: rust-analyzer] Error 1 make: *** [Makefile:240: __sub-make] Error 2 Reviewed-by: Finn Behrens Reviewed-by: Alice Ryhl Tested-by: Alice Ryhl Signed-off-by: John Hubbard Link: https://lore.kernel.org/r/20240628004356.1384486-2-jhubbard@nvidia.com [ Reworded title. - Miguel ] Signed-off-by: Miguel Ojeda --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index d4134a9f32b6..de5be3845c70 100644 --- a/Makefile +++ b/Makefile @@ -1969,6 +1969,7 @@ tags TAGS cscope gtags: FORCE # IDE support targets PHONY += rust-analyzer rust-analyzer: + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh $(Q)$(MAKE) $(build)=rust $@ # Script to generate missing namespace dependencies -- cgit From 5045b460843adf5bb31774b9464df5ae8e4da0d4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 27 Jun 2024 17:43:56 -0700 Subject: kbuild: rust-analyzer: improve comment documentation Replace the cryptic phrase ("IDE support targets") that initially appears to be about how to support old hard drives, with a few sentences that explain what "make rust-analyzer" provides. Signed-off-by: John Hubbard Reviewed-by: Finn Behrens Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20240628004356.1384486-3-jhubbard@nvidia.com [ Reworded title. - Miguel ] Signed-off-by: Miguel Ojeda --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index de5be3845c70..fea263aaa492 100644 --- a/Makefile +++ b/Makefile @@ -1966,7 +1966,9 @@ quiet_cmd_tags = GEN $@ tags TAGS cscope gtags: FORCE $(call cmd,tags) -# IDE support targets +# Generate rust-project.json (a file that describes the structure of non-Cargo +# Rust projects) for rust-analyzer (an implementation of the Language Server +# Protocol). PHONY += rust-analyzer rust-analyzer: $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh -- cgit From 1b580e7b9ba2e5939c4b94da2cb4888605b39955 Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Tue, 28 May 2024 14:58:02 +0000 Subject: rust: uaccess: add userspace pointers A pointer to an area in userspace memory, which can be either read-only or read-write. All methods on this struct are safe: attempting to read or write on bad addresses (either out of the bound of the slice or unmapped addresses) will return `EFAULT`. Concurrent access, *including data races to/from userspace memory*, is permitted, because fundamentally another userspace thread/process could always be modifying memory at the same time (in the same way that userspace Rust's `std::io` permits data races with the contents of files on disk). In the presence of a race, the exact byte values read/written are unspecified but the operation is well-defined. Kernelspace code should validate its copy of data after completing a read, and not expect that multiple reads of the same address will return the same value. These APIs are designed to make it difficult to accidentally write TOCTOU bugs. Every time you read from a memory location, the pointer is advanced by the length so that you cannot use that reader to read the same memory location twice. Preventing double-fetches avoids TOCTOU bugs. 
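As an illustration of the forward-only model just described, a hypothetical helper in the style of the doctests added further below: the UserSlice is consumed into a reader and read_all() consumes the reader, so the same user bytes cannot be fetched twice by accident (GFP_KERNEL is assumed to be in scope via the kernel prelude, as in the patch's own examples).

use alloc::vec::Vec;
use kernel::prelude::*;
use kernel::uaccess::{UserPtr, UserSlice};

/// Copies `len` bytes from userspace exactly once.  A deliberate second
/// fetch would require `clone_reader()` or a fresh `UserSlice`, keeping any
/// double fetch visible at the call site.
fn copy_in(uptr: UserPtr, len: usize) -> Result<Vec<u8>> {
    let reader = UserSlice::new(uptr, len).reader();

    let mut buf = Vec::new();
    reader.read_all(&mut buf, GFP_KERNEL)?; // `reader` is consumed here
    Ok(buf)
}
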
This is accomplished by taking `self` by value to prevent obtaining multiple readers on a given `UserSlice`, and the readers only permitting forward reads. If double-fetching a memory location is necessary for some reason, then that is done by creating multiple readers to the same memory location. Constructing a `UserSlice` performs no checks on the provided address and length, it can safely be constructed inside a kernel thread with no current userspace process. Reads and writes wrap the kernel APIs `copy_from_user` and `copy_to_user`, which check the memory map of the current process and enforce that the address range is within the user range (no additional calls to `access_ok` are needed). This code is based on something that was originally written by Wedson on the old rust branch. It was modified by Alice by removing the `IoBufferReader` and `IoBufferWriter` traits, and various other changes. Signed-off-by: Wedson Almeida Filho Reviewed-by: Benno Lossin Reviewed-by: Trevor Gross Reviewed-by: Boqun Feng Co-developed-by: Alice Ryhl Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20240528-alice-mm-v7-1-78222c31b8f4@google.com [ Wrapped docs to 100 and added a few intra-doc links. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/helpers.c | 14 +++ rust/kernel/lib.rs | 1 + rust/kernel/uaccess.rs | 313 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 328 insertions(+) create mode 100644 rust/kernel/uaccess.rs diff --git a/rust/helpers.c b/rust/helpers.c index 2c37a0f5d7a8..d6abe4bd45d9 100644 --- a/rust/helpers.c +++ b/rust/helpers.c @@ -39,6 +39,20 @@ __noreturn void rust_helper_BUG(void) } EXPORT_SYMBOL_GPL(rust_helper_BUG); +unsigned long rust_helper_copy_from_user(void *to, const void __user *from, + unsigned long n) +{ + return copy_from_user(to, from, n); +} +EXPORT_SYMBOL_GPL(rust_helper_copy_from_user); + +unsigned long rust_helper_copy_to_user(void __user *to, const void *from, + unsigned long n) +{ + return copy_to_user(to, from, n); +} +EXPORT_SYMBOL_GPL(rust_helper_copy_to_user); + void rust_helper_mutex_lock(struct mutex *lock) { mutex_lock(lock); diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index fbd91a48ff8b..767026db068e 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -45,6 +45,7 @@ pub mod sync; pub mod task; pub mod time; pub mod types; +pub mod uaccess; pub mod workqueue; #[doc(hidden)] diff --git a/rust/kernel/uaccess.rs b/rust/kernel/uaccess.rs new file mode 100644 index 000000000000..55f0d7ecfa3f --- /dev/null +++ b/rust/kernel/uaccess.rs @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Slices to user space memory regions. +//! +//! C header: [`include/linux/uaccess.h`](srctree/include/linux/uaccess.h) + +use crate::{alloc::Flags, bindings, error::Result, prelude::*}; +use alloc::vec::Vec; +use core::ffi::{c_ulong, c_void}; +use core::mem::MaybeUninit; + +/// The type used for userspace addresses. +pub type UserPtr = usize; + +/// A pointer to an area in userspace memory, which can be either read-only or read-write. +/// +/// All methods on this struct are safe: attempting to read or write on bad addresses (either out of +/// the bound of the slice or unmapped addresses) will return [`EFAULT`]. Concurrent access, +/// *including data races to/from userspace memory*, is permitted, because fundamentally another +/// userspace thread/process could always be modifying memory at the same time (in the same way that +/// userspace Rust's [`std::io`] permits data races with the contents of files on disk). 
In the +/// presence of a race, the exact byte values read/written are unspecified but the operation is +/// well-defined. Kernelspace code should validate its copy of data after completing a read, and not +/// expect that multiple reads of the same address will return the same value. +/// +/// These APIs are designed to make it difficult to accidentally write TOCTOU (time-of-check to +/// time-of-use) bugs. Every time a memory location is read, the reader's position is advanced by +/// the read length and the next read will start from there. This helps prevent accidentally reading +/// the same location twice and causing a TOCTOU bug. +/// +/// Creating a [`UserSliceReader`] and/or [`UserSliceWriter`] consumes the `UserSlice`, helping +/// ensure that there aren't multiple readers or writers to the same location. +/// +/// If double-fetching a memory location is necessary for some reason, then that is done by creating +/// multiple readers to the same memory location, e.g. using [`clone_reader`]. +/// +/// # Examples +/// +/// Takes a region of userspace memory from the current process, and modify it by adding one to +/// every byte in the region. +/// +/// ```no_run +/// use alloc::vec::Vec; +/// use core::ffi::c_void; +/// use kernel::error::Result; +/// use kernel::uaccess::{UserPtr, UserSlice}; +/// +/// fn bytes_add_one(uptr: UserPtr, len: usize) -> Result<()> { +/// let (read, mut write) = UserSlice::new(uptr, len).reader_writer(); +/// +/// let mut buf = Vec::new(); +/// read.read_all(&mut buf, GFP_KERNEL)?; +/// +/// for b in &mut buf { +/// *b = b.wrapping_add(1); +/// } +/// +/// write.write_slice(&buf)?; +/// Ok(()) +/// } +/// ``` +/// +/// Example illustrating a TOCTOU (time-of-check to time-of-use) bug. +/// +/// ```no_run +/// use alloc::vec::Vec; +/// use core::ffi::c_void; +/// use kernel::error::{code::EINVAL, Result}; +/// use kernel::uaccess::{UserPtr, UserSlice}; +/// +/// /// Returns whether the data in this region is valid. +/// fn is_valid(uptr: UserPtr, len: usize) -> Result { +/// let read = UserSlice::new(uptr, len).reader(); +/// +/// let mut buf = Vec::new(); +/// read.read_all(&mut buf, GFP_KERNEL)?; +/// +/// todo!() +/// } +/// +/// /// Returns the bytes behind this user pointer if they are valid. +/// fn get_bytes_if_valid(uptr: UserPtr, len: usize) -> Result> { +/// if !is_valid(uptr, len)? { +/// return Err(EINVAL); +/// } +/// +/// let read = UserSlice::new(uptr, len).reader(); +/// +/// let mut buf = Vec::new(); +/// read.read_all(&mut buf, GFP_KERNEL)?; +/// +/// // THIS IS A BUG! The bytes could have changed since we checked them. +/// // +/// // To avoid this kind of bug, don't call `UserSlice::new` multiple +/// // times with the same address. +/// Ok(buf) +/// } +/// ``` +/// +/// [`std::io`]: https://doc.rust-lang.org/std/io/index.html +/// [`clone_reader`]: UserSliceReader::clone_reader +pub struct UserSlice { + ptr: UserPtr, + length: usize, +} + +impl UserSlice { + /// Constructs a user slice from a raw pointer and a length in bytes. + /// + /// Constructing a [`UserSlice`] performs no checks on the provided address and length, it can + /// safely be constructed inside a kernel thread with no current userspace process. Reads and + /// writes wrap the kernel APIs `copy_from_user` and `copy_to_user`, which check the memory map + /// of the current process and enforce that the address range is within the user range (no + /// additional calls to `access_ok` are needed). 
Validity of the pointer is checked when you + /// attempt to read or write, not in the call to `UserSlice::new`. + /// + /// Callers must be careful to avoid time-of-check-time-of-use (TOCTOU) issues. The simplest way + /// is to create a single instance of [`UserSlice`] per user memory block as it reads each byte + /// at most once. + pub fn new(ptr: UserPtr, length: usize) -> Self { + UserSlice { ptr, length } + } + + /// Reads the entirety of the user slice, appending it to the end of the provided buffer. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address. + pub fn read_all(self, buf: &mut Vec, flags: Flags) -> Result { + self.reader().read_all(buf, flags) + } + + /// Constructs a [`UserSliceReader`]. + pub fn reader(self) -> UserSliceReader { + UserSliceReader { + ptr: self.ptr, + length: self.length, + } + } + + /// Constructs a [`UserSliceWriter`]. + pub fn writer(self) -> UserSliceWriter { + UserSliceWriter { + ptr: self.ptr, + length: self.length, + } + } + + /// Constructs both a [`UserSliceReader`] and a [`UserSliceWriter`]. + /// + /// Usually when this is used, you will first read the data, and then overwrite it afterwards. + pub fn reader_writer(self) -> (UserSliceReader, UserSliceWriter) { + ( + UserSliceReader { + ptr: self.ptr, + length: self.length, + }, + UserSliceWriter { + ptr: self.ptr, + length: self.length, + }, + ) + } +} + +/// A reader for [`UserSlice`]. +/// +/// Used to incrementally read from the user slice. +pub struct UserSliceReader { + ptr: UserPtr, + length: usize, +} + +impl UserSliceReader { + /// Skip the provided number of bytes. + /// + /// Returns an error if skipping more than the length of the buffer. + pub fn skip(&mut self, num_skip: usize) -> Result { + // Update `self.length` first since that's the fallible part of this operation. + self.length = self.length.checked_sub(num_skip).ok_or(EFAULT)?; + self.ptr = self.ptr.wrapping_add(num_skip); + Ok(()) + } + + /// Create a reader that can access the same range of data. + /// + /// Reading from the clone does not advance the current reader. + /// + /// The caller should take care to not introduce TOCTOU issues, as described in the + /// documentation for [`UserSlice`]. + pub fn clone_reader(&self) -> UserSliceReader { + UserSliceReader { + ptr: self.ptr, + length: self.length, + } + } + + /// Returns the number of bytes left to be read from this reader. + /// + /// Note that even reading less than this number of bytes may fail. + pub fn len(&self) -> usize { + self.length + } + + /// Returns `true` if no data is available in the io buffer. + pub fn is_empty(&self) -> bool { + self.length == 0 + } + + /// Reads raw data from the user slice into a kernel buffer. + /// + /// For a version that uses `&mut [u8]`, please see [`UserSliceReader::read_slice`]. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of + /// bounds of this [`UserSliceReader`]. This call may modify `out` even if it returns an error. + /// + /// # Guarantees + /// + /// After a successful call to this method, all bytes in `out` are initialized. + pub fn read_raw(&mut self, out: &mut [MaybeUninit]) -> Result { + let len = out.len(); + let out_ptr = out.as_mut_ptr().cast::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + // SAFETY: `out_ptr` points into a mutable slice of length `len_ulong`, so we may write + // that many bytes to it. 
+ let res = + unsafe { bindings::copy_from_user(out_ptr, self.ptr as *const c_void, len_ulong) }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + Ok(()) + } + + /// Reads raw data from the user slice into a kernel buffer. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of + /// bounds of this [`UserSliceReader`]. This call may modify `out` even if it returns an error. + pub fn read_slice(&mut self, out: &mut [u8]) -> Result { + // SAFETY: The types are compatible and `read_raw` doesn't write uninitialized bytes to + // `out`. + let out = unsafe { &mut *(out as *mut [u8] as *mut [MaybeUninit]) }; + self.read_raw(out) + } + + /// Reads the entirety of the user slice, appending it to the end of the provided buffer. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address. + pub fn read_all(mut self, buf: &mut Vec, flags: Flags) -> Result { + let len = self.length; + VecExt::::reserve(buf, len, flags)?; + + // The call to `try_reserve` was successful, so the spare capacity is at least `len` bytes + // long. + self.read_raw(&mut buf.spare_capacity_mut()[..len])?; + + // SAFETY: Since the call to `read_raw` was successful, so the next `len` bytes of the + // vector have been initialized. + unsafe { buf.set_len(buf.len() + len) }; + Ok(()) + } +} + +/// A writer for [`UserSlice`]. +/// +/// Used to incrementally write into the user slice. +pub struct UserSliceWriter { + ptr: UserPtr, + length: usize, +} + +impl UserSliceWriter { + /// Returns the amount of space remaining in this buffer. + /// + /// Note that even writing less than this number of bytes may fail. + pub fn len(&self) -> usize { + self.length + } + + /// Returns `true` if no more data can be written to this buffer. + pub fn is_empty(&self) -> bool { + self.length == 0 + } + + /// Writes raw data to this user pointer from a kernel buffer. + /// + /// Fails with [`EFAULT`] if the write happens on a bad address, or if the write goes out of + /// bounds of this [`UserSliceWriter`]. This call may modify the associated userspace slice even + /// if it returns an error. + pub fn write_slice(&mut self, data: &[u8]) -> Result { + let len = data.len(); + let data_ptr = data.as_ptr().cast::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + // SAFETY: `data_ptr` points into an immutable slice of length `len_ulong`, so we may read + // that many bytes from it. + let res = unsafe { bindings::copy_to_user(self.ptr as *mut c_void, data_ptr, len_ulong) }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + Ok(()) + } +} -- cgit From 1f9a8286bc0c3df7d789ea625d9d9db3d7779f2d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 14:58:03 +0000 Subject: uaccess: always export _copy_[from|to]_user with CONFIG_RUST Rust code needs to be able to access _copy_from_user and _copy_to_user so that it can skip the check_copy_size check in cases where the length is known at compile-time, mirroring the logic for when C code will skip check_copy_size. To do this, we ensure that exported versions of these methods are available when CONFIG_RUST is enabled. Alice has verified that this patch passes the CONFIG_TEST_USER_COPY test on x86 using the Android cuttlefish emulator. 
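As a rough sketch (not code from this patch; it assumes the `kernel` crate context where `bindings`, `Result` and `EFAULT` are in scope, as in rust/kernel/uaccess.rs), a fixed-size read can then go through the exported symbol directly, which is what the typed accessors added later in this series do:

    use core::ffi::{c_ulong, c_void};
    use core::mem::{size_of, MaybeUninit};

    fn read_u64_from_user(uptr: usize) -> Result<u64> {
        let mut out = MaybeUninit::<u64>::uninit();
        // The length is the compile-time constant size_of::<u64>(), so
        // check_copy_size is deliberately skipped by calling the exported
        // _copy_from_user instead of copy_from_user.
        //
        // SAFETY: `out` is valid for writing `size_of::<u64>()` bytes.
        let res = unsafe {
            bindings::_copy_from_user(
                out.as_mut_ptr().cast::<c_void>(),
                uptr as *const c_void,
                size_of::<u64>() as c_ulong,
            )
        };
        if res != 0 {
            return Err(EFAULT);
        }
        // SAFETY: the copy above initialized every byte of `out`.
        Ok(unsafe { out.assume_init() })
    }
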
Signed-off-by: Arnd Bergmann Tested-by: Alice Ryhl Reviewed-by: Boqun Feng Reviewed-by: Kees Cook Signed-off-by: Alice Ryhl Acked-by: Andrew Morton Link: https://lore.kernel.org/r/20240528-alice-mm-v7-2-78222c31b8f4@google.com Signed-off-by: Miguel Ojeda --- include/linux/uaccess.h | 46 ++++++++++++++++++++++++++++++++-------------- lib/usercopy.c | 30 ++++-------------------------- 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 3064314f4832..d8e4105a2f21 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -138,13 +139,26 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) return raw_copy_to_user(to, from, n); } -#ifdef INLINE_COPY_FROM_USER +/* + * Architectures that #define INLINE_COPY_TO_USER use this function + * directly in the normal copy_to/from_user(), the other ones go + * through an extern _copy_to/from_user(), which expands the same code + * here. + * + * Rust code always uses the extern definition. + */ static inline __must_check unsigned long -_copy_from_user(void *to, const void __user *from, unsigned long n) +_inline_copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { + /* + * Ensure that bad access_ok() speculation will not + * lead to nasty side effects *after* the copy is + * finished: + */ + barrier_nospec(); instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); @@ -153,14 +167,11 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) memset(to + (n - res), 0, res); return res; } -#else extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); -#endif -#ifdef INLINE_COPY_TO_USER static inline __must_check unsigned long -_copy_to_user(void __user *to, const void *from, unsigned long n) +_inline_copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) @@ -171,25 +182,32 @@ _copy_to_user(void __user *to, const void *from, unsigned long n) } return n; } -#else extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); -#endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { - if (check_copy_size(to, n, false)) - n = _copy_from_user(to, from, n); - return n; + if (!check_copy_size(to, n, false)) + return n; +#ifdef INLINE_COPY_FROM_USER + return _inline_copy_from_user(to, from, n); +#else + return _copy_from_user(to, from, n); +#endif } static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { - if (check_copy_size(from, n, true)) - n = _copy_to_user(to, from, n); - return n; + if (!check_copy_size(from, n, true)) + return n; + +#ifdef INLINE_COPY_TO_USER + return _inline_copy_to_user(to, from, n); +#else + return _copy_to_user(to, from, n); +#endif } #ifndef copy_mc_to_kernel diff --git a/lib/usercopy.c b/lib/usercopy.c index 499a7a7d54db..7b17b83c8042 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -12,40 +12,18 @@ /* out-of-line parts */ -#ifndef INLINE_COPY_FROM_USER +#if !defined(INLINE_COPY_FROM_USER) || defined(CONFIG_RUST) unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { - unsigned long res = 
n; - might_fault(); - if (!should_fail_usercopy() && likely(access_ok(from, n))) { - /* - * Ensure that bad access_ok() speculation will not - * lead to nasty side effects *after* the copy is - * finished: - */ - barrier_nospec(); - instrument_copy_from_user_before(to, from, n); - res = raw_copy_from_user(to, from, n); - instrument_copy_from_user_after(to, from, n, res); - } - if (unlikely(res)) - memset(to + (n - res), 0, res); - return res; + return _inline_copy_from_user(to, from, n); } EXPORT_SYMBOL(_copy_from_user); #endif -#ifndef INLINE_COPY_TO_USER +#if !defined(INLINE_COPY_TO_USER) || defined(CONFIG_RUST) unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { - might_fault(); - if (should_fail_usercopy()) - return n; - if (likely(access_ok(to, n))) { - instrument_copy_to_user(to, from, n); - n = raw_copy_to_user(to, from, n); - } - return n; + return _inline_copy_to_user(to, from, n); } EXPORT_SYMBOL(_copy_to_user); #endif -- cgit From b33bf37adbb2ae35881e7fdd997ce3334d71b129 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 28 May 2024 14:58:04 +0000 Subject: rust: uaccess: add typed accessors for userspace pointers Add safe methods for reading and writing Rust values to and from userspace pointers. The C methods for copying to/from userspace use a function called `check_object_size` to verify that the kernel pointer is not dangling. However, this check is skipped when the length is a compile-time constant, with the assumption that such cases trivially have a correct kernel pointer. In this patch, we apply the same optimization to the typed accessors. For both methods, the size of the operation is known at compile time to be size_of of the type being read or written. Since the C side doesn't provide a variant that skips only this check, we create custom helpers for this purpose. The majority of reads and writes to userspace pointers in the Rust Binder driver uses these accessor methods. Benchmarking has found that skipping the `check_object_size` check makes a big difference for the cases being skipped here. (And that the check doesn't make a difference for the cases that use the raw read/write methods.) This code is based on something that was originally written by Wedson on the old rust branch. It was modified by Alice to skip the `check_object_size` check, and to update various comments, including the notes about kernel pointers in `WritableToBytes`. Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Reviewed-by: Benno Lossin Reviewed-by: Boqun Feng Reviewed-by: Trevor Gross Reviewed-by: Gary Guo Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20240528-alice-mm-v7-3-78222c31b8f4@google.com [ Wrapped docs to 100 and added a few intra-doc links. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/kernel/types.rs | 64 ++++++++++++++++++++++++++++++++++++++++ rust/kernel/uaccess.rs | 79 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 141 insertions(+), 2 deletions(-) diff --git a/rust/kernel/types.rs b/rust/kernel/types.rs index 2e7c9008621f..bd189d646adb 100644 --- a/rust/kernel/types.rs +++ b/rust/kernel/types.rs @@ -409,3 +409,67 @@ pub enum Either { /// Constructs an instance of [`Either`] containing a value of type `R`. Right(R), } + +/// Types for which any bit pattern is valid. +/// +/// Not all types are valid for all values. For example, a `bool` must be either zero or one, so +/// reading arbitrary bytes into something that contains a `bool` is not okay. 
+/// +/// It's okay for the type to have padding, as initializing those bytes has no effect. +/// +/// # Safety +/// +/// All bit-patterns must be valid for this type. This type must not have interior mutability. +pub unsafe trait FromBytes {} + +// SAFETY: All bit patterns are acceptable values of the types below. +unsafe impl FromBytes for u8 {} +unsafe impl FromBytes for u16 {} +unsafe impl FromBytes for u32 {} +unsafe impl FromBytes for u64 {} +unsafe impl FromBytes for usize {} +unsafe impl FromBytes for i8 {} +unsafe impl FromBytes for i16 {} +unsafe impl FromBytes for i32 {} +unsafe impl FromBytes for i64 {} +unsafe impl FromBytes for isize {} +// SAFETY: If all bit patterns are acceptable for individual values in an array, then all bit +// patterns are also acceptable for arrays of that type. +unsafe impl FromBytes for [T] {} +unsafe impl FromBytes for [T; N] {} + +/// Types that can be viewed as an immutable slice of initialized bytes. +/// +/// If a struct implements this trait, then it is okay to copy it byte-for-byte to userspace. This +/// means that it should not have any padding, as padding bytes are uninitialized. Reading +/// uninitialized memory is not just undefined behavior, it may even lead to leaking sensitive +/// information on the stack to userspace. +/// +/// The struct should also not hold kernel pointers, as kernel pointer addresses are also considered +/// sensitive. However, leaking kernel pointers is not considered undefined behavior by Rust, so +/// this is a correctness requirement, but not a safety requirement. +/// +/// # Safety +/// +/// Values of this type may not contain any uninitialized bytes. This type must not have interior +/// mutability. +pub unsafe trait AsBytes {} + +// SAFETY: Instances of the following types have no uninitialized portions. +unsafe impl AsBytes for u8 {} +unsafe impl AsBytes for u16 {} +unsafe impl AsBytes for u32 {} +unsafe impl AsBytes for u64 {} +unsafe impl AsBytes for usize {} +unsafe impl AsBytes for i8 {} +unsafe impl AsBytes for i16 {} +unsafe impl AsBytes for i32 {} +unsafe impl AsBytes for i64 {} +unsafe impl AsBytes for isize {} +unsafe impl AsBytes for bool {} +unsafe impl AsBytes for char {} +unsafe impl AsBytes for str {} +// SAFETY: If individual values in an array have no uninitialized portions, then the array itself +// does not have any uninitialized portions either. +unsafe impl AsBytes for [T] {} +unsafe impl AsBytes for [T; N] {} diff --git a/rust/kernel/uaccess.rs b/rust/kernel/uaccess.rs index 55f0d7ecfa3f..e9347cff99ab 100644 --- a/rust/kernel/uaccess.rs +++ b/rust/kernel/uaccess.rs @@ -4,10 +4,16 @@ //! //! C header: [`include/linux/uaccess.h`](srctree/include/linux/uaccess.h) -use crate::{alloc::Flags, bindings, error::Result, prelude::*}; +use crate::{ + alloc::Flags, + bindings, + error::Result, + prelude::*, + types::{AsBytes, FromBytes}, +}; use alloc::vec::Vec; use core::ffi::{c_ulong, c_void}; -use core::mem::MaybeUninit; +use core::mem::{size_of, MaybeUninit}; /// The type used for userspace addresses. pub type UserPtr = usize; @@ -247,6 +253,41 @@ impl UserSliceReader { self.read_raw(out) } + /// Reads a value of the specified type. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of + /// bounds of this [`UserSliceReader`]. 
+ pub fn read(&mut self) -> Result { + let len = size_of::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + let mut out: MaybeUninit = MaybeUninit::uninit(); + // SAFETY: The local variable `out` is valid for writing `size_of::()` bytes. + // + // By using the _copy_from_user variant, we skip the check_object_size check that verifies + // the kernel pointer. This mirrors the logic on the C side that skips the check when the + // length is a compile-time constant. + let res = unsafe { + bindings::_copy_from_user( + out.as_mut_ptr().cast::(), + self.ptr as *const c_void, + len_ulong, + ) + }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + // SAFETY: The read above has initialized all bytes in `out`, and since `T` implements + // `FromBytes`, any bit-pattern is a valid value for this type. + Ok(unsafe { out.assume_init() }) + } + /// Reads the entirety of the user slice, appending it to the end of the provided buffer. /// /// Fails with [`EFAULT`] if the read happens on a bad address. @@ -310,4 +351,38 @@ impl UserSliceWriter { self.length -= len; Ok(()) } + + /// Writes the provided Rust value to this userspace pointer. + /// + /// Fails with [`EFAULT`] if the write happens on a bad address, or if the write goes out of + /// bounds of this [`UserSliceWriter`]. This call may modify the associated userspace slice even + /// if it returns an error. + pub fn write(&mut self, value: &T) -> Result { + let len = size_of::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + // SAFETY: The reference points to a value of type `T`, so it is valid for reading + // `size_of::()` bytes. + // + // By using the _copy_to_user variant, we skip the check_object_size check that verifies the + // kernel pointer. This mirrors the logic on the C side that skips the check when the length + // is a compile-time constant. + let res = unsafe { + bindings::_copy_to_user( + self.ptr as *mut c_void, + (value as *const T).cast::(), + len_ulong, + ) + }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + Ok(()) + } } -- cgit From fc6e66f4696b63b8a2645a2bcea407cb04bd0666 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 28 May 2024 14:58:05 +0000 Subject: rust: add abstraction for `struct page` Adds a new struct called `Page` that wraps a pointer to `struct page`. This struct is assumed to hold ownership over the page, so that Rust code can allocate and manage pages directly. The page type has various methods for reading and writing into the page. These methods will temporarily map the page to allow the operation. All of these methods use a helper that takes an offset and length, performs bounds checks, and returns a pointer to the given offset in the page. This patch only adds support for pages of order zero, as that is all Rust Binder needs. However, it is written to make it easy to add support for higher-order pages in the future. To do that, you would add a const generic parameter to `Page` that specifies the order. Most of the methods do not need to be adjusted, as the logic for dealing with mapping multiple pages at once can be isolated to just the `with_pointer_into_page` method. Rust Binder needs to manage pages directly as that is how transactions are delivered: Each process has an mmap'd region for incoming transactions. 
When an incoming transaction arrives, the Binder driver will choose a region in the mmap, allocate and map the relevant pages manually, and copy the incoming transaction directly into the page. This architecture allows the driver to copy transactions directly from the address space of one process to another, without an intermediate copy to a kernel buffer. This code is based on Wedson's page abstractions from the old rust branch, but it has been modified by Alice by removing the incomplete support for higher-order pages, by introducing the `with_*` helpers to consolidate the bounds checking logic into a single place, and various other changes. Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Reviewed-by: Andreas Hindborg Reviewed-by: Trevor Gross Reviewed-by: Benno Lossin Reviewed-by: Boqun Feng Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20240528-alice-mm-v7-4-78222c31b8f4@google.com [ Fixed typos and added a few intra-doc links. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/bindings/bindings_helper.h | 1 + rust/helpers.c | 20 ++++ rust/kernel/alloc.rs | 7 ++ rust/kernel/lib.rs | 1 + rust/kernel/page.rs | 250 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 279 insertions(+) create mode 100644 rust/kernel/page.rs diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 52a1412338ef..f2bafb10f181 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -20,6 +20,7 @@ /* `bindgen` gets confused at certain things. */ const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN; +const size_t RUST_CONST_HELPER_PAGE_SIZE = PAGE_SIZE; const gfp_t RUST_CONST_HELPER_GFP_ATOMIC = GFP_ATOMIC; const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL; const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT; diff --git a/rust/helpers.c b/rust/helpers.c index d6abe4bd45d9..305f0577fae9 100644 --- a/rust/helpers.c +++ b/rust/helpers.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -94,6 +96,24 @@ int rust_helper_signal_pending(struct task_struct *t) } EXPORT_SYMBOL_GPL(rust_helper_signal_pending); +struct page *rust_helper_alloc_pages(gfp_t gfp_mask, unsigned int order) +{ + return alloc_pages(gfp_mask, order); +} +EXPORT_SYMBOL_GPL(rust_helper_alloc_pages); + +void *rust_helper_kmap_local_page(struct page *page) +{ + return kmap_local_page(page); +} +EXPORT_SYMBOL_GPL(rust_helper_kmap_local_page); + +void rust_helper_kunmap_local(const void *addr) +{ + kunmap_local(addr); +} +EXPORT_SYMBOL_GPL(rust_helper_kunmap_local); + refcount_t rust_helper_REFCOUNT_INIT(int n) { return (refcount_t)REFCOUNT_INIT(n); diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs index 396fe5a85a8f..1966bd407017 100644 --- a/rust/kernel/alloc.rs +++ b/rust/kernel/alloc.rs @@ -20,6 +20,13 @@ pub struct AllocError; #[derive(Clone, Copy)] pub struct Flags(u32); +impl Flags { + /// Get the raw representation of this flag. 
+ pub(crate) fn as_raw(self) -> u32 { + self.0 + } +} + impl core::ops::BitOr for Flags { type Output = Self; fn bitor(self, rhs: Self) -> Self::Output { diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 767026db068e..5d310e79485f 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -35,6 +35,7 @@ pub mod ioctl; pub mod kunit; #[cfg(CONFIG_NET)] pub mod net; +pub mod page; pub mod prelude; pub mod print; mod static_assert; diff --git a/rust/kernel/page.rs b/rust/kernel/page.rs new file mode 100644 index 000000000000..208a006d587c --- /dev/null +++ b/rust/kernel/page.rs @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Kernel page allocation and management. + +use crate::{ + alloc::{AllocError, Flags}, + bindings, + error::code::*, + error::Result, + uaccess::UserSliceReader, +}; +use core::ptr::{self, NonNull}; + +/// A bitwise shift for the page size. +pub const PAGE_SHIFT: usize = bindings::PAGE_SHIFT as usize; + +/// The number of bytes in a page. +pub const PAGE_SIZE: usize = bindings::PAGE_SIZE; + +/// A bitmask that gives the page containing a given address. +pub const PAGE_MASK: usize = !(PAGE_SIZE - 1); + +/// A pointer to a page that owns the page allocation. +/// +/// # Invariants +/// +/// The pointer is valid, and has ownership over the page. +pub struct Page { + page: NonNull, +} + +// SAFETY: Pages have no logic that relies on them staying on a given thread, so moving them across +// threads is safe. +unsafe impl Send for Page {} + +// SAFETY: Pages have no logic that relies on them not being accessed concurrently, so accessing +// them concurrently is safe. +unsafe impl Sync for Page {} + +impl Page { + /// Allocates a new page. + /// + /// # Examples + /// + /// Allocate memory for a page. + /// + /// ``` + /// use kernel::page::Page; + /// + /// # fn dox() -> Result<(), kernel::alloc::AllocError> { + /// let page = Page::alloc_page(GFP_KERNEL)?; + /// # Ok(()) } + /// ``` + /// + /// Allocate memory for a page and zero its contents. + /// + /// ``` + /// use kernel::page::Page; + /// + /// # fn dox() -> Result<(), kernel::alloc::AllocError> { + /// let page = Page::alloc_page(GFP_KERNEL | __GFP_ZERO)?; + /// # Ok(()) } + /// ``` + pub fn alloc_page(flags: Flags) -> Result { + // SAFETY: Depending on the value of `gfp_flags`, this call may sleep. Other than that, it + // is always safe to call this method. + let page = unsafe { bindings::alloc_pages(flags.as_raw(), 0) }; + let page = NonNull::new(page).ok_or(AllocError)?; + // INVARIANT: We just successfully allocated a page, so we now have ownership of the newly + // allocated page. We transfer that ownership to the new `Page` object. + Ok(Self { page }) + } + + /// Returns a raw pointer to the page. + pub fn as_ptr(&self) -> *mut bindings::page { + self.page.as_ptr() + } + + /// Runs a piece of code with this page mapped to an address. + /// + /// The page is unmapped when this call returns. + /// + /// # Using the raw pointer + /// + /// It is up to the caller to use the provided raw pointer correctly. The pointer is valid for + /// `PAGE_SIZE` bytes and for the duration in which the closure is called. The pointer might + /// only be mapped on the current thread, and when that is the case, dereferencing it on other + /// threads is UB. Other than that, the usual rules for dereferencing a raw pointer apply: don't + /// cause data races, the memory may be uninitialized, and so on. 
+ /// + /// If multiple threads map the same page at the same time, then they may reference with + /// different addresses. However, even if the addresses are different, the underlying memory is + /// still the same for these purposes (e.g., it's still a data race if they both write to the + /// same underlying byte at the same time). + fn with_page_mapped(&self, f: impl FnOnce(*mut u8) -> T) -> T { + // SAFETY: `page` is valid due to the type invariants on `Page`. + let mapped_addr = unsafe { bindings::kmap_local_page(self.as_ptr()) }; + + let res = f(mapped_addr.cast()); + + // This unmaps the page mapped above. + // + // SAFETY: Since this API takes the user code as a closure, it can only be used in a manner + // where the pages are unmapped in reverse order. This is as required by `kunmap_local`. + // + // In other words, if this call to `kunmap_local` happens when a different page should be + // unmapped first, then there must necessarily be a call to `kmap_local_page` other than the + // call just above in `with_page_mapped` that made that possible. In this case, it is the + // unsafe block that wraps that other call that is incorrect. + unsafe { bindings::kunmap_local(mapped_addr) }; + + res + } + + /// Runs a piece of code with a raw pointer to a slice of this page, with bounds checking. + /// + /// If `f` is called, then it will be called with a pointer that points at `off` bytes into the + /// page, and the pointer will be valid for at least `len` bytes. The pointer is only valid on + /// this task, as this method uses a local mapping. + /// + /// If `off` and `len` refers to a region outside of this page, then this method returns + /// [`EINVAL`] and does not call `f`. + /// + /// # Using the raw pointer + /// + /// It is up to the caller to use the provided raw pointer correctly. The pointer is valid for + /// `len` bytes and for the duration in which the closure is called. The pointer might only be + /// mapped on the current thread, and when that is the case, dereferencing it on other threads + /// is UB. Other than that, the usual rules for dereferencing a raw pointer apply: don't cause + /// data races, the memory may be uninitialized, and so on. + /// + /// If multiple threads map the same page at the same time, then they may reference with + /// different addresses. However, even if the addresses are different, the underlying memory is + /// still the same for these purposes (e.g., it's still a data race if they both write to the + /// same underlying byte at the same time). + fn with_pointer_into_page( + &self, + off: usize, + len: usize, + f: impl FnOnce(*mut u8) -> Result, + ) -> Result { + let bounds_ok = off <= PAGE_SIZE && len <= PAGE_SIZE && (off + len) <= PAGE_SIZE; + + if bounds_ok { + self.with_page_mapped(move |page_addr| { + // SAFETY: The `off` integer is at most `PAGE_SIZE`, so this pointer offset will + // result in a pointer that is in bounds or one off the end of the page. + f(unsafe { page_addr.add(off) }) + }) + } else { + Err(EINVAL) + } + } + + /// Maps the page and reads from it into the given buffer. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// # Safety + /// + /// * Callers must ensure that `dst` is valid for writing `len` bytes. + /// * Callers must ensure that this call does not race with a write to the same page that + /// overlaps with this read. 
+ pub unsafe fn read_raw(&self, dst: *mut u8, offset: usize, len: usize) -> Result { + self.with_pointer_into_page(offset, len, move |src| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then + // it has performed a bounds check and guarantees that `src` is + // valid for `len` bytes. + // + // There caller guarantees that there is no data race. + unsafe { ptr::copy_nonoverlapping(src, dst, len) }; + Ok(()) + }) + } + + /// Maps the page and writes into it from the given buffer. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// # Safety + /// + /// * Callers must ensure that `src` is valid for reading `len` bytes. + /// * Callers must ensure that this call does not race with a read or write to the same page + /// that overlaps with this write. + pub unsafe fn write_raw(&self, src: *const u8, offset: usize, len: usize) -> Result { + self.with_pointer_into_page(offset, len, move |dst| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a + // bounds check and guarantees that `dst` is valid for `len` bytes. + // + // There caller guarantees that there is no data race. + unsafe { ptr::copy_nonoverlapping(src, dst, len) }; + Ok(()) + }) + } + + /// Maps the page and zeroes the given slice. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// # Safety + /// + /// Callers must ensure that this call does not race with a read or write to the same page that + /// overlaps with this write. + pub unsafe fn fill_zero_raw(&self, offset: usize, len: usize) -> Result { + self.with_pointer_into_page(offset, len, move |dst| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a + // bounds check and guarantees that `dst` is valid for `len` bytes. + // + // There caller guarantees that there is no data race. + unsafe { ptr::write_bytes(dst, 0u8, len) }; + Ok(()) + }) + } + + /// Copies data from userspace into this page. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// Like the other `UserSliceReader` methods, data races are allowed on the userspace address. + /// However, they are not allowed on the page you are copying into. + /// + /// # Safety + /// + /// Callers must ensure that this call does not race with a read or write to the same page that + /// overlaps with this write. + pub unsafe fn copy_from_user_slice_raw( + &self, + reader: &mut UserSliceReader, + offset: usize, + len: usize, + ) -> Result { + self.with_pointer_into_page(offset, len, move |dst| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a + // bounds check and guarantees that `dst` is valid for `len` bytes. Furthermore, we have + // exclusive access to the slice since the caller guarantees that there are no races. + reader.read_raw(unsafe { core::slice::from_raw_parts_mut(dst.cast(), len) }) + }) + } +} + +impl Drop for Page { + fn drop(&mut self) { + // SAFETY: By the type invariants, we have ownership of the page and can free it. 
+ unsafe { bindings::__free_pages(self.page.as_ptr(), 0) }; + } +} -- cgit From 86588139b84386a47108b024d703c3c24ddbbec8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 3 Jul 2024 18:01:35 -0700 Subject: Documentation: CXL Maturity Map Provide a survey of the work-in-progress maturity (implementation status) of various aspects of the CXL subsystem. Clarify that in addition to ongoing upkeep relative to specification updates, there are some long running themes in the driver that respond to the discovery of new corner cases (bugs) and new use cases (feature extensions). The primary audience is distribution maintainers, but it also serves as a guide for kernel developers to understand what aspects of the CXL subsystem need more help. It is a landing page to document ongoing progress, and a guide to discern exposure to work-in-progress features. Reviewed-by: Adam Manzanares Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Link: https://patch.msgid.link/172005486862.2048248.6668794717827294862.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang --- Documentation/driver-api/cxl/index.rst | 2 + Documentation/driver-api/cxl/maturity-map.rst | 202 ++++++++++++++++++++++++++ MAINTAINERS | 1 + 3 files changed, 205 insertions(+) create mode 100644 Documentation/driver-api/cxl/maturity-map.rst diff --git a/Documentation/driver-api/cxl/index.rst b/Documentation/driver-api/cxl/index.rst index 036e49553542..12b82725d322 100644 --- a/Documentation/driver-api/cxl/index.rst +++ b/Documentation/driver-api/cxl/index.rst @@ -9,4 +9,6 @@ Compute Express Link memory-devices + maturity-map + .. only:: subproject and html diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst new file mode 100644 index 000000000000..df8e2ac2a320 --- /dev/null +++ b/Documentation/driver-api/cxl/maturity-map.rst @@ -0,0 +1,202 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +=========================================== +Compute Express Link Subsystem Maturity Map +=========================================== + +The Linux CXL subsystem tracks the dynamic `CXL specification +`_ that +continues to respond to new use cases with new features, capability +updates and fixes. At any given point some aspects of the subsystem are +more mature than others. While the periodic pull requests summarize the +`work being incorporated each merge window +`_, +those do not always convey progress relative to a starting point and a +future end goal. + +What follows is a coarse breakdown of the subsystem's major +responsibilities along with a maturity score. The expectation is that +the change-history of this document provides an overview summary of the +subsystem maturation over time. + +The maturity scores are: + +- [3] Mature: Work in this area is complete and no changes on the horizon. + Note that this score can regress from one kernel release to the next + based on new test results or end user reports. + +- [2] Stabilizing: Major functionality operational, common cases are + mature, but known corner cases are still a work in progress. + +- [1] Initial: Capability that has exited the Proof of Concept phase, but + may still have significant gaps to close and fixes to apply as real + world testing occurs. + +- [0] Known gap: Feature is on a medium to long term horizon to + implement. 
If the specification has a feature that does not even have + a '0' score in this document, there is a good chance that no one in + the linux-cxl@vger.kernel.org community has started to look at it. + +- X: Out of scope for kernel enabling, or kernel enabling not required + +Feature and Capabilities +======================== + +Enumeration / Provisioning +-------------------------- +All of the fundamental enumeration an object model of the subsystem is +in place, but there are several corner cases that are pending closure. + + +* [2] CXL Window Enumeration + + * [0] :ref:`Extended-linear memory-side cache ` + * [0] Low Memory-hole + * [0] Hetero-interleave + +* [2] Switch Enumeration + + * [0] CXL register enumeration link-up dependency + +* [2] HDM Decoder Configuration + + * [0] Decoder target and granularity constraints + +* [2] Performance enumeration + + * [3] Endpoint CDAT + * [3] Switch CDAT + * [1] CDAT to Core-mm integration + + * [1] x86 + * [0] Arm64 + * [0] All other arch. + + * [0] Shared link + +* [2] Hotplug + (see CXL Window Enumeration) + + * [0] Handle Soft Reserved conflicts + +* [0] :ref:`RCH link status ` +* [0] Fabrics / G-FAM (chapter 7) +* [0] Global Access Endpoint + + +RAS +--- +In many ways CXL can be seen as a standardization of what would normally +be handled by custom EDAC drivers. The open development here is +mainly caused by the enumeration corner cases above. + +* [3] Component events (OS) +* [2] Component events (FFM) +* [1] Endpoint protocol errors (OS) +* [1] Endpoint protocol errors (FFM) +* [0] Switch protocol errors (OS) +* [1] Switch protocol errors (FFM) +* [2] DPA->HPA Address translation + + * [1] XOR Interleave translation + (see CXL Window Enumeration) + +* [1] Memory Failure coordination +* [0] Scrub control +* [2] ACPI error injection EINJ + + * [0] EINJ v2 + * [X] Compliance DOE + +* [2] Native error injection +* [3] RCH error handling +* [1] VH error handling +* [0] PPR +* [0] Sparing +* [0] Device built in test + + +Mailbox commands +---------------- + +* [3] Firmware update +* [3] Health / Alerts +* [1] :ref:`Background commands ` +* [3] Sanitization +* [3] Security commands +* [3] RAW Command Debug Passthrough +* [0] CEL-only-validation Passthrough +* [0] Switch CCI +* [3] Timestamp +* [1] PMEM labels +* [0] PMEM GPF / Dirty Shutdown +* [0] Scan Media + +PMU +--- +* [1] Type 3 PMU +* [0] Switch USP/ DSP, Root Port + +Security +-------- + +* [X] CXL Trusted Execution Environment Security Protocol (TSP) +* [X] CXL IDE (subsumed by TSP) + +Memory-pooling +-------------- + +* [1] Hotplug of LDs (via PCI hotplug) +* [0] Dynamic Capacity Device (DCD) Support + +Multi-host sharing +------------------ + +* [0] Hardware coherent shared memory +* [0] Software managed coherency shared memory + +Multi-host memory +----------------- + +* [0] Dynamic Capacity Device Support +* [0] Sharing + +Accelerator +----------- + +* [0] Accelerator memory enumeration HDM-D (CXL 1.1/2.0 Type-2) +* [0] Accelerator memory enumeration HDM-DB (CXL 3.0 Type-2) +* [0] CXL.cache 68b (CXL 2.0) +* [0] CXL.cache 256b Cache IDs (CXL 3.0) + +User Flow Support +----------------- + +* [0] HPA->DPA Address translation (need xormaps export solution) + +Details +======= + +.. _extended-linear: + +* **Extended-linear memory-side cache**: An HMAT proposal to enumerate the presence of a + memory-side cache where the cache capacity extends the SRAT address + range capacity. `See the ECN + `_ + for more details: + +.. 
_rch-link-status: + +* **RCH Link Status**: RCH (Restricted CXL Host) topologies, end up + hiding some standard registers like PCIe Link Status / Capabilities in + the CXL RCRB (Root Complex Register Block). + +.. _background-commands: + +* **Background commands**: The CXL background command mechanism is + awkward as the single slot is monopolized potentially indefinitely by + various commands. A `cancel on conflict + `_ + facility is needed to make sure the kernel can ensure forward progress + of priority commands. diff --git a/MAINTAINERS b/MAINTAINERS index 3c4fdf74a3f9..ce047ab7e977 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5485,6 +5485,7 @@ M: Ira Weiny M: Dan Williams L: linux-cxl@vger.kernel.org S: Maintained +F: Documentation/driver-api/cxl F: drivers/cxl/ F: include/linux/einj-cxl.h F: include/linux/cxl-event.h -- cgit From 67bab430f4e71332aced7562a67d73444f5b4b77 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 24 May 2024 14:31:52 +0200 Subject: tools/power turbostat: Group SMI counter with APERF and MPERF These three counters now are treated similar to other perf counters groups. This simplifies and gets rid of a lot of special cases for APERF and MPERF. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 635 +++++++++++++++------------------- 1 file changed, 280 insertions(+), 355 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 29751b41ea0d..495235055fa2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -85,7 +85,6 @@ enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; -enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR }; struct sysfs_path { @@ -275,7 +274,6 @@ char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; -struct amperf_group_fd *fd_amperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. 
*/ struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; @@ -290,6 +288,7 @@ unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; unsigned int has_aperf; +unsigned int has_aperf_access; unsigned int has_epb; unsigned int has_turbo; unsigned int is_hybrid; @@ -330,7 +329,6 @@ unsigned int first_counter_read = 1; int ignore_stdin; bool no_msr; bool no_perf; -enum amperf_source amperf_source; enum gfx_sysfs_idx { GFX_rc6, @@ -1381,6 +1379,67 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { }, }; +/* Indexes used to map data read from perf and MSRs into global variables */ +enum msr_rci_index { + MSR_RCI_INDEX_APERF = 0, + MSR_RCI_INDEX_MPERF = 1, + MSR_RCI_INDEX_SMI = 2, + NUM_MSR_COUNTERS, +}; + +struct msr_counter_info_t { + unsigned long long data[NUM_MSR_COUNTERS]; + enum counter_source source[NUM_MSR_COUNTERS]; + unsigned long long msr[NUM_MSR_COUNTERS]; + unsigned long long msr_mask[NUM_MSR_COUNTERS]; + int fd_perf; +}; + +struct msr_counter_info_t *msr_counter_info; +unsigned int msr_counter_info_size; + +struct msr_counter_arch_info { + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned long long msr_mask; + unsigned int rci_index; /* Maps data from perf counters to global variables */ + bool needed; + bool present; +}; + +enum msr_arch_info_index { + MSR_ARCH_INFO_APERF_INDEX = 0, + MSR_ARCH_INFO_MPERF_INDEX = 1, + MSR_ARCH_INFO_SMI_INDEX = 2, +}; + +static struct msr_counter_arch_info msr_counter_arch_infos[] = { + [MSR_ARCH_INFO_APERF_INDEX] = { + .perf_subsys = "msr", + .perf_name = "aperf", + .msr = MSR_IA32_APERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_APERF, + }, + + [MSR_ARCH_INFO_MPERF_INDEX] = { + .perf_subsys = "msr", + .perf_name = "mperf", + .msr = MSR_IA32_MPERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_MPERF, + }, + + [MSR_ARCH_INFO_SMI_INDEX] = { + .perf_subsys = "msr", + .perf_name = "smi", + .msr = MSR_SMI_COUNT, + .msr_mask = 0xFFFFFFFF, + .rci_index = MSR_RCI_INDEX_SMI, + }, +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1767,7 +1826,7 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { - const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp | + const unsigned long bic_msrs = BIC_Mod_c6 | BIC_CoreTmp | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp; bic_enabled &= ~bic_msrs; @@ -3402,30 +3461,6 @@ static unsigned int read_perf_counter_info_n(const char *const path, const char return v; } -static unsigned int read_msr_type(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/type"; - const char *const format = "%u"; - - return read_perf_counter_info_n(path, format); -} - -static unsigned int read_aperf_config(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; - const char *const format = "event=%x"; - - return read_perf_counter_info_n(path, format); -} - -static unsigned int read_mperf_config(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; - const char *const format = "event=%x"; - - return read_perf_counter_info_n(path, format); -} - static unsigned int read_perf_type(const char *subsys) { const char *const path_format = "/sys/bus/event_source/devices/%s/type"; @@ -3479,124 +3514,6 @@ static double read_perf_rapl_scale(const char *subsys, const char *event_name) return scale; } -static struct amperf_group_fd open_amperf_fd(int cpu) -{ - 
const unsigned int msr_type = read_msr_type(); - const unsigned int aperf_config = read_aperf_config(); - const unsigned int mperf_config = read_mperf_config(); - struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 }; - - fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); - fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); - - return fds; -} - -static int get_amperf_fd(int cpu) -{ - assert(fd_amperf_percpu); - - if (fd_amperf_percpu[cpu].aperf) - return fd_amperf_percpu[cpu].aperf; - - fd_amperf_percpu[cpu] = open_amperf_fd(cpu); - - return fd_amperf_percpu[cpu].aperf; -} - -/* Read APERF, MPERF and TSC using the perf API. */ -static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) -{ - union { - struct { - unsigned long nr_entries; - unsigned long aperf; - unsigned long mperf; - }; - - unsigned long as_array[3]; - } cnt; - - const int fd_amperf = get_amperf_fd(cpu); - - /* - * Read the TSC with rdtsc, because we want the absolute value and not - * the offset from the start of the counter. - */ - t->tsc = rdtsc(); - - const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); - - if (n != sizeof(cnt.as_array)) - return -2; - - t->aperf = cnt.aperf * aperf_mperf_multiplier; - t->mperf = cnt.mperf * aperf_mperf_multiplier; - - return 0; -} - -/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */ -static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu) -{ - unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; - int aperf_mperf_retry_count = 0; - - /* - * The TSC, APERF and MPERF must be read together for - * APERF/MPERF and MPERF/TSC to give accurate results. - * - * Unfortunately, APERF and MPERF are read by - * individual system call, so delays may occur - * between them. If the time to read them - * varies by a large amount, we re-read them. - */ - - /* - * This initial dummy APERF read has been seen to - * reduce jitter in the subsequent reads. - */ - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - -retry: - t->tsc = rdtsc(); /* re-read close to APERF */ - - tsc_before = t->tsc; - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - tsc_between = rdtsc(); - - if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) - return -4; - - tsc_after = rdtsc(); - - aperf_time = tsc_between - tsc_before; - mperf_time = tsc_after - tsc_between; - - /* - * If the system call latency to read APERF and MPERF - * differ by more than 2x, then try again. 
- */ - if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { - aperf_mperf_retry_count++; - if (aperf_mperf_retry_count < 5) - goto retry; - else - warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); - } - aperf_mperf_retry_count = 0; - - t->aperf = t->aperf * aperf_mperf_multiplier; - t->mperf = t->mperf * aperf_mperf_multiplier; - - return 0; -} - size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) { size_t ret = 0; @@ -3853,6 +3770,84 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat return 0; } +size_t msr_counter_info_count_perf(const struct msr_counter_info_t *mci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_MSR_COUNTERS; ++i) + if (mci->source[i] == COUNTER_SOURCE_PERF) + ++ret; + + return ret; +} + +int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) +{ + unsigned long long perf_data[NUM_MSR_COUNTERS + 1]; + + struct msr_counter_info_t *mci; + + if (debug) + fprintf(stderr, "%s: cpu%d\n", __func__, cpu); + + assert(msr_counter_info); + assert(cpu <= msr_counter_info_size); + + mci = &msr_counter_info[cpu]; + + ZERO_ARRAY(perf_data); + ZERO_ARRAY(mci->data); + + if (mci->fd_perf != -1) { + const size_t num_perf_counters = msr_counter_info_count_perf(mci); + const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); + const ssize_t actual_read_size = read(mci->fd_perf, &perf_data[0], sizeof(perf_data)); + + if (actual_read_size != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, + actual_read_size); + } + + for (unsigned int i = 0, pi = 1; i < NUM_MSR_COUNTERS; ++i) { + switch (mci->source[i]) { + case COUNTER_SOURCE_NONE: + break; + + case COUNTER_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(mci->fd_perf != -1); + + if (debug) + fprintf(stderr, "Reading msr counter via perf at %u: %llu\n", i, perf_data[pi]); + + mci->data[i] = perf_data[pi]; + + ++pi; + break; + + case COUNTER_SOURCE_MSR: + assert(!no_msr); + + if (get_msr(cpu, mci->msr[i], &mci->data[i])) + return -2 - i; + + mci->data[i] &= mci->msr_mask[i]; + + if (debug) + fprintf(stderr, "Reading msr counter via msr at %u: %llu\n", i, mci->data[i]); + + break; + } + } + + BUILD_BUG_ON(NUM_MSR_COUNTERS != 3); + t->aperf = mci->data[MSR_RCI_INDEX_APERF]; + t->mperf = mci->data[MSR_RCI_INDEX_MPERF]; + t->smi_count = mci->data[MSR_RCI_INDEX_SMI]; + + return 0; +} + /* * get_counters(...) 
* migrate to cpu @@ -3878,24 +3873,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) t->tsc = rdtsc(); /* we are running on local CPU of interest */ - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) - || soft_c1_residency_display(BIC_Avg_MHz)) { - int status = -1; - - assert(!no_perf || !no_msr); - - switch (amperf_source) { - case AMPERF_SOURCE_PERF: - status = read_aperf_mperf_tsc_perf(t, cpu); - break; - case AMPERF_SOURCE_MSR: - status = read_aperf_mperf_tsc_msr(t, cpu); - break; - } - - if (status != 0) - return status; - } + get_smi_aperf_mperf(cpu, t); if (DO_BIC(BIC_IPC)) if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long)) @@ -3903,11 +3881,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_IRQ)) t->irq_count = irqs_per_cpu[cpu]; - if (DO_BIC(BIC_SMI)) { - if (get_msr(cpu, MSR_SMI_COUNT, &msr)) - return -5; - t->smi_count = msr & 0xFFFFFFFF; - } get_cstate_counters(cpu, t, c, p); @@ -4489,25 +4462,6 @@ void free_fd_percpu(void) fd_percpu = NULL; } -void free_fd_amperf_percpu(void) -{ - int i; - - if (!fd_amperf_percpu) - return; - - for (i = 0; i < topo.max_cpu_num + 1; ++i) { - if (fd_amperf_percpu[i].mperf != 0) - close(fd_amperf_percpu[i].mperf); - - if (fd_amperf_percpu[i].aperf != 0) - close(fd_amperf_percpu[i].aperf); - } - - free(fd_amperf_percpu); - fd_amperf_percpu = NULL; -} - void free_fd_instr_count_percpu(void) { if (!fd_instr_count_percpu) @@ -4542,6 +4496,21 @@ void free_fd_cstate(void) ccstate_counter_info_size = 0; } +void free_fd_msr(void) +{ + if (!msr_counter_info) + return; + + for (int cpu = 0; cpu < topo.max_cpu_num; ++cpu) { + if (msr_counter_info[cpu].fd_perf != -1) + close(msr_counter_info[cpu].fd_perf); + } + + free(msr_counter_info); + msr_counter_info = NULL; + msr_counter_info_size = 0; +} + void free_fd_rapl_percpu(void) { if (!rapl_counter_info_perdomain) @@ -4601,7 +4570,7 @@ void free_all_buffers(void) free_fd_percpu(); free_fd_instr_count_percpu(); - free_fd_amperf_percpu(); + free_fd_msr(); free_fd_rapl_percpu(); free_fd_cstate(); @@ -4938,6 +4907,7 @@ static void update_effective_set(bool startup) } void linux_perf_init(void); +void msr_perf_init(void); void rapl_perf_init(void); void cstate_perf_init(void); @@ -4946,6 +4916,7 @@ void re_initialize(void) free_all_buffers(); setup_all_buffers(false); linux_perf_init(); + msr_perf_init(); rapl_perf_init(); cstate_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, @@ -6799,15 +6770,6 @@ static int has_instr_count_access(void) return has_access; } -bool is_aperf_access_required(void) -{ - return BIC_IS_ENABLED(BIC_Avg_MHz) - || BIC_IS_ENABLED(BIC_Busy) - || BIC_IS_ENABLED(BIC_Bzy_MHz) - || BIC_IS_ENABLED(BIC_IPC) - || BIC_IS_ENABLED(BIC_CPU_c1); -} - int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, double *scale_, enum rapl_unit *unit_) { @@ -6866,14 +6828,6 @@ void linux_perf_init(void) if (fd_instr_count_percpu == NULL) err(-1, "calloc fd_instr_count_percpu"); } - - const bool aperf_required = is_aperf_access_required(); - - if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { - fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); - if (fd_amperf_percpu == NULL) - err(-1, "calloc fd_amperf_percpu"); - } } void rapl_perf_init(void) @@ -6966,75 +6920,11 @@ void 
rapl_perf_init(void) free(domain_visited); } -static int has_amperf_access_via_msr(void) -{ - if (no_msr) - return 0; - - if (probe_msr(base_cpu, MSR_IA32_APERF)) - return 0; - - if (probe_msr(base_cpu, MSR_IA32_MPERF)) - return 0; - - return 1; -} - -static int has_amperf_access_via_perf(void) -{ - struct amperf_group_fd fds; - - /* - * Cache the last result, so we don't warn the user multiple times - * - * Negative means cached, no access - * Zero means not cached - * Positive means cached, has access - */ - static int has_access_cached; - - if (no_perf) - return 0; - - if (has_access_cached != 0) - return has_access_cached > 0; - - fds = open_amperf_fd(base_cpu); - has_access_cached = (fds.aperf != -1) && (fds.mperf != -1); - - if (fds.aperf == -1) - warnx("Failed to access %s. Some of the counters may not be available\n" - "\tRun as root to enable them or use %s to disable the access explicitly", - "APERF perf counter", "--no-perf"); - else - close(fds.aperf); - - if (fds.mperf == -1) - warnx("Failed to access %s. Some of the counters may not be available\n" - "\tRun as root to enable them or use %s to disable the access explicitly", - "MPERF perf counter", "--no-perf"); - else - close(fds.mperf); - - if (has_access_cached == 0) - has_access_cached = -1; - - return has_access_cached > 0; -} - -/* Check if we can access APERF and MPERF */ +/* Assumes msr_counter_info is populated */ static int has_amperf_access(void) { - if (!is_aperf_access_required()) - return 0; - - if (!no_msr && has_amperf_access_via_msr()) - return 1; - - if (!no_perf && has_amperf_access_via_perf()) - return 1; - - return 0; + return msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present && + msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present; } int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name) @@ -7083,6 +6973,114 @@ int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const st return ret; } +int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai) +{ + if (no_perf) + return -1; + + const unsigned int type = read_perf_type(cai->perf_subsys); + const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); + + const int fd_counter = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP); + + if (fd_counter == -1) + return -1; + + /* If it's the first counter opened, make it a group descriptor */ + if (cci->fd_perf == -1) + cci->fd_perf = fd_counter; + + return fd_counter; +} + +int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai) +{ + int ret = add_msr_perf_counter_(cpu, cci, cai); + + if (debug) + fprintf(stderr, "%s: %s/%s: %d (cpu: %d)\n", __func__, cai->perf_subsys, cai->perf_name, ret, cpu); + + return ret; +} + +void msr_perf_init_(void) +{ + const int mci_num = topo.max_cpu_num + 1; + + msr_counter_info = calloc(mci_num, sizeof(*msr_counter_info)); + if (!msr_counter_info) + err(1, "calloc msr_counter_info"); + msr_counter_info_size = mci_num; + + for (int cpu = 0; cpu < mci_num; ++cpu) + msr_counter_info[cpu].fd_perf = -1; + + for (int cidx = 0; cidx < NUM_MSR_COUNTERS; ++cidx) { + + struct msr_counter_arch_info *cai = &msr_counter_arch_infos[cidx]; + + cai->present = false; + + for (int cpu = 0; cpu < mci_num; ++cpu) { + + struct msr_counter_info_t *const cci = &msr_counter_info[cpu]; + + if (cpu_is_not_allowed(cpu)) + continue; + + if (cai->needed) { + /* Use perf API for this counter */ + if (!no_perf 
&& cai->perf_name && add_msr_perf_counter(cpu, cci, cai) != -1) { + cci->source[cai->rci_index] = COUNTER_SOURCE_PERF; + cai->present = true; + + /* User MSR for this counter */ + } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + cci->source[cai->rci_index] = COUNTER_SOURCE_MSR; + cci->msr[cai->rci_index] = cai->msr; + cci->msr_mask[cai->rci_index] = cai->msr_mask; + cai->present = true; + } + } + } + } +} + +/* Initialize data for reading perf counters from the MSR group. */ +void msr_perf_init(void) +{ + bool need_amperf = false, need_smi = false; + const bool need_soft_c1 = (!platform->has_msr_core_c1_res) && (platform->supported_cstates & CC1); + + need_amperf = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz) + || BIC_IS_ENABLED(BIC_IPC) || need_soft_c1; + + if (BIC_IS_ENABLED(BIC_SMI)) + need_smi = true; + + /* Enable needed counters */ + msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].needed = need_amperf; + msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].needed = need_amperf; + msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].needed = need_smi; + + msr_perf_init_(); + + const bool has_amperf = has_amperf_access(); + const bool has_smi = msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].present; + + has_aperf_access = has_amperf; + + if (has_amperf) { + BIC_PRESENT(BIC_Avg_MHz); + BIC_PRESENT(BIC_Busy); + BIC_PRESENT(BIC_Bzy_MHz); + BIC_PRESENT(BIC_SMI); + } + + if (has_smi) + BIC_PRESENT(BIC_SMI); +} + void cstate_perf_init_(bool soft_c1) { bool has_counter; @@ -7340,12 +7338,6 @@ void process_cpuid() __cpuid(0x6, eax, ebx, ecx, edx); has_aperf = ecx & (1 << 0); - if (has_aperf && has_amperf_access()) { - BIC_PRESENT(BIC_Avg_MHz); - BIC_PRESENT(BIC_Busy); - BIC_PRESENT(BIC_Bzy_MHz); - BIC_PRESENT(BIC_IPC); - } do_dts = eax & (1 << 0); if (do_dts) BIC_PRESENT(BIC_CoreTmp); @@ -7462,6 +7454,11 @@ static void counter_info_init(void) if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY) cai->msr = MSR_ATOM_PKG_C6_RESIDENCY; } + + for (int i = 0; i < NUM_MSR_COUNTERS; ++i) { + msr_counter_arch_infos[i].present = false; + msr_counter_arch_infos[i].needed = false; + } } void probe_pm_features(void) @@ -7837,21 +7834,6 @@ void set_base_cpu(void) err(-ENODEV, "No valid cpus found"); } -static void set_amperf_source(void) -{ - amperf_source = AMPERF_SOURCE_PERF; - - const bool aperf_required = is_aperf_access_required(); - - if (no_perf || !aperf_required || !has_amperf_access_via_perf()) - amperf_source = AMPERF_SOURCE_MSR; - - if (quiet || !debug) - return; - - fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? 
"msr" : "perf"); -} - bool has_added_counters(void) { /* @@ -7862,54 +7844,8 @@ bool has_added_counters(void) return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters; } -bool is_msr_access_required(void) -{ - if (no_msr) - return false; - - if (has_added_counters()) - return true; - - return BIC_IS_ENABLED(BIC_SMI) - || BIC_IS_ENABLED(BIC_CPU_c1) - || BIC_IS_ENABLED(BIC_CPU_c3) - || BIC_IS_ENABLED(BIC_CPU_c6) - || BIC_IS_ENABLED(BIC_CPU_c7) - || BIC_IS_ENABLED(BIC_Mod_c6) - || BIC_IS_ENABLED(BIC_CoreTmp) - || BIC_IS_ENABLED(BIC_Totl_c0) - || BIC_IS_ENABLED(BIC_Any_c0) - || BIC_IS_ENABLED(BIC_GFX_c0) - || BIC_IS_ENABLED(BIC_CPUGFX) - || BIC_IS_ENABLED(BIC_Pkgpc3) - || BIC_IS_ENABLED(BIC_Pkgpc6) - || BIC_IS_ENABLED(BIC_Pkgpc2) - || BIC_IS_ENABLED(BIC_Pkgpc7) - || BIC_IS_ENABLED(BIC_Pkgpc8) - || BIC_IS_ENABLED(BIC_Pkgpc9) - || BIC_IS_ENABLED(BIC_Pkgpc10) - /* TODO: Multiplex access with perf */ - || BIC_IS_ENABLED(BIC_CorWatt) - || BIC_IS_ENABLED(BIC_Cor_J) - || BIC_IS_ENABLED(BIC_PkgWatt) - || BIC_IS_ENABLED(BIC_CorWatt) - || BIC_IS_ENABLED(BIC_GFXWatt) - || BIC_IS_ENABLED(BIC_RAMWatt) - || BIC_IS_ENABLED(BIC_Pkg_J) - || BIC_IS_ENABLED(BIC_Cor_J) - || BIC_IS_ENABLED(BIC_GFX_J) - || BIC_IS_ENABLED(BIC_RAM_J) - || BIC_IS_ENABLED(BIC_PKG__) - || BIC_IS_ENABLED(BIC_RAM__) - || BIC_IS_ENABLED(BIC_PkgTmp) - || (is_aperf_access_required() && !has_amperf_access_via_perf()); -} - void check_msr_access(void) { - if (!is_msr_access_required()) - no_msr = 1; - check_dev_msr(); check_msr_permission(); @@ -7919,19 +7855,8 @@ void check_msr_access(void) void check_perf_access(void) { - const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC); - - if (no_perf || !intrcount_required || !has_instr_count_access()) + if (no_perf || !BIC_IS_ENABLED(BIC_IPC) || !has_instr_count_access()) bic_enabled &= ~BIC_IPC; - - const bool aperf_required = is_aperf_access_required(); - - if (!aperf_required || !has_amperf_access()) { - bic_enabled &= ~BIC_Avg_MHz; - bic_enabled &= ~BIC_Busy; - bic_enabled &= ~BIC_Bzy_MHz; - bic_enabled &= ~BIC_IPC; - } } void turbostat_init() @@ -7943,7 +7868,7 @@ void turbostat_init() process_cpuid(); counter_info_init(); probe_pm_features(); - set_amperf_source(); + msr_perf_init(); linux_perf_init(); rapl_perf_init(); cstate_perf_init(); @@ -7951,8 +7876,8 @@ void turbostat_init() for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); - if (DO_BIC(BIC_IPC)) - (void)get_instr_count_fd(base_cpu); + if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(base_cpu) != -1) + BIC_PRESENT(BIC_IPC); /* * If TSC tweak is needed, but couldn't get it, -- cgit From 361b8fc73cf63dc0c3be3778720631f1a33ba9db Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 28 May 2024 15:46:10 +0200 Subject: tools/power turbostat: Extend --add option with perf counters User can now read perf counters using "--add perf//". Other details work similarly to how --add works with MSRs. 
Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/Makefile | 1 + tools/power/x86/turbostat/turbostat.8 | 5 +- tools/power/x86/turbostat/turbostat.c | 562 +++++++++++++++++++++++++++++++++- 3 files changed, 557 insertions(+), 11 deletions(-) diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index b1e6817f1e54..3946d5254a1f 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -46,6 +46,7 @@ snapshot: turbostat @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h @echo '#define BUILD_BUG_ON(cond) do { enum { compile_time_check ## __COUNTER__ = 1/(!(cond)) }; } while (0)' > $(SNAPSHOT)/build_bug.h + @echo '#define __must_be_array(arr) 0' >> $(SNAPSHOT)/build_bug.h @echo PWD=. > $(SNAPSHOT)/Makefile @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 8d37acd39201..5537fc6b5bc3 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -28,10 +28,13 @@ name as necessary to disambiguate it from others is necessary. Note that option .PP \fB--add attributes\fP add column with counter having specified 'attributes'. The 'location' attribute is required, all others are optional. .nf - location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP} + location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP | \fBperf//\fP} msrDDD is a decimal offset, eg. msr16 msr0xXXX is a hex offset, eg. msr0x10 /sys/path... is an absolute path to a sysfs attribute + is a perf device from /sys/bus/event_source/devices/ eg. cstate_core + is a perf event for given device from /sys/bus/event_source/devices//events/ eg. c1-residency + perf/cstate_core/c1-residency would then use /sys/bus/event_source/devices/cstate_core/events/c1-residency scope: {\fBcpu\fP | \fBcore\fP | \fBpackage\fP} sample and print the counter for every cpu, core, or package. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 495235055fa2..be345a4bbe96 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -79,14 +79,40 @@ */ #define NAME_BYTES 20 #define PATH_BYTES 128 +#define PERF_NAME_BYTES 128 #define MAX_NOFILE 0x8000 +#define COUNTER_KIND_PERF_PREFIX "perf/" +#define COUNTER_KIND_PERF_PREFIX_LEN strlen(COUNTER_KIND_PERF_PREFIX) +#define PERF_DEV_NAME_BYTES 32 +#define PERF_EVT_NAME_BYTES 32 + enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR }; +struct perf_counter_info { + struct perf_counter_info *next; + + /* How to open the counter / What counter it is. */ + char device[PERF_DEV_NAME_BYTES]; + char event[PERF_EVT_NAME_BYTES]; + + /* How to show/format the counter. */ + char name[PERF_NAME_BYTES]; + unsigned int width; + enum counter_scope scope; + enum counter_type type; + enum counter_format format; + double scale; + + /* For reading the counter. 
*/ + int *fd_perf_per_domain; + size_t num_domains; +}; + struct sysfs_path { char path[PATH_BYTES]; int id; @@ -1457,6 +1483,7 @@ struct thread_data { unsigned int flags; bool is_atom; unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS]; } *thread_even, *thread_odd; struct core_data { @@ -1470,6 +1497,7 @@ struct core_data { unsigned int core_id; unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1503,6 +1531,7 @@ struct pkg_data { unsigned int pkg_temp_c; unsigned int uncore_mhz; unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1637,12 +1666,21 @@ int idx_valid(int idx) } struct sys_counters { + /* MSR added counters */ unsigned int added_thread_counters; unsigned int added_core_counters; unsigned int added_package_counters; struct msr_counter *tp; struct msr_counter *cp; struct msr_counter *pp; + + /* perf added counters */ + unsigned int added_thread_perf_counters; + unsigned int added_core_perf_counters; + unsigned int added_package_perf_counters; + struct perf_counter_info *perf_tp; + struct perf_counter_info *perf_cp; + struct perf_counter_info *perf_pp; } sys; static size_t free_msr_counters_(struct msr_counter **pp) @@ -1902,6 +1940,23 @@ int probe_msr(int cpu, off_t offset) return 0; } +/* Convert CPU ID to domain ID for given added perf counter. */ +unsigned int cpu_to_domain(const struct perf_counter_info *pc, int cpu) +{ + switch (pc->scope) { + case SCOPE_CPU: + return cpu; + + case SCOPE_CORE: + return cpus[cpu].physical_core_id; + + case SCOPE_PACKAGE: + return cpus[cpu].physical_package_id; + } + + __builtin_unreachable(); +} + #define MAX_DEFERRED 16 char *deferred_add_names[MAX_DEFERRED]; char *deferred_skip_names[MAX_DEFERRED]; @@ -1925,6 +1980,7 @@ void help(void) "to print statistics, until interrupted.\n" " -a, --add add a counter\n" " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" + " eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n" " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" @@ -2034,6 +2090,7 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) void print_header(char *delim) { struct msr_counter *mp; + struct perf_counter_info *pp; int printed = 0; if (DO_BIC(BIC_USEC)) @@ -2091,6 +2148,21 @@ void print_header(char *delim) } } + for (pp = sys.perf_tp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU_c3)) @@ -2131,6 +2203,21 @@ void print_header(char *delim) } } + for (pp = sys.perf_cp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? 
delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + if (DO_BIC(BIC_PkgTmp)) outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : "")); @@ -2226,6 +2313,21 @@ void print_header(char *delim) } } + for (pp = sys.perf_pp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + outp += sprintf(outp, "\n"); } @@ -2346,6 +2448,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data char *fmt8; int i; struct msr_counter *mp; + struct perf_counter_info *pp; char *delim = "\t"; int printed = 0; @@ -2483,6 +2586,31 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + /* Added perf counters */ + for (i = 0, pp = sys.perf_tp; pp; ++i, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)t->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + if (pp->type == COUNTER_USEC) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + t->perf_counter[i] / interval_float / 10000); + else + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->perf_counter[i] / tsc); + } + } + /* C1 */ if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc); @@ -2526,6 +2654,24 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)c->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->perf_counter[i] / tsc); + } + } + fmt8 = "%s%.2f"; if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) @@ -2680,6 +2826,26 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%d", (printed++ ? 
delim : ""), (unsigned int)p->counter[i] / 1000); } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)p->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->perf_counter[i] / tsc); + } else if (pp->type == COUNTER_K2M) + outp += + sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000); + } + done: if (*(outp - 1) != '\n') outp += sprintf(outp, "\n"); @@ -2733,6 +2899,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; if (DO_BIC(BIC_Totl_c0)) old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0; @@ -2793,6 +2960,15 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->counter[i] = new->counter[i] - old->counter[i]; } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else if (pp->format == FORMAT_AVERAGE) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } + return 0; } @@ -2800,6 +2976,7 @@ void delta_core(struct core_data *new, struct core_data *old) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; old->c3 = new->c3 - old->c3; old->c6 = new->c6 - old->c6; @@ -2816,6 +2993,13 @@ void delta_core(struct core_data *new, struct core_data *old) else old->counter[i] = new->counter[i] - old->counter[i]; } + + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } } int soft_c1_residency_display(int bic) @@ -2833,6 +3017,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d { int i; struct msr_counter *mp; + struct perf_counter_info *pp; /* we run cpuid just the 1st time, copy the results */ if (DO_BIC(BIC_APIC)) @@ -2911,6 +3096,14 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d else old->counter[i] = new->counter[i] - old->counter[i]; } + + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } + return 0; } @@ -3013,6 +3206,10 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) p->counter[i] = 0; + + memset(&t->perf_counter[0], 0, sizeof(t->perf_counter)); + memset(&c->perf_counter[0], 0, sizeof(c->perf_counter)); + memset(&p->perf_counter[0], 0, sizeof(p->perf_counter)); } void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src) @@ -3033,6 +3230,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; /* copy 
un-changing apic_id's */ if (DO_BIC(BIC_APIC)) @@ -3063,6 +3261,12 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.threads.counter[i] += t->counter[i]; } + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + average.threads.perf_counter[i] += t->perf_counter[i]; + } + /* sum per-core values only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) return 0; @@ -3083,6 +3287,12 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.cores.counter[i] += c->counter[i]; } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + average.cores.perf_counter[i] += c->perf_counter[i]; + } + /* sum per-pkg values only for 1st core in pkg */ if (!is_cpu_first_core_in_package(t, c, p)) return 0; @@ -3134,6 +3344,14 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) else average.packages.counter[i] += p->counter[i]; } + + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if ((pp->format == FORMAT_RAW) && (topo.num_packages == 0)) + average.packages.perf_counter[i] = p->perf_counter[i]; + else + average.packages.perf_counter[i] += p->perf_counter[i]; + } + return 0; } @@ -3145,6 +3363,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data { int i; struct msr_counter *mp; + struct perf_counter_info *pp; clear_counters(&average.threads, &average.cores, &average.packages); @@ -3216,6 +3435,35 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data } average.packages.counter[i] /= topo.allowed_packages; } + + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + if (pp->type == COUNTER_ITEMS) { + if (average.threads.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + continue; + } + average.threads.perf_counter[i] /= topo.allowed_cpus; + } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + if (pp->type == COUNTER_ITEMS) { + if (average.cores.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + } + average.cores.perf_counter[i] /= topo.allowed_cores; + } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + if (pp->type == COUNTER_ITEMS) { + if (average.packages.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + } + average.packages.perf_counter[i] /= topo.allowed_packages; + } } static unsigned long long rdtsc(void) @@ -3848,6 +4096,31 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) return 0; } +int perf_counter_info_read_values(struct perf_counter_info *pp, int cpu, unsigned long long *out, size_t out_size) +{ + unsigned int domain; + unsigned long long value; + int fd_counter; + + for (size_t i = 0; pp; ++i, pp = pp->next) { + domain = cpu_to_domain(pp, cpu); + assert(domain < pp->num_domains); + + fd_counter = pp->fd_perf_per_domain[domain]; + + if (fd_counter == -1) + continue; + + if (read(fd_counter, &value, sizeof(value)) != sizeof(value)) + return 1; + + assert(i < out_size); + out[i] = value * pp->scale; + } + + return 0; +} + /* * get_counters(...) 
* migrate to cpu @@ -3889,6 +4162,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -10; } + if (perf_counter_info_read_values(sys.perf_tp, cpu, t->perf_counter, MAX_ADDED_THREAD_COUNTERS)) + return -10; + /* collect core counters only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) goto done; @@ -3927,6 +4203,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -10; } + if (perf_counter_info_read_values(sys.perf_cp, cpu, c->perf_counter, MAX_ADDED_CORE_COUNTERS)) + return -10; + /* collect package counters only for 1st core in package */ if (!is_cpu_first_core_in_package(t, c, p)) goto done; @@ -3999,6 +4278,10 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_mp(cpu, mp, &p->counter[i], path)) return -10; } + + if (perf_counter_info_read_values(sys.perf_pp, cpu, p->perf_counter, MAX_ADDED_PACKAGE_COUNTERS)) + return -10; + done: gettimeofday(&t->tv_end, (struct timezone *)NULL); @@ -4528,6 +4811,36 @@ void free_fd_rapl_percpu(void) rapl_counter_info_perdomain_size = 0; } +void free_fd_added_perf_counters_(struct perf_counter_info *pp) +{ + if (!pp) + return; + + if (!pp->fd_perf_per_domain) + return; + + while (pp) { + for (size_t domain = 0; domain < pp->num_domains; ++domain) { + if (pp->fd_perf_per_domain[domain] != -1) { + close(pp->fd_perf_per_domain[domain]); + pp->fd_perf_per_domain[domain] = -1; + } + } + + free(pp->fd_perf_per_domain); + pp->fd_perf_per_domain = NULL; + + pp = pp->next; + } +} + +void free_fd_added_perf_counters(void) +{ + free_fd_added_perf_counters_(sys.perf_tp); + free_fd_added_perf_counters_(sys.perf_cp); + free_fd_added_perf_counters_(sys.perf_pp); +} + void free_all_buffers(void) { int i; @@ -4573,6 +4886,7 @@ void free_all_buffers(void) free_fd_msr(); free_fd_rapl_percpu(); free_fd_cstate(); + free_fd_added_perf_counters(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4910,6 +5224,7 @@ void linux_perf_init(void); void msr_perf_init(void); void rapl_perf_init(void); void cstate_perf_init(void); +void added_perf_counters_init(void); void re_initialize(void) { @@ -4919,6 +5234,7 @@ void re_initialize(void) msr_perf_init(); rapl_perf_init(); cstate_perf_init(); + added_perf_counters_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -7859,6 +8175,117 @@ void check_perf_access(void) bic_enabled &= ~BIC_IPC; } +int added_perf_counters_init_(struct perf_counter_info *pinfo) +{ + size_t num_domains = 0; + unsigned int next_domain; + bool *domain_visited; + unsigned int perf_type, perf_config; + double perf_scale; + int fd_perf; + + if (!pinfo) + return 0; + + const size_t max_num_domains = MAX(topo.max_cpu_num + 1, MAX(topo.max_core_id + 1, topo.max_package_id + 1)); + + domain_visited = calloc(max_num_domains, sizeof(*domain_visited)); + + while (pinfo) { + switch (pinfo->scope) { + case SCOPE_CPU: + num_domains = topo.max_cpu_num + 1; + break; + + case SCOPE_CORE: + num_domains = topo.max_core_id + 1; + break; + + case SCOPE_PACKAGE: + num_domains = topo.max_package_id + 1; + break; + } + + /* Allocate buffer for file descriptor for each domain. 
*/ + pinfo->fd_perf_per_domain = calloc(num_domains, sizeof(*pinfo->fd_perf_per_domain)); + if (!pinfo->fd_perf_per_domain) + errx(1, "%s: alloc %s", __func__, "fd_perf_per_domain"); + + for (size_t i = 0; i < num_domains; ++i) + pinfo->fd_perf_per_domain[i] = -1; + + pinfo->num_domains = num_domains; + pinfo->scale = 1.0; + + memset(domain_visited, 0, max_num_domains * sizeof(*domain_visited)); + + for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) { + + next_domain = cpu_to_domain(pinfo, cpu); + + assert(next_domain < num_domains); + + if (cpu_is_not_allowed(cpu)) + continue; + + if (domain_visited[next_domain]) + continue; + + perf_type = read_perf_type(pinfo->device); + if (perf_type == (unsigned int)-1) { + warnx("%s: perf/%s/%s: failed to read %s", + __func__, pinfo->device, pinfo->event, "type"); + continue; + } + + perf_config = read_rapl_config(pinfo->device, pinfo->event); + if (perf_config == (unsigned int)-1) { + warnx("%s: perf/%s/%s: failed to read %s", + __func__, pinfo->device, pinfo->event, "config"); + continue; + } + + /* Scale is not required, some counters just don't have it. */ + perf_scale = read_perf_rapl_scale(pinfo->device, pinfo->event); + if (perf_scale == 0.0) + perf_scale = 1.0; + + fd_perf = open_perf_counter(cpu, perf_type, perf_config, -1, 0); + if (fd_perf == -1) { + warnx("%s: perf/%s/%s: failed to open counter on cpu%d", + __func__, pinfo->device, pinfo->event, cpu); + continue; + } + + domain_visited[next_domain] = 1; + pinfo->fd_perf_per_domain[next_domain] = fd_perf; + pinfo->scale = perf_scale; + + if (debug) + printf("Add perf/%s/%s cpu%d: %d\n", + pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); + } + + pinfo = pinfo->next; + } + + free(domain_visited); + + return 0; +} + +void added_perf_counters_init(void) +{ + if (added_perf_counters_init_(sys.perf_tp)) + errx(1, "%s: %s", __func__, "thread"); + + if (added_perf_counters_init_(sys.perf_cp)) + errx(1, "%s: %s", __func__, "core"); + + if (added_perf_counters_init_(sys.perf_pp)) + errx(1, "%s: %s", __func__, "package"); +} + void turbostat_init() { setup_all_buffers(true); @@ -7872,6 +8299,7 @@ void turbostat_init() linux_perf_init(); rapl_perf_init(); cstate_perf_init(); + added_perf_counters_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); @@ -8061,6 +8489,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = calloc(1, sizeof(struct msr_counter)); if (msrp == NULL) err(-1, "calloc msr_counter"); + msrp->msr_num = msr_num; strncpy(msrp->name, name, NAME_BYTES - 1); msrp->width = width; @@ -8101,11 +8530,106 @@ int add_counter(unsigned int msr_num, char *path, char *name, return 0; } +/* + * Initialize the fields used for identifying and opening the counter. + * + * Defer the initialization of any runtime buffers for actually reading + * the counters for when we initialize all perf counters, so we can later + * easily call re_initialize(). 
+ */ +struct perf_counter_info *make_perf_counter_info(const char *perf_device, + const char *perf_event, + const char *name, + unsigned int width, + enum counter_scope scope, + enum counter_type type, enum counter_format format) +{ + struct perf_counter_info *pinfo; + + pinfo = calloc(1, sizeof(*pinfo)); + if (!pinfo) + errx(1, "%s: Failed to allocate %s/%s\n", __func__, perf_device, perf_event); + + strncpy(pinfo->device, perf_device, ARRAY_SIZE(pinfo->device) - 1); + strncpy(pinfo->event, perf_event, ARRAY_SIZE(pinfo->event) - 1); + + strncpy(pinfo->name, name, ARRAY_SIZE(pinfo->name) - 1); + pinfo->width = width; + pinfo->scope = scope; + pinfo->type = type; + pinfo->format = format; + + return pinfo; +} + +int add_perf_counter(const char *perf_device, const char *perf_event, const char *name_buffer, unsigned int width, + enum counter_scope scope, enum counter_type type, enum counter_format format) +{ + struct perf_counter_info *pinfo; + + switch (scope) { + case SCOPE_CPU: + if (sys.added_thread_perf_counters >= MAX_ADDED_THREAD_COUNTERS) { + warnx("ignoring thread counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + + case SCOPE_CORE: + if (sys.added_core_perf_counters >= MAX_ADDED_CORE_COUNTERS) { + warnx("ignoring core counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + + case SCOPE_PACKAGE: + if (sys.added_package_perf_counters >= MAX_ADDED_PACKAGE_COUNTERS) { + warnx("ignoring package counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + } + + pinfo = make_perf_counter_info(perf_device, perf_event, name_buffer, width, scope, type, format); + + if (!pinfo) + return -1; + + switch (scope) { + case SCOPE_CPU: + pinfo->next = sys.perf_tp; + sys.perf_tp = pinfo; + ++sys.added_thread_perf_counters; + break; + + case SCOPE_CORE: + pinfo->next = sys.perf_cp; + sys.perf_cp = pinfo; + ++sys.added_core_perf_counters; + break; + + case SCOPE_PACKAGE: + pinfo->next = sys.perf_pp; + sys.perf_pp = pinfo; + ++sys.added_package_perf_counters; + break; + } + + // FIXME: we might not have debug here yet + if (debug) + printf("%s: %s/%s, name: %s, scope%d\n", + __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); + + return 0; +} + void parse_add_command(char *add_command) { int msr_num = 0; char *path = NULL; - char name_buffer[NAME_BYTES] = ""; + char perf_device[PERF_DEV_NAME_BYTES] = ""; + char perf_event[PERF_EVT_NAME_BYTES] = ""; + char name_buffer[PERF_NAME_BYTES] = ""; int width = 64; int fail = 0; enum counter_scope scope = SCOPE_CPU; @@ -8120,6 +8644,11 @@ void parse_add_command(char *add_command) if (sscanf(add_command, "msr%d", &msr_num) == 1) goto next; + BUILD_BUG_ON(ARRAY_SIZE(perf_device) <= 31); + BUILD_BUG_ON(ARRAY_SIZE(perf_event) <= 31); + if (sscanf(add_command, "perf/%31[^/]/%31[^,]", &perf_device[0], &perf_event[0]) == 2) + goto next; + if (*add_command == '/') { path = add_command; goto next; @@ -8167,7 +8696,8 @@ void parse_add_command(char *add_command) goto next; } - if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) { /* 18 < NAME_BYTES */ + BUILD_BUG_ON(ARRAY_SIZE(name_buffer) <= 18); + if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) { char *eos; eos = strchr(name_buffer, ','); @@ -8184,21 +8714,33 @@ next: } } - if ((msr_num == 0) && (path == NULL)) { - fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter ) required\n"); + if ((msr_num == 0) && (path == NULL) && (perf_device[0] == '\0' || perf_event[0] == '\0')) { + fprintf(stderr, "--add: (msrDDD | msr0xXXX | 
/path_to_counter | perf/device/event ) required\n"); fail++; } + /* Test for non-empty perf_device and perf_event */ + const bool is_perf_counter = perf_device[0] && perf_event[0]; + /* generate default column header */ if (*name_buffer == '\0') { - if (width == 32) - sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); - else - sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + if (is_perf_counter) { + snprintf(name_buffer, ARRAY_SIZE(name_buffer), "perf/%s", perf_event); + } else { + if (width == 32) + sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + else + sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + } } - if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) - fail++; + if (is_perf_counter) { + if (add_perf_counter(perf_device, perf_event, name_buffer, width, scope, type, format)) + fail++; + } else { + if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) + fail++; + } if (fail) { help(); -- cgit From 25826c20da55a114c2cba8c4f237dfe7a7b4f8f6 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 9 Jul 2024 14:22:30 +0200 Subject: tools/power turbostat: Fix formatting in turbostat.8 We had an extra "+" at the beginning of some lines that look like a poorly formated patch. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 5537fc6b5bc3..8a21e9b56071 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -70,10 +70,10 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--quiet\fP Do not decode and print the system configuration header information. .PP -+\fB--no-msr\fP Disable all the uses of the MSR driver. -+.PP -+\fB--no-perf\fP Disable all the uses of the perf API. -+.PP +\fB--no-msr\fP Disable all the uses of the MSR driver. +.PP +\fB--no-perf\fP Disable all the uses of the perf API. +.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. -- cgit From 9f50066b0dd47cff045ba6da37e6def52783ba55 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 9 Jul 2024 14:46:20 +0200 Subject: tools/power turbostat: Add perf added counter example to turbostat.8 We had few lines about the feature, but without any complete examples. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 8a21e9b56071..71ae2d5159ad 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -336,6 +336,24 @@ CPU PRF_CTRL .fi +.SH ADD PERF COUNTER EXAMPLE +Here we limit turbostat to showing just the CPU number for cpu0 - cpu3. +We add a counter showing time spent in C1 core cstate, +labeling it with the column header, "pCPU%c1", and display it only once, +after the conclusion of 0.1 second sleep. +We also show CPU%c1 built-in counter that should show similar values. 
+.nf +sudo ./turbostat --quiet --cpu 0-3 --show CPU,CPU%c1 --add perf/cstate_core/c1-residency,cpu,delta,percent,pCPU%c1 sleep .1 +0.102448 sec +CPU pCPU%c1 CPU%c1 +- 34.89 34.89 +0 45.99 45.99 +1 45.94 45.94 +2 23.83 23.83 +3 23.84 23.84 + +.fi + .SH INPUT For interval-mode, turbostat will immediately end the current interval -- cgit From 478a01016c08d33635359254fc1ab61a8c83dd1a Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 9 Jul 2024 14:53:14 +0200 Subject: tools/power turbostat: Fix typo in turbostat.8 "After" was missing an "r", nothing to see here. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 71ae2d5159ad..44cbc7cb382d 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -323,7 +323,7 @@ available on all processors. Here we limit turbostat to showing just the CPU number for cpu0 - cpu3. We add a counter showing the 32-bit raw value of MSR 0x199 (MSR_IA32_PERF_CTL), labeling it with the column header, "PRF_CTRL", and display it only once, -afte the conclusion of a 0.1 second sleep. +after the conclusion of a 0.1 second sleep. .nf sudo ./turbostat --quiet --cpu 0-3 --show CPU --add msr0x199,u32,raw,PRF_CTRL sleep .1 0.101604 sec -- cgit From e516211f615fb54ce3429870eefc17469ae289b8 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:05:56 +0200 Subject: rust: macros: indent list item in `paste!`'s docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new style lint, `doc_lazy_continuation` [1], has been introduced in the upcoming Rust 1.80 (currently in beta), which detects missing indentation in code documentation. We have one such case: error: doc list item missing indentation --> rust/macros/lib.rs:315:5 | 315 | /// default the span of the `[< >]` group is used. | ^ | = help: if this is supposed to be its own paragraph, add a blank line = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#doc_lazy_continuation = note: `-D clippy::doc-lazy-continuation` implied by `-D clippy::style` = help: to override `-D clippy::style` add `#[allow(clippy::doc_lazy_continuation)]` help: indent this line | 315 | /// default the span of the `[< >]` group is used. | ++ While the rendering of the docs by `rustdoc` is not affected, we apply this kind of indentation elsewhere since it looks better. Thus clean it up. Link: https://rust-lang.github.io/rust-clippy/master/index.html#/doc_lazy_continuation [1] Reviewed-by: Björn Roy Baron Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-2-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/macros/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs index 6dcc72aff111..159e75292970 100644 --- a/rust/macros/lib.rs +++ b/rust/macros/lib.rs @@ -345,7 +345,7 @@ pub fn pinned_drop(args: TokenStream, input: TokenStream) -> TokenStream { /// /// Currently supported modifiers are: /// * `span`: change the span of concatenated identifier to the span of the specified token. By -/// default the span of the `[< >]` group is used. +/// default the span of the `[< >]` group is used. /// * `lower`: change the identifier to lower case. 
/// * `upper`: change the identifier to upper case. /// -- cgit From dee1396a486cf2b6e7840322f6d104680649f2ff Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:05:57 +0200 Subject: rust: init: simplify from `map_err` to `inspect_err` A new complexity lint, `manual_inspect` [1], has been introduced in the upcoming Rust 1.81 (currently in nightly), which checks for uses of `map*` which return the original item: error: --> rust/kernel/init.rs:846:23 | 846 | (self.1)(val).map_err(|e| { | ^^^^^^^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#manual_inspect = note: `-D clippy::manual-inspect` implied by `-D warnings` = help: to override `-D warnings` add `#[allow(clippy::manual_inspect)]` help: try | 846 ~ (self.1)(val).inspect_err(|e| { 847 | // SAFETY: `slot` was initialized above. 848 ~ unsafe { core::ptr::drop_in_place(slot) }; | Thus clean them up. Link: https://rust-lang.github.io/rust-clippy/master/index.html#/manual_inspect [1] Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-3-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/kernel/init.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/rust/kernel/init.rs b/rust/kernel/init.rs index 68605b633e73..495c09ebe3a3 100644 --- a/rust/kernel/init.rs +++ b/rust/kernel/init.rs @@ -843,11 +843,8 @@ where let val = unsafe { &mut *slot }; // SAFETY: `slot` is considered pinned. let val = unsafe { Pin::new_unchecked(val) }; - (self.1)(val).map_err(|e| { - // SAFETY: `slot` was initialized above. - unsafe { core::ptr::drop_in_place(slot) }; - e - }) + // SAFETY: `slot` was initialized above. + (self.1)(val).inspect_err(|_| unsafe { core::ptr::drop_in_place(slot) }) } } @@ -941,11 +938,9 @@ where // SAFETY: All requirements fulfilled since this function is `__init`. unsafe { self.0.__pinned_init(slot)? }; // SAFETY: The above call initialized `slot` and we still have unique access. - (self.1)(unsafe { &mut *slot }).map_err(|e| { + (self.1)(unsafe { &mut *slot }).inspect_err(|_| // SAFETY: `slot` was initialized above. - unsafe { core::ptr::drop_in_place(slot) }; - e - }) + unsafe { core::ptr::drop_in_place(slot) }) } } -- cgit From f85bea18f71b2817ea45d63c6d1b91f9bc4a811f Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:05:58 +0200 Subject: rust: allow `dead_code` for never constructed bindings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting with the upcoming Rust 1.80.0 (since upstream commit 35130d7233e9 ("Detect pub structs never constructed and unused associated constants in traits")), the `dead_code` pass detects more cases, which triggers in the `bindings` crate: warning: struct `boot_params` is never constructed --> rust/bindings/bindings_generated.rs:10684:12 | 10684 | pub struct boot_params { | ^^^^^^^^^^^ | = note: `#[warn(dead_code)]` on by default As well as in the `uapi` one: warning: struct `boot_params` is never constructed --> rust/uapi/uapi_generated.rs:10392:12 | 10392 | pub struct boot_params { | ^^^^^^^^^^^ | = note: `#[warn(dead_code)]` on by default These are all expected, since we do not use all the structs in the bindings that `bindgen` generates from the C headers. Therefore, allow them. 
Reviewed-by: Björn Roy Baron Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-4-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/bindings/lib.rs | 1 + rust/uapi/lib.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs index 40ddaee50d8b..93a1a3fc97bc 100644 --- a/rust/bindings/lib.rs +++ b/rust/bindings/lib.rs @@ -24,6 +24,7 @@ unsafe_op_in_unsafe_fn )] +#[allow(dead_code)] mod bindings_raw { // Use glob import here to expose all helpers. // Symbols defined within the module will take precedence to the glob import. diff --git a/rust/uapi/lib.rs b/rust/uapi/lib.rs index 0caad902ba40..80a00260e3e7 100644 --- a/rust/uapi/lib.rs +++ b/rust/uapi/lib.rs @@ -14,6 +14,7 @@ #![cfg_attr(test, allow(unsafe_op_in_unsafe_fn))] #![allow( clippy::all, + dead_code, missing_docs, non_camel_case_types, non_upper_case_globals, -- cgit From f8f88aa25a03ce1e0fc8a9842840988b870f0c37 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:05:59 +0200 Subject: rust: relax most deny-level lints to warnings Since we are starting to support several Rust toolchains, lints (including Clippy ones) now may behave differently and lint groups may include new lints. Therefore, to maximize the chances a given version works, relax some deny-level lints to warnings. It may also make our lives a bit easier while developing new code or refactoring. To be clear, the requirements for in-tree code are still the same, since Rust code still needs to be warning-free (patches should be clean under `WERROR=y`) and the set of lints is not changed. `unsafe_op_in_unsafe_fn` is left unmodified, i.e. as an error, since it is becoming the default in the language (warn-by-default in Rust 2024 [1] and ideally an error later on) and thus it should also be very well tested. In addition, it is simple enough that it should not have false positives (unlike e.g. `rust_2018_idioms`'s `explicit_outlives_requirements`). `non_ascii_idents` is left unmodified as well, i.e. as an error, since it is unlikely one gains any productivity during development if it were a warning (in fact, it may be worse, since it is likely one made a typo). In addition, it should not have false positives. Finally, put the two `-D` ones at the top and take the chance to do one per line. Link: https://github.com/rust-lang/rust/pull/112038 [1] Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-5-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Makefile | 24 +++++++++++++----------- rust/Makefile | 4 ++-- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index fea263aaa492..7ea526814fdb 100644 --- a/Makefile +++ b/Makefile @@ -461,17 +461,19 @@ KBUILD_USERLDFLAGS := $(USERLDFLAGS) # host programs. 
export rust_common_flags := --edition=2021 \ -Zbinary_dep_depinfo=y \ - -Dunsafe_op_in_unsafe_fn -Drust_2018_idioms \ - -Dunreachable_pub -Dnon_ascii_idents \ + -Dunsafe_op_in_unsafe_fn \ + -Dnon_ascii_idents \ + -Wrust_2018_idioms \ + -Wunreachable_pub \ -Wmissing_docs \ - -Drustdoc::missing_crate_level_docs \ - -Dclippy::correctness -Dclippy::style \ - -Dclippy::suspicious -Dclippy::complexity \ - -Dclippy::perf \ - -Dclippy::let_unit_value -Dclippy::mut_mut \ - -Dclippy::needless_bitwise_bool \ - -Dclippy::needless_continue \ - -Dclippy::no_mangle_with_rust_abi \ + -Wrustdoc::missing_crate_level_docs \ + -Wclippy::correctness -Wclippy::style \ + -Wclippy::suspicious -Wclippy::complexity \ + -Wclippy::perf \ + -Wclippy::let_unit_value -Wclippy::mut_mut \ + -Wclippy::needless_bitwise_bool \ + -Wclippy::needless_continue \ + -Wclippy::no_mangle_with_rust_abi \ -Wclippy::dbg_macro KBUILD_HOSTCFLAGS := $(KBUILD_USERHOSTCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS) @@ -572,7 +574,7 @@ KBUILD_RUSTFLAGS := $(rust_common_flags) \ -Csymbol-mangling-version=v0 \ -Crelocation-model=static \ -Zfunction-sections=n \ - -Dclippy::float_arithmetic + -Wclippy::float_arithmetic KBUILD_AFLAGS_KERNEL := KBUILD_CFLAGS_KERNEL := diff --git a/rust/Makefile b/rust/Makefile index 385378311322..bf05e65365da 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -367,7 +367,7 @@ ifneq ($(or $(CONFIG_ARM64),$(and $(CONFIG_RISCV),$(CONFIG_64BIT))),) endif $(obj)/core.o: private skip_clippy = 1 -$(obj)/core.o: private skip_flags = -Dunreachable_pub +$(obj)/core.o: private skip_flags = -Wunreachable_pub $(obj)/core.o: private rustc_objcopy = $(foreach sym,$(redirect-intrinsics),--redefine-sym $(sym)=__rust$(sym)) $(obj)/core.o: private rustc_target_flags = $(core-cfgs) $(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs FORCE @@ -381,7 +381,7 @@ $(obj)/compiler_builtins.o: $(src)/compiler_builtins.rs $(obj)/core.o FORCE +$(call if_changed_dep,rustc_library) $(obj)/alloc.o: private skip_clippy = 1 -$(obj)/alloc.o: private skip_flags = -Dunreachable_pub +$(obj)/alloc.o: private skip_flags = -Wunreachable_pub $(obj)/alloc.o: private rustc_target_flags = $(alloc-cfgs) $(obj)/alloc.o: $(RUST_LIB_SRC)/alloc/src/lib.rs $(obj)/compiler_builtins.o FORCE +$(call if_changed_dep,rustc_library) -- cgit From bb421b517e4b05d1e971c1fbf6af2f241accbf52 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:00 +0200 Subject: rust: simplify Clippy warning flags set All Clippy lint groups that we enable, except `correctness`, have a default `warn` level, thus they may be removed now that we relaxed all lints to `warn`. Moreover, Clippy provides an `all` lint group that covers the groups we enable by default. Thus just use `all` instead -- the only change is that, if Clippy introduces a new lint group or splits an existing one, we will cover that one automatically. In addition, `let_unit_value` is in `style` since Rust 1.62.0, thus it does not need to be enabled manually. 
Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-6-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Makefile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 7ea526814fdb..9044fdb9adb1 100644 --- a/Makefile +++ b/Makefile @@ -467,10 +467,8 @@ export rust_common_flags := --edition=2021 \ -Wunreachable_pub \ -Wmissing_docs \ -Wrustdoc::missing_crate_level_docs \ - -Wclippy::correctness -Wclippy::style \ - -Wclippy::suspicious -Wclippy::complexity \ - -Wclippy::perf \ - -Wclippy::let_unit_value -Wclippy::mut_mut \ + -Wclippy::all \ + -Wclippy::mut_mut \ -Wclippy::needless_bitwise_bool \ -Wclippy::needless_continue \ -Wclippy::no_mangle_with_rust_abi \ -- cgit From 63b27f4a0074bc6ef987a44ee9ad8bf960b568c2 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:01 +0200 Subject: rust: start supporting several compiler versions It is time to start supporting several Rust compiler versions and thus establish a minimum Rust version. We may still want to upgrade the minimum sometimes in the beginning since there may be important features coming into the language that improve how we write code (e.g. field projections), which may or may not make sense to support conditionally. We will start with a window of two stable releases, and widen it over time. Thus this patch does not move the current minimum (1.78.0), but instead adds support for the recently released 1.79.0. This should already be enough for kernel developers in distributions that provide recent Rust compiler versions routinely, such as Arch Linux, Debian Unstable (outside the freeze period), Fedora Linux, Gentoo Linux (especially the testing channel), Nix (unstable) and openSUSE Tumbleweed. See the documentation patch about it later in this series. In addition, Rust for Linux is now being built-tested in Rust's pre-merge CI [1]. That is, every change that is attempting to land into the Rust compiler is tested against the kernel, and it is merged only if it passes -- thanks to the Rust project for that! Thus, with the pre-merge CI in place, both projects hope to avoid unintentional changes to Rust that break the kernel. This means that, in general, apart from intentional changes on their side (that we will need to workaround conditionally on our side), the upcoming Rust compiler versions should generally work. For instance, currently, the beta (1.80.0) and nightly (1.81.0) branches work as well. Of course, the Rust for Linux CI job in the Rust toolchain may still need to be temporarily disabled for different reasons, but the intention is to help bring Rust for Linux into stable Rust. Link: https://github.com/rust-lang/rust/pull/125209 [1] Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-7-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Documentation/process/changes.rst | 4 +--- Documentation/rust/quick-start.rst | 15 +++++++-------- scripts/rust_is_available.sh | 8 -------- scripts/rust_is_available_test.py | 5 ----- 4 files changed, 8 insertions(+), 24 deletions(-) diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 5685d7bfe4d0..0d0b7120792b 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -88,9 +88,7 @@ docs on :ref:`Building Linux with Clang/LLVM `. 
Rust (optional) --------------- -A particular version of the Rust toolchain is required. Newer versions may or -may not work because the kernel depends on some unstable Rust features, for -the moment. +A recent version of the Rust compiler is required. Each Rust toolchain comes with several "components", some of which are required (like ``rustc``) and some that are optional. The ``rust-src`` component (which diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index ac2f16288458..89bbfde8c96c 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -36,16 +36,15 @@ if that is the case. rustc ***** -A particular version of the Rust compiler is required. Newer versions may or -may not work because, for the moment, the kernel depends on some unstable -Rust features. +A recent version of the Rust compiler is required. If ``rustup`` is being used, enter the kernel build directory (or use -``--path=`` argument to the ``set`` sub-command) and run:: +``--path=`` argument to the ``set`` sub-command) and run, +for instance:: - rustup override set $(scripts/min-tool-version.sh rustc) + rustup override set stable -This will configure your working directory to use the correct version of +This will configure your working directory to use the given version of ``rustc`` without affecting your default toolchain. Note that the override applies to the current working directory (and its @@ -72,9 +71,9 @@ version later on requires re-adding the component. Otherwise, if a standalone installer is used, the Rust source tree may be downloaded into the toolchain's installation folder:: - curl -L "https://static.rust-lang.org/dist/rust-src-$(scripts/min-tool-version.sh rustc).tar.gz" | + curl -L "https://static.rust-lang.org/dist/rust-src-$(rustc --version | cut -d' ' -f2).tar.gz" | tar -xzf - -C "$(rustc --print sysroot)/lib" \ - "rust-src-$(scripts/min-tool-version.sh rustc)/rust-src/lib/" \ + "rust-src-$(rustc --version | cut -d' ' -f2)/rust-src/lib/" \ --strip-components=3 In this case, upgrading the Rust compiler version later on requires manually diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh index 117018946b57..67cb900124cc 100755 --- a/scripts/rust_is_available.sh +++ b/scripts/rust_is_available.sh @@ -117,14 +117,6 @@ if [ "$rust_compiler_cversion" -lt "$rust_compiler_min_cversion" ]; then echo >&2 "***" exit 1 fi -if [ "$rust_compiler_cversion" -gt "$rust_compiler_min_cversion" ]; then - echo >&2 "***" - echo >&2 "*** Rust compiler '$RUSTC' is too new. This may or may not work." - echo >&2 "*** Your version: $rust_compiler_version" - echo >&2 "*** Expected version: $rust_compiler_min_version" - echo >&2 "***" - warning=1 -fi # Check that the Rust bindings generator is suitable. # diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py index 57613fe5ed75..a255f79aafc2 100755 --- a/scripts/rust_is_available_test.py +++ b/scripts/rust_is_available_test.py @@ -193,11 +193,6 @@ else: result = self.run_script(self.Expected.FAILURE, { "RUSTC": rustc }) self.assertIn(f"Rust compiler '{rustc}' is too old.", result.stderr) - def test_rustc_new_version(self): - rustc = self.generate_rustc("rustc 1.999.0 (a8314ef7d 2099-06-27)") - result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "RUSTC": rustc }) - self.assertIn(f"Rust compiler '{rustc}' is too new. 
This may or may not work.", result.stderr) - def test_bindgen_nonexecutable(self): result = self.run_script(self.Expected.FAILURE, { "BINDGEN": self.nonexecutable }) self.assertIn(f"Running '{self.nonexecutable}' to check the Rust bindings generator version failed with", result.stderr) -- cgit From d49082faf6a001019693a837dea7b958048c731c Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:02 +0200 Subject: rust: avoid assuming a particular `bindgen` build `bindgen`'s logic to find `libclang` (via `clang-sys`) may change over time, and depends on how it was built (e.g. Linux distributions may decide to build it differently, and we are going to provide documentation on installing it via distributions later in this series). Therefore, clarify that `bindgen` may be built in several ways and simplify the documentation by only mentioning the most prominent environment variable (`LIBCLANG_PATH`) as an example on how to tweak the search of the library at runtime (i.e. when `bindgen` is built as our documentation explains). This also avoids duplicating the documentation, like `bindgen` itself does (i.e. it refers to `clang-sys`). Similarly, replace the test we had for this (which used the real program) with a mocked one, to avoid depending on the particular build as well. Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-8-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Documentation/rust/quick-start.rst | 23 +++++++++-------------- scripts/rust_is_available_test.py | 25 +++++++++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index 89bbfde8c96c..5ea8c8914942 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -113,20 +113,15 @@ Install it via (note that this will download and build the tool from source):: cargo install --locked --version $(scripts/min-tool-version.sh bindgen) bindgen-cli -``bindgen`` needs to find a suitable ``libclang`` in order to work. If it is -not found (or a different ``libclang`` than the one found should be used), -the process can be tweaked using the environment variables understood by -``clang-sys`` (the Rust bindings crate that ``bindgen`` uses to access -``libclang``): - -* ``LLVM_CONFIG_PATH`` can be pointed to an ``llvm-config`` executable. - -* Or ``LIBCLANG_PATH`` can be pointed to a ``libclang`` shared library - or to the directory containing it. - -* Or ``CLANG_PATH`` can be pointed to a ``clang`` executable. - -For details, please see ``clang-sys``'s documentation at: +``bindgen`` uses the ``clang-sys`` crate to find a suitable ``libclang`` (which +may be linked statically, dynamically or loaded at runtime). By default, the +``cargo`` command above will produce a ``bindgen`` binary that will load +``libclang`` at runtime. If it is not found (or a different ``libclang`` than +the one found should be used), the process can be tweaked, e.g. by using the +``LIBCLANG_PATH`` environment variable. 
For details, please see ``clang-sys``'s +documentation at: + + https://github.com/KyleMayes/clang-sys#linking https://github.com/KyleMayes/clang-sys#environment-variables diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py index a255f79aafc2..0481aab862ec 100755 --- a/scripts/rust_is_available_test.py +++ b/scripts/rust_is_available_test.py @@ -55,10 +55,15 @@ else: @classmethod def generate_bindgen(cls, version_stdout, libclang_stderr): + if libclang_stderr is None: + libclang_case = f"raise SystemExit({cls.bindgen_default_bindgen_libclang_failure_exit_code})" + else: + libclang_case = f"print({repr(libclang_stderr)}, file=sys.stderr)" + return cls.generate_executable(f"""#!/usr/bin/env python3 import sys if "rust_is_available_bindgen_libclang.h" in " ".join(sys.argv): - print({repr(libclang_stderr)}, file=sys.stderr) + {libclang_case} else: print({repr(version_stdout)}) """) @@ -67,6 +72,10 @@ else: def generate_bindgen_version(cls, stdout): return cls.generate_bindgen(stdout, cls.bindgen_default_bindgen_libclang_stderr) + @classmethod + def generate_bindgen_libclang_failure(cls): + return cls.generate_bindgen(cls.bindgen_default_bindgen_version_stdout, None) + @classmethod def generate_bindgen_libclang(cls, stderr): return cls.generate_bindgen(cls.bindgen_default_bindgen_version_stdout, stderr) @@ -89,6 +98,7 @@ else: cls.rust_default_sysroot = subprocess.check_output(("rustc", "--print", "sysroot")).decode().strip() cls.bindgen_default_bindgen_version_stdout = f"bindgen {cls.bindgen_default_version}" + cls.bindgen_default_bindgen_libclang_failure_exit_code = 42 cls.bindgen_default_bindgen_libclang_stderr = f"scripts/rust_is_available_bindgen_libclang.h:2:9: warning: clang version {cls.llvm_default_version} [-W#pragma-messages], err: false" cls.default_rustc = cls.generate_rustc(f"rustc {cls.rustc_default_version}") @@ -227,15 +237,10 @@ else: self.assertIn(f"Rust bindings generator '{bindgen}' is too new. This may or may not work.", result.stderr) def test_bindgen_libclang_failure(self): - for env in ( - { "LLVM_CONFIG_PATH": self.missing }, - { "LIBCLANG_PATH": self.missing }, - { "CLANG_PATH": self.missing }, - ): - with self.subTest(env=env): - result = self.run_script(self.Expected.FAILURE, env | { "PATH": os.environ["PATH"], "BINDGEN": "bindgen" }) - self.assertIn("Running 'bindgen' to check the libclang version (used by the Rust", result.stderr) - self.assertIn("bindings generator) failed with code ", result.stderr) + bindgen = self.generate_bindgen_libclang_failure() + result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) + self.assertIn(f"Running '{bindgen}' to check the libclang version (used by the Rust", result.stderr) + self.assertIn(f"bindings generator) failed with code {self.bindgen_default_bindgen_libclang_failure_exit_code}. This may be caused by", result.stderr) def test_bindgen_libclang_unexpected_version(self): bindgen = self.generate_bindgen_libclang("scripts/rust_is_available_bindgen_libclang.h:2:9: warning: clang version unexpected [-W#pragma-messages], err: false") -- cgit From 9e98db17837093cb0f4dcfcc3524739d93249c45 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:03 +0200 Subject: rust: work around `bindgen` 0.69.0 issue `bindgen` 0.69.0 contains a bug: `--version` does not work without providing a header [1]: error: the following required arguments were not provided:
Usage: bindgen
-- ... Thus, in preparation for supporting several `bindgen` versions, work around the issue by passing a dummy argument. Include a comment so that we can remove the workaround in the future. Link: https://github.com/rust-lang/rust-bindgen/pull/2678 [1] Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-9-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- init/Kconfig | 5 ++++- scripts/rust_is_available.sh | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index febdea2afc3b..94e20d3b99d4 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1928,7 +1928,10 @@ config RUSTC_VERSION_TEXT config BINDGEN_VERSION_TEXT string depends on RUST - default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version || echo n) + # The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0 + # (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when + # the minimum version is upgraded past that (0.69.1 already fixed the issue). + default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version workaround-for-0.69.0 || echo n) # # Place an empty function call at each tracepoint site. Can be diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh index 67cb900124cc..1881e8f2a2b9 100755 --- a/scripts/rust_is_available.sh +++ b/scripts/rust_is_available.sh @@ -121,8 +121,12 @@ fi # Check that the Rust bindings generator is suitable. # # Non-stable and distributions' versions may have a version suffix, e.g. `-dev`. +# +# The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0 +# (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when +# the minimum version is upgraded past that (0.69.1 already fixed the issue). rust_bindings_generator_output=$( \ - LC_ALL=C "$BINDGEN" --version 2>/dev/null + LC_ALL=C "$BINDGEN" --version workaround-for-0.69.0 2>/dev/null ) || rust_bindings_generator_code=$? if [ -n "$rust_bindings_generator_code" ]; then echo >&2 "***" -- cgit From c844fa64a2d46982fe75e834f4a46c46d2b3b2e5 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:04 +0200 Subject: rust: start supporting several `bindgen` versions With both the workaround for `bindgen` 0.69.0 and the warning about 0.66.0 and 0.66.1 in place, start supporting several `bindgen` versions, like it was done for the Rust compiler in a previous patch. All other versions, including the latest 0.69.4, build without errors. The `bindgen` project, like Rust, has also agreed to have the kernel in their CI [1] -- thanks! This should help both projects: `bindgen` will be able to detect early issues like those mentioned above, and the kernel will be very likely build with new releases (at least for the basic configuration being tested). 
Link: https://github.com/rust-lang/rust-bindgen/pull/2851 [1] Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-10-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Documentation/rust/quick-start.rst | 7 ++++--- scripts/rust_is_available.sh | 8 -------- scripts/rust_is_available_test.py | 5 ----- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index 5ea8c8914942..66cefbab8f9a 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -107,11 +107,12 @@ bindgen ******* The bindings to the C side of the kernel are generated at build time using -the ``bindgen`` tool. A particular version is required. +the ``bindgen`` tool. -Install it via (note that this will download and build the tool from source):: +Install it, for instance, via (note that this will download and build the tool +from source):: - cargo install --locked --version $(scripts/min-tool-version.sh bindgen) bindgen-cli + cargo install --locked bindgen-cli ``bindgen`` uses the ``clang-sys`` crate to find a suitable ``libclang`` (which may be linked statically, dynamically or loaded at runtime). By default, the diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh index 1881e8f2a2b9..4531f9dd19d3 100755 --- a/scripts/rust_is_available.sh +++ b/scripts/rust_is_available.sh @@ -161,14 +161,6 @@ if [ "$rust_bindings_generator_cversion" -lt "$rust_bindings_generator_min_cvers echo >&2 "***" exit 1 fi -if [ "$rust_bindings_generator_cversion" -gt "$rust_bindings_generator_min_cversion" ]; then - echo >&2 "***" - echo >&2 "*** Rust bindings generator '$BINDGEN' is too new. This may or may not work." - echo >&2 "*** Your version: $rust_bindings_generator_version" - echo >&2 "*** Expected version: $rust_bindings_generator_min_version" - echo >&2 "***" - warning=1 -fi # Check that the `libclang` used by the Rust bindings generator is suitable. # diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py index 0481aab862ec..d6d54b7ea42a 100755 --- a/scripts/rust_is_available_test.py +++ b/scripts/rust_is_available_test.py @@ -231,11 +231,6 @@ else: result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) self.assertIn(f"Rust bindings generator '{bindgen}' is too old.", result.stderr) - def test_bindgen_new_version(self): - bindgen = self.generate_bindgen_version("bindgen 0.999.0") - result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "BINDGEN": bindgen }) - self.assertIn(f"Rust bindings generator '{bindgen}' is too new. This may or may not work.", result.stderr) - def test_bindgen_libclang_failure(self): bindgen = self.generate_bindgen_libclang_failure() result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) -- cgit From 981ad93c89a3c600dee9795d3ead105acc805483 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:05 +0200 Subject: rust: warn about `bindgen` versions 0.66.0 and 0.66.1 `bindgen` versions 0.66.0 and 0.66.1 panic due to C string literals with NUL characters [1]: panicked at .cargo/registry/src/index.crates.io-6f17d22bba15001f/bindgen-0.66.0/codegen/mod.rs:717:71: called `Result::unwrap()` on an `Err` value: FromBytesWithNulError { kind: InteriorNul(4) } Thus, in preparation for supporting several `bindgen` versions, add a version check to warn the user about it. Since some distributions may have patched it (e.g. 
Debian did [2]), check if that seems to be the case (after the version check matches), in order to avoid printing a warning in that case. We could make it an error, but 1) it is going to fail anyway later in the build, 2) we would disable `RUST`, which is also painful, 3) someone could have patched it in a way that still makes our extra check fail (however unlikely), 4) the interior NUL may go away in the headers (however unlikely). Thus just warn about it so that users know why it is failing. In addition, add a couple tests for the new cases. Link: https://github.com/rust-lang/rust-bindgen/pull/2567 [1] Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1069047 [2] Link: https://lore.kernel.org/r/20240709160615.998336-11-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- scripts/rust_is_available.sh | 13 +++++++++++++ scripts/rust_is_available_bindgen_0_66.h | 2 ++ scripts/rust_is_available_test.py | 26 +++++++++++++++++++++++--- 3 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 scripts/rust_is_available_bindgen_0_66.h diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh index 4531f9dd19d3..5262c56dd674 100755 --- a/scripts/rust_is_available.sh +++ b/scripts/rust_is_available.sh @@ -161,6 +161,19 @@ if [ "$rust_bindings_generator_cversion" -lt "$rust_bindings_generator_min_cvers echo >&2 "***" exit 1 fi +if [ "$rust_bindings_generator_cversion" -eq 6600 ] || + [ "$rust_bindings_generator_cversion" -eq 6601 ]; then + # Distributions may have patched the issue (e.g. Debian did). + if ! "$BINDGEN" $(dirname $0)/rust_is_available_bindgen_0_66.h >/dev/null; then + echo >&2 "***" + echo >&2 "*** Rust bindings generator '$BINDGEN' versions 0.66.0 and 0.66.1 may not" + echo >&2 "*** work due to a bug (https://github.com/rust-lang/rust-bindgen/pull/2567)," + echo >&2 "*** unless patched (like Debian's)." + echo >&2 "*** Your version: $rust_bindings_generator_version" + echo >&2 "***" + warning=1 + fi +fi # Check that the `libclang` used by the Rust bindings generator is suitable. 
# diff --git a/scripts/rust_is_available_bindgen_0_66.h b/scripts/rust_is_available_bindgen_0_66.h new file mode 100644 index 000000000000..c0431293421c --- /dev/null +++ b/scripts/rust_is_available_bindgen_0_66.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define A "\0" diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py index d6d54b7ea42a..413741037fb3 100755 --- a/scripts/rust_is_available_test.py +++ b/scripts/rust_is_available_test.py @@ -54,23 +54,30 @@ else: """) @classmethod - def generate_bindgen(cls, version_stdout, libclang_stderr): + def generate_bindgen(cls, version_stdout, libclang_stderr, version_0_66_patched=False): if libclang_stderr is None: libclang_case = f"raise SystemExit({cls.bindgen_default_bindgen_libclang_failure_exit_code})" else: libclang_case = f"print({repr(libclang_stderr)}, file=sys.stderr)" + if version_0_66_patched: + version_0_66_case = "pass" + else: + version_0_66_case = "raise SystemExit(1)" + return cls.generate_executable(f"""#!/usr/bin/env python3 import sys if "rust_is_available_bindgen_libclang.h" in " ".join(sys.argv): {libclang_case} +elif "rust_is_available_bindgen_0_66.h" in " ".join(sys.argv): + {version_0_66_case} else: print({repr(version_stdout)}) """) @classmethod - def generate_bindgen_version(cls, stdout): - return cls.generate_bindgen(stdout, cls.bindgen_default_bindgen_libclang_stderr) + def generate_bindgen_version(cls, stdout, version_0_66_patched=False): + return cls.generate_bindgen(stdout, cls.bindgen_default_bindgen_libclang_stderr, version_0_66_patched) @classmethod def generate_bindgen_libclang_failure(cls): @@ -231,6 +238,19 @@ else: result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) self.assertIn(f"Rust bindings generator '{bindgen}' is too old.", result.stderr) + def test_bindgen_bad_version_0_66_0_and_0_66_1(self): + for version in ("0.66.0", "0.66.1"): + with self.subTest(version=version): + bindgen = self.generate_bindgen_version(f"bindgen {version}") + result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "BINDGEN": bindgen }) + self.assertIn(f"Rust bindings generator '{bindgen}' versions 0.66.0 and 0.66.1 may not", result.stderr) + + def test_bindgen_bad_version_0_66_0_and_0_66_1_patched(self): + for version in ("0.66.0", "0.66.1"): + with self.subTest(version=version): + bindgen = self.generate_bindgen_version(f"bindgen {version}", True) + result = self.run_script(self.Expected.SUCCESS, { "BINDGEN": bindgen }) + def test_bindgen_libclang_failure(self): bindgen = self.generate_bindgen_libclang_failure() result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) -- cgit From b1263411112305acf2af728728591465becb45b0 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:08 +0200 Subject: docs: rust: quick-start: add section on Linux distributions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we are starting to support several Rust compiler and `bindgen` versions, there is a good chance some Linux distributions work out of the box. Thus, provide some instructions on how to set the toolchain up for a few major Linux distributions. This simplifies the setup users need to build the kernel. In addition, add an introduction to the document so that it is easier to understand its structure and move the LLVM+Rust kernel.org toolchains paragraph there (removing "depending on the Linux version"). 
We may want to reorganize the document or split it in the future, but I wanted to focus this commit on the new information added about each particular distribution. Finally, remove the `rustup`'s components mention in `changes.rst` since users do not need it if they install the toolchain via the distributions (and anyway it was too detailed for that main document). Cc: Jan Alexander Steffens Cc: Johannes Löthberg Cc: Fabian Grünbichler Cc: Josh Stone Cc: Randy Barlow Cc: Anna (navi) Figueiredo Gomes Cc: Matoro Mahri Cc: Ryan Scheel Cc: figsoda Cc: Jörg Thalheim Cc: Theodore Ni <43ngvg@masqt.com> Cc: Winter Cc: William Brown Cc: Xiaoguang Wang Cc: Andrea Righi Cc: Zixing Liu Cc: Nathan Chancellor Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-14-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Documentation/process/changes.rst | 5 -- Documentation/rust/quick-start.rst | 93 +++++++++++++++++++++++++++++++++----- 2 files changed, 81 insertions(+), 17 deletions(-) diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 0d0b7120792b..0ce96ae2588c 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -90,11 +90,6 @@ Rust (optional) A recent version of the Rust compiler is required. -Each Rust toolchain comes with several "components", some of which are required -(like ``rustc``) and some that are optional. The ``rust-src`` component (which -is optional) needs to be installed to build the kernel. Other components are -useful for developing. - Please see Documentation/rust/quick-start.rst for instructions on how to satisfy the build requirements of Rust support. In particular, the ``Makefile`` target ``rustavailable`` is useful to check why the Rust toolchain may not diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index 66cefbab8f9a..d06a36106cd4 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -5,24 +5,93 @@ Quick Start This document describes how to get started with kernel development in Rust. +There are a few ways to install a Rust toolchain needed for kernel development. +A simple way is to use the packages from your Linux distribution if they are +suitable -- the first section below explains this approach. An advantage of this +approach is that, typically, the distribution will match the LLVM used by Rust +and Clang. + +Another way is using the prebuilt stable versions of LLVM+Rust provided on +`kernel.org `_. These are the same slim +and fast LLVM toolchains from :ref:`Getting LLVM ` with versions +of Rust added to them that Rust for Linux supports. Two sets are provided: the +"latest LLVM" and "matching LLVM" (please see the link for more information). + +Alternatively, the next two "Requirements" sections explain each component and +how to install them through ``rustup``, the standalone installers from Rust +and/or building them. + +The rest of the document explains other aspects on how to get started. 
+ + +Distributions +------------- + +Arch Linux +********** + +Arch Linux provides recent Rust releases and thus it should generally work out +of the box, e.g.:: + + pacman -S rust rust-src rust-bindgen + + +Debian +****** + +Debian Unstable (Sid), outside of the freeze period, provides recent Rust +releases and thus it should generally work out of the box, e.g.:: + + apt install rustc rust-src bindgen rustfmt rust-clippy + + +Fedora Linux +************ + +Fedora Linux provides recent Rust releases and thus it should generally work out +of the box, e.g.:: + + dnf install rust rust-src bindgen-cli rustfmt clippy + + +Gentoo Linux +************ + +Gentoo Linux (and especially the testing branch) provides recent Rust releases +and thus it should generally work out of the box, e.g.:: + + USE='rust-src rustfmt clippy' emerge dev-lang/rust dev-util/bindgen + +``LIBCLANG_PATH`` may need to be set. + + +Nix +*** + +Nix (unstable channel) provides recent Rust releases and thus it should +generally work out of the box, e.g.:: + + { pkgs ? import {} }: + pkgs.mkShell { + nativeBuildInputs = with pkgs; [ rustc rust-bindgen rustfmt clippy ]; + RUST_LIB_SRC = "${pkgs.rust.packages.stable.rustPlatform.rustLibSrc}"; + } + + +openSUSE +******** + +openSUSE Slowroll and openSUSE Tumbleweed provide recent Rust releases and thus +they should generally work out of the box, e.g.:: + + zypper install rust rust1.79-src rust-bindgen clang + Requirements: Building ---------------------- This section explains how to fetch the tools needed for building. -Some of these requirements might be available from Linux distributions -under names like ``rustc``, ``rust-src``, ``rust-bindgen``, etc. However, -at the time of writing, they are likely not to be recent enough unless -the distribution tracks the latest releases. - -Prebuilt stable versions of LLVM+Rust are provided on `kernel.org -`_. These are the same slim and fast -LLVM toolchains from :ref:`Getting LLVM ` with versions of Rust -added to them that Rust for Linux supports, depending on the Linux version. Two -sets are provided: the "latest LLVM" and "matching LLVM" (please see the link -for more information). - To easily check whether the requirements are met, the following target can be used:: -- cgit From 692a68ee9c3c4ab984ae45a74a7569f14222d5aa Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 10 Jul 2024 17:27:01 +0000 Subject: radix tree test suite: put definition of bitmap_clear() into lib/bitmap.c In tools/ directory, function bitmap_clear() is currently only used in object file tools/testing/radix-tree/xarray.o. But instead of keeping a bitmap.c with only bitmap_clear() definition in radix-tree's own directory, it would be more proper to put it in common directory lib/. Sync the kernel definition and link some related libs, no functional change is expected. 
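For reference, a minimal userspace sketch of the bitmap_clear(map, start, nbits) semantics involved here: clear 'nbits' bits beginning at bit 'start'. This is a plain reference loop for illustration only, not the optimized tools/lib/bitmap.c implementation being moved by this patch:

    #include <stdio.h>

    #define REF_BITS_PER_LONG (8 * sizeof(unsigned long))

    /* Reference behaviour of bitmap_clear(map, start, nbits). */
    static void bitmap_clear_ref(unsigned long *map, unsigned int start,
                                 unsigned int nbits)
    {
        for (unsigned int bit = start; bit < start + nbits; bit++)
            map[bit / REF_BITS_PER_LONG] &= ~(1UL << (bit % REF_BITS_PER_LONG));
    }

    int main(void)
    {
        unsigned long map[2] = { ~0UL, ~0UL };

        bitmap_clear_ref(map, 4, 8);    /* clears bits 4..11 of word 0 */
        printf("%lx %lx\n", map[0], map[1]);
        return 0;
    }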
Signed-off-by: Wei Yang CC: Matthew Wilcox CC: Yury Norov Signed-off-by: Yury Norov --- tools/include/linux/bitmap.h | 17 ++++++++++++++++- tools/lib/bitmap.c | 20 ++++++++++++++++++++ tools/testing/radix-tree/Makefile | 4 ++-- tools/testing/radix-tree/bitmap.c | 23 ----------------------- 4 files changed, 38 insertions(+), 26 deletions(-) delete mode 100644 tools/testing/radix-tree/bitmap.c diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 210c13b1b857..2a7f260ef9dc 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -19,7 +19,7 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); -void bitmap_clear(unsigned long *map, unsigned int start, int len); +void __bitmap_clear(unsigned long *map, unsigned int start, int len); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); @@ -150,4 +150,19 @@ static inline bool bitmap_intersects(const unsigned long *src1, return __bitmap_intersects(src1, src2, nbits); } +static inline void bitmap_clear(unsigned long *map, unsigned int start, + unsigned int nbits) +{ + if (__builtin_constant_p(nbits) && nbits == 1) + __clear_bit(start, map); + else if (small_const_nbits(start + nbits)) + *map &= ~GENMASK(start + nbits - 1, start); + else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && + IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && + __builtin_constant_p(nbits & BITMAP_MEM_MASK) && + IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) + memset((char *)map + start / 8, 0, nbits / 8); + else + __bitmap_clear(map, start, nbits); +} #endif /* _TOOLS_LINUX_BITMAP_H */ diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c index c3e4871967bc..2178862bb114 100644 --- a/tools/lib/bitmap.c +++ b/tools/lib/bitmap.c @@ -100,3 +100,23 @@ bool __bitmap_intersects(const unsigned long *bitmap1, return true; return false; } + +void __bitmap_clear(unsigned long *map, unsigned int start, int len) +{ + unsigned long *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *p &= ~mask_to_clear; + } +} diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index 7527f738b4a1..d1acd7d58850 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -5,8 +5,8 @@ CFLAGS += -I. 
-I../../include -I../../../lib -g -Og -Wall \ LDFLAGS += -fsanitize=address -fsanitize=undefined LDLIBS+= -lpthread -lurcu TARGETS = main idr-test multiorder xarray maple -CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o \ - slab.o maple.o +LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o +CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o maple.o $(LIBS) OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ regression4.o tag_check.o multiorder.o idr-test.o iteration_check.o \ iteration_check_2.o benchmark.o diff --git a/tools/testing/radix-tree/bitmap.c b/tools/testing/radix-tree/bitmap.c deleted file mode 100644 index 66ec4a24a203..000000000000 --- a/tools/testing/radix-tree/bitmap.c +++ /dev/null @@ -1,23 +0,0 @@ -/* lib/bitmap.c pulls in at least two other files. */ - -#include - -void bitmap_clear(unsigned long *map, unsigned int start, int len) -{ - unsigned long *p = map + BIT_WORD(start); - const unsigned int size = start + len; - int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); - unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); - - while (len - bits_to_clear >= 0) { - *p &= ~mask_to_clear; - len -= bits_to_clear; - bits_to_clear = BITS_PER_LONG; - mask_to_clear = ~0UL; - p++; - } - if (len) { - mask_to_clear &= BITMAP_LAST_WORD_MASK(size); - *p &= ~mask_to_clear; - } -} -- cgit From fb9086e95ad84f14e4f4db97ed96422c74407830 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 10 Jul 2024 22:24:18 +0200 Subject: riscv: Remove unnecessary int cast in variable_fls() __builtin_clz() returns an int and casting the whole expression to int is unnecessary. Remove it. Signed-off-by: Thorsten Blum Signed-off-by: Yury Norov --- arch/riscv/include/asm/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 880606b0469a..71af9ecfcfcb 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -170,7 +170,7 @@ legacy: ({ \ typeof(x) x_ = (x); \ __builtin_constant_p(x_) ? \ - (int)((x_ != 0) ? (32 - __builtin_clz(x_)) : 0) \ + ((x_ != 0) ? (32 - __builtin_clz(x_)) : 0) \ : \ variable_fls(x_); \ }) -- cgit From 8ecef8e01a08c7e3e4ffc8f08d9f9663984f334b Mon Sep 17 00:00:00 2001 From: peng guo Date: Wed, 10 Jul 2024 10:31:12 +0800 Subject: cxl/core: Fix incorrect vendor debug UUID define When user send a mbox command whose opcode is CXL_MBOX_OP_CLEAR_LOG and the in_payload is normal vendor debug log UUID according to the CXL specification cxl_payload_from_user_allowed() will return false unexpectedly, Sending mbox cmd operation fails and the kernel log will print: Clear Log: input payload not allowed. All CXL devices that support a debug log shall support the Vendor Debug Log to allow the log to be accessed through a common host driver, for any device, all versions of the CXL specification define the same value with Log Identifier of: 5e1819d9-11a9-400c-811f-d60719403d86 Refer to CXL spec r3.1 Table 8-71 Fix the definition value of DEFINE_CXL_VENDOR_DEBUG_UUID to match the CXL specification. 
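As a quick sanity check, the UUID_INIT() field layout can be printed from a small userspace program; with the corrected first field it reproduces the Vendor Debug Log UUID from the specification. This is an illustrative sketch only, not driver code:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Fields in UUID_INIT(a, b, c, d0..d7) order. */
        uint32_t a = 0x5e1819d9;        /* was 0xe1819d9 before the fix */
        uint16_t b = 0x11a9, c = 0x400c;
        uint8_t d[8] = { 0x81, 0x1f, 0xd6, 0x07, 0x19, 0x40, 0x3d, 0x86 };

        printf("%08x-%04x-%04x-%02x%02x-", a, b, c, d[0], d[1]);
        for (int i = 2; i < 8; i++)
            printf("%02x", d[i]);
        printf("\n");   /* expected: 5e1819d9-11a9-400c-811f-d60719403d86 */
        return 0;
    }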
Fixes: 472b1ce6e9d6 ("cxl/mem: Enable commands via CEL") Signed-off-by: peng guo Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20240710023112.8063-1-engguopeng@buaa.edu.cn Signed-off-by: Dave Jiang --- drivers/cxl/cxlmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 19aba81cdf13..f55141c6d64e 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -562,7 +562,7 @@ enum cxl_opcode { 0x3b, 0x3f, 0x17) #define DEFINE_CXL_VENDOR_DEBUG_UUID \ - UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ + UUID_INIT(0x5e1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ 0x40, 0x3d, 0x86) struct cxl_mbox_get_supported_logs { -- cgit From bebfbbaffccfb69126c5a6a1d41cd868b6419370 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Fri, 28 Jun 2024 19:48:07 +0200 Subject: cxl/acpi: Warn on mixed CXL VH and RCH/RCD Hierarchy Each Host Bridge instance has a corresponding CXL Host Bridge Structure (CHBS) ACPI table that identifies its capabilities. CHBS tables can be two types (CXL 3.1 Table 9-21): The PCIe Root Complex Register Block (RCRB) and CXL Host Bridge Component Registers (CHBCR). If a Host Bridge is attached to a device that is operating in Restricted CXL Device Mode (RCD), BIOS publishes an RCRB with the base address of registers that describe its capabilities (CXL 3.1 sec. 9.11). Instead, the new (CXL 2.0+) Component registers can only be accessed by means of a base address published with a CHBCR (CXL 3.1 sec. 9.12). If an eRCD (a device that forces the host-bridge into CXL 1.1 Restricted CXL Host mode) is attached to a CXL 2.0+ Host-Bridge, the current CXL specification does not define a mechanism for finding CXL-2.0-only root-port component registers like HDM decoders and Extended Security capability. An algorithm to locate a CHBCR associated with an RCRB, would be too invasive to land without some concrete motivation. Therefore, just print a message to inform of unsupported config. Count how many different CHBS "Version" types are detected by cxl_get_chbs_iter(). Then make cxl_get_chbs() print a warning if that sum is greater than 1. Tested-by: Alison Schofield Signed-off-by: Fabio M. 
De Francesco Reviewed-by: Davidlohr Bueso Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20240628175535.272472-1-fabio.m.de.francesco@linux.intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index e51315ea4a6a..574918a9ae3a 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -482,6 +482,8 @@ struct cxl_chbs_context { unsigned long long uid; resource_size_t base; u32 cxl_version; + int nr_versions; + u32 saved_version; }; static int cxl_get_chbs_iter(union acpi_subtable_headers *header, void *arg, @@ -490,22 +492,31 @@ static int cxl_get_chbs_iter(union acpi_subtable_headers *header, void *arg, struct cxl_chbs_context *ctx = arg; struct acpi_cedt_chbs *chbs; - if (ctx->base != CXL_RESOURCE_NONE) - return 0; - chbs = (struct acpi_cedt_chbs *) header; - if (ctx->uid != chbs->uid) + if (chbs->cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11 && + chbs->length != CXL_RCRB_SIZE) return 0; - ctx->cxl_version = chbs->cxl_version; if (!chbs->base) return 0; - if (chbs->cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11 && - chbs->length != CXL_RCRB_SIZE) + if (ctx->saved_version != chbs->cxl_version) { + /* + * cxl_version cannot be overwritten before the next two + * checks, then use saved_version + */ + ctx->saved_version = chbs->cxl_version; + ctx->nr_versions++; + } + + if (ctx->base != CXL_RESOURCE_NONE) + return 0; + + if (ctx->uid != chbs->uid) return 0; + ctx->cxl_version = chbs->cxl_version; ctx->base = chbs->base; return 0; @@ -529,10 +540,19 @@ static int cxl_get_chbs(struct device *dev, struct acpi_device *hb, .uid = uid, .base = CXL_RESOURCE_NONE, .cxl_version = UINT_MAX, + .saved_version = UINT_MAX, }; acpi_table_parse_cedt(ACPI_CEDT_TYPE_CHBS, cxl_get_chbs_iter, ctx); + if (ctx->nr_versions > 1) { + /* + * Disclaim eRCD support given some component register may + * only be found via CHBCR + */ + dev_info(dev, "Unsupported platform config, mixed Virtual Host and Restricted CXL Host hierarchy."); + } + return 0; } -- cgit From 591209c79844c1ecbee79e6b5a019e5b61eab8d3 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Sat, 6 Jul 2024 18:53:43 -0700 Subject: cxl/memdev: Replace ENXIO with EBUSY for inject poison limit reached The CXL driver provides a debugfs interface offering users the ability to inject and clear poison to a memdev. Once a user has injected up to the devices limit further injection requests fail with ENXIO until a clear poison is issued. Users may not have device specs in hand or may want to intentionally hit the limit and then clear. Replace the usual ENXIO return status with EBUSY so users can recognize this failure. Signed-off-by: Alison Schofield Tested-by: Xingtao Yao Reviewed-by: Dan Williams Reviewed-by: Davidlohr Bueso Link: https://patch.msgid.link/825bd4c67fb55a4373c4182d999ad49d4e6b4fe7.1720316188.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- Documentation/ABI/testing/debugfs-cxl | 7 ++++--- drivers/cxl/cxlmem.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/testing/debugfs-cxl b/Documentation/ABI/testing/debugfs-cxl index c61f9b813973..12488c14be64 100644 --- a/Documentation/ABI/testing/debugfs-cxl +++ b/Documentation/ABI/testing/debugfs-cxl @@ -14,9 +14,10 @@ Description: event to its internal Informational Event log, updates the Event Status register, and if configured, interrupts the host. 
It is not an error to inject poison into an address that - already has poison present and no error is returned. The - inject_poison attribute is only visible for devices supporting - the capability. + already has poison present and no error is returned. If the + device returns 'Inject Poison Limit Reached' an -EBUSY error + is returned to the user. The inject_poison attribute is only + visible for devices supporting the capability. What: /sys/kernel/debug/memX/clear_poison diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index f55141c6d64e..40a465c37e03 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -161,7 +161,7 @@ struct cxl_mbox_cmd { C(FWRESET, -ENXIO, "FW failed to activate, needs cold reset"), \ C(HANDLE, -ENXIO, "one or more Event Record Handles were invalid"), \ C(PADDR, -EFAULT, "physical address specified is invalid"), \ - C(POISONLMT, -ENXIO, "poison injection limit has been reached"), \ + C(POISONLMT, -EBUSY, "poison injection limit has been reached"), \ C(MEDIAFAILURE, -ENXIO, "permanent issue with the media"), \ C(ABORT, -ENXIO, "background cmd was aborted by device"), \ C(SECURITY, -ENXIO, "not valid in the current security state"), \ -- cgit From 3a8617c7df6eb351227aad9b0df647f34a7ef423 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Sat, 6 Jul 2024 18:53:44 -0700 Subject: cxl/test: Replace ENXIO with EBUSY for inject poison limit reached The CXL driver was recently updated to return EBUSY rather than ENXIO when the device reports that an injection request exceeds the device's limit. That change to EBUSY allows debug users to differentiate between limit reached and inject failures for any other reason. Change cxl-test to also return EBUSY and tidy up the dev_dbg() messaging to emit the correct limit. Reminder: the cxl-test per device injection limit is a configurable attribute: /sys/bus/platform/drivers/cxl_mock_mem/poison_inject_max Signed-off-by: Alison Schofield Tested-by: Xingtao Yao Reviewed-by: Dan Williams Reviewed-by: Davidlohr Bueso Link: https://patch.msgid.link/ba1b80e1658b644d85d0d5e2287112d00a48b9cf.1720316188.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- tools/testing/cxl/test/mem.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 94de2cf60f4f..129f179b0ac5 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1135,27 +1135,28 @@ static bool mock_poison_dev_max_injected(struct cxl_dev_state *cxlds) return (count >= poison_inject_dev_max); } -static bool mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa) +static int mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa) { + /* Return EBUSY to match the CXL driver handling */ if (mock_poison_dev_max_injected(cxlds)) { dev_dbg(cxlds->dev, "Device poison injection limit has been reached: %d\n", - MOCK_INJECT_DEV_MAX); - return false; + poison_inject_dev_max); + return -EBUSY; } for (int i = 0; i < MOCK_INJECT_TEST_MAX; i++) { if (!mock_poison_list[i].cxlds) { mock_poison_list[i].cxlds = cxlds; mock_poison_list[i].dpa = dpa; - return true; + return 0; } } dev_dbg(cxlds->dev, "Mock test poison injection limit has been reached: %d\n", MOCK_INJECT_TEST_MAX); - return false; + return -ENXIO; } static bool mock_poison_found(struct cxl_dev_state *cxlds, u64 dpa) @@ -1179,10 +1180,8 @@ static int mock_inject_poison(struct cxl_dev_state *cxlds, dev_dbg(cxlds->dev, "DPA: 0x%llx already poisoned\n", dpa); return 0; } - if (!mock_poison_add(cxlds, dpa)) - 
return -ENXIO; - return 0; + return mock_poison_add(cxlds, dpa); } static bool mock_poison_del(struct cxl_dev_state *cxlds, u64 dpa) -- cgit From 9aa5f6235e16ac6fceed789a30e72addf1abd7d8 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 2 Jul 2024 22:29:49 -0700 Subject: cxl/core: Fold cxl_trace_hpa() into cxl_dpa_to_hpa() Although cxl_trace_hpa() is used to populate TRACE EVENTs with HPA addresses the work it performs is a DPA to HPA translation not a trace. Tidy up this naming by moving the minimal work done in cxl_trace_hpa() into cxl_dpa_to_hpa() and use cxl_dpa_to_hpa() for trace event callbacks. Suggested-by: Dan Williams Signed-off-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Robert Richter Link: https://patch.msgid.link/452a9b0c525b774c72d9d5851515ffa928750132.1719980933.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/core.h | 8 ++++---- drivers/cxl/core/mbox.c | 2 +- drivers/cxl/core/region.c | 33 +++++++++++++-------------------- drivers/cxl/core/trace.h | 4 ++-- 4 files changed, 20 insertions(+), 27 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 625394486459..72a506c9dbd0 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -28,12 +28,12 @@ int cxl_region_init(void); void cxl_region_exit(void); int cxl_get_poison_by_endpoint(struct cxl_port *port); struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa); -u64 cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, - u64 dpa); +u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, + u64 dpa); #else -static inline u64 -cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa) +static inline u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, + const struct cxl_memdev *cxlmd, u64 dpa) { return ULLONG_MAX; } diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 2626f3fff201..eb0b08e5136f 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -878,7 +878,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, dpa = le64_to_cpu(evt->common.phys_addr) & CXL_DPA_MASK; cxlr = cxl_dpa_to_region(cxlmd, dpa); if (cxlr) - hpa = cxl_trace_hpa(cxlr, cxlmd, dpa); + hpa = cxl_dpa_to_hpa(cxlr, cxlmd, dpa); if (event_type == CXL_CPER_EVENT_GEN_MEDIA) trace_cxl_general_media(cxlmd, type, cxlr, hpa, diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 3c2b6144be23..237c28d5f2cc 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2749,15 +2749,25 @@ static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos) return false; } -static u64 cxl_dpa_to_hpa(u64 dpa, struct cxl_region *cxlr, - struct cxl_endpoint_decoder *cxled) +u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, + u64 dpa) { u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa; struct cxl_region_params *p = &cxlr->params; - int pos = cxled->pos; + struct cxl_endpoint_decoder *cxled = NULL; u16 eig = 0; u8 eiw = 0; + int pos; + + for (int i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + if (cxlmd == cxled_to_memdev(cxled)) + break; + } + if (!cxled || cxlmd != cxled_to_memdev(cxled)) + return ULLONG_MAX; + pos = cxled->pos; ways_to_eiw(p->interleave_ways, &eiw); granularity_to_eig(p->interleave_granularity, &eig); @@ -2797,23 +2807,6 @@ static u64 cxl_dpa_to_hpa(u64 dpa, struct cxl_region *cxlr, return hpa; } -u64 cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev 
*cxlmd, - u64 dpa) -{ - struct cxl_region_params *p = &cxlr->params; - struct cxl_endpoint_decoder *cxled = NULL; - - for (int i = 0; i < p->nr_targets; i++) { - cxled = p->targets[i]; - if (cxlmd == cxled_to_memdev(cxled)) - break; - } - if (!cxled || cxlmd != cxled_to_memdev(cxled)) - return ULLONG_MAX; - - return cxl_dpa_to_hpa(dpa, cxlr, cxled); -} - static struct lock_class_key cxl_pmem_region_key; static int cxl_pmem_region_alloc(struct cxl_region *cxlr) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index ee5cd4eb2f16..21b76e9c5c60 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -704,8 +704,8 @@ TRACE_EVENT(cxl_poison, if (cxlr) { __assign_str(region); memcpy(__entry->uuid, &cxlr->params.uuid, 16); - __entry->hpa = cxl_trace_hpa(cxlr, cxlmd, - __entry->dpa); + __entry->hpa = cxl_dpa_to_hpa(cxlr, cxlmd, + __entry->dpa); } else { __assign_str(region); memset(__entry->uuid, 0, 16); -- cgit From 3b2fedcd75e3991e77c2a8c3ebcab0ea68b2d69d Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 2 Jul 2024 22:29:50 -0700 Subject: cxl: Restore XOR'd position bits during address translation When a device reports a DPA in events like poison, general_media, and dram, the driver translates that DPA back to an HPA. Presently, the CXL driver translation only considers the Modulo position and will report the wrong HPA for XOR configured root decoders. Add a helper function that restores the XOR'd bits during DPA->HPA address translation. Plumb a root decoder callback to the new helper when XOR interleave arithmetic is in use. For Modulo arithmetic, just let the callback be NULL - as in no extra work required. Upon completion of a DPA->HPA translation a couple of checks are performed on the result. One simply confirms that the calculated HPA is within the address range of the region. That test is useful for both Modulo and XOR interleave arithmetic decodes. A second check confirms that the HPA is within an expected chunk based on the endpoints position in the region and the region granularity. An XOR decode disrupts the Modulo pattern making the chunk check useless. To align the checks with the proper decode, pull the region range check inline and use the helper to do the chunk check for Modulo decodes only. 
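For illustration, a standalone sketch of the position-bit restore performed by the new helper: for each xormap, XORALLBITS is computed as the parity of (hpa & xormap) and written back into the map's lowest set bit. The xormap values below are made up purely to demonstrate the bit manipulation; this is not the kernel implementation itself:

    #include <stdio.h>
    #include <stdint.h>

    /* XORALLBITS: XOR of all bits in (hpa & xormap), i.e. popcount parity. */
    static uint64_t xorallbits(uint64_t v)
    {
        return __builtin_popcountll(v) & 1;
    }

    static uint64_t restore_xor_pos_bits(uint64_t hpa, const uint64_t *xormaps,
                                         int nr_maps)
    {
        for (int i = 0; i < nr_maps; i++) {
            int pos;

            if (!xormaps[i])
                continue;
            pos = __builtin_ctzll(xormaps[i]);      /* lowest set bit */
            hpa = (hpa & ~(1ULL << pos)) |
                  (xorallbits(hpa & xormaps[i]) << pos);
        }
        return hpa;
    }

    int main(void)
    {
        /* Example xormaps, chosen only to exercise the restore loop. */
        const uint64_t maps[] = { 0x100, 0x10200 };

        printf("0x%llx\n",
               (unsigned long long)restore_xor_pos_bits(0x12345678, maps, 2));
        return 0;
    }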
A cxl-test unit test is posted for upstream review here: https://lore.kernel.org/20240624210644.495563-1-alison.schofield@intel.com/ Fixes: 28a3ae4ff66c ("cxl/trace: Add an HPA to cxl_poison trace events") Signed-off-by: Alison Schofield Tested-by: Diego Garcia Rodriguez Reviewed-by: Dan Williams Link: https://patch.msgid.link/1a1ac880d9f889bd6384e657e810431b9a0a72e5.1719980933.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 40 ++++++++++++++++++++++++++++++++++++++++ drivers/cxl/core/region.c | 23 ++++++++++++++--------- drivers/cxl/cxl.h | 3 +++ 3 files changed, 57 insertions(+), 9 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 571069863c62..6b6ae9c81368 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -74,6 +74,43 @@ static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos) return cxlrd->cxlsd.target[n]; } +static u64 cxl_xor_hpa_to_spa(struct cxl_root_decoder *cxlrd, u64 hpa) +{ + struct cxl_cxims_data *cximsd = cxlrd->platform_data; + int hbiw = cxlrd->cxlsd.nr_targets; + u64 val; + int pos; + + /* No xormaps for host bridge interleave ways of 1 or 3 */ + if (hbiw == 1 || hbiw == 3) + return hpa; + + /* + * For root decoders using xormaps (hbiw: 2,4,6,8,12,16) restore + * the position bit to its value before the xormap was applied at + * HPA->DPA translation. + * + * pos is the lowest set bit in an XORMAP + * val is the XORALLBITS(HPA & XORMAP) + * + * XORALLBITS: The CXL spec (3.1 Table 9-22) defines XORALLBITS + * as an operation that outputs a single bit by XORing all the + * bits in the input (hpa & xormap). Implement XORALLBITS using + * hweight64(). If the hamming weight is even the XOR of those + * bits results in val==0, if odd the XOR result is val==1. 
+ */ + + for (int i = 0; i < cximsd->nr_maps; i++) { + if (!cximsd->xormaps[i]) + continue; + pos = __ffs(cximsd->xormaps[i]); + val = (hweight64(hpa & cximsd->xormaps[i]) & 1); + hpa = (hpa & ~(1ULL << pos)) | (val << pos); + } + + return hpa; +} + struct cxl_cxims_context { struct device *dev; struct cxl_root_decoder *cxlrd; @@ -434,6 +471,9 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, cxlrd->qos_class = cfmws->qtg_id; + if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) + cxlrd->hpa_to_spa = cxl_xor_hpa_to_spa; + rc = cxl_decoder_add(cxld, target_map); if (rc) return rc; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 237c28d5f2cc..23abd0f7b856 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2723,20 +2723,13 @@ struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa) return ctx.cxlr; } -static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos) +static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos) { struct cxl_region_params *p = &cxlr->params; int gran = p->interleave_granularity; int ways = p->interleave_ways; u64 offset; - /* Is the hpa within this region at all */ - if (hpa < p->res->start || hpa > p->res->end) { - dev_dbg(&cxlr->dev, - "Addr trans fail: hpa 0x%llx not in region\n", hpa); - return false; - } - /* Is the hpa in an expected chunk for its pos(-ition) */ offset = hpa - p->res->start; offset = do_div(offset, gran * ways); @@ -2752,6 +2745,7 @@ static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos) u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa) { + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa; struct cxl_region_params *p = &cxlr->params; struct cxl_endpoint_decoder *cxled = NULL; @@ -2801,7 +2795,18 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, /* Apply the hpa_offset to the region base address */ hpa = hpa_offset + p->res->start; - if (!cxl_is_hpa_in_range(hpa, cxlr, cxled->pos)) + /* Root decoder translation overrides typical modulo decode */ + if (cxlrd->hpa_to_spa) + hpa = cxlrd->hpa_to_spa(cxlrd, hpa); + + if (hpa < p->res->start || hpa > p->res->end) { + dev_dbg(&cxlr->dev, + "Addr trans fail: hpa 0x%llx not in region\n", hpa); + return ULLONG_MAX; + } + + /* Simple chunk check, by pos & gran, only applies to modulo decodes */ + if (!cxlrd->hpa_to_spa && (!cxl_is_hpa_in_chunk(hpa, cxlr, pos))) return ULLONG_MAX; return hpa; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 603c0120cff8..dfc4f27d195c 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -434,12 +434,14 @@ struct cxl_switch_decoder { struct cxl_root_decoder; typedef struct cxl_dport *(*cxl_calc_hb_fn)(struct cxl_root_decoder *cxlrd, int pos); +typedef u64 (*cxl_hpa_to_spa_fn)(struct cxl_root_decoder *cxlrd, u64 hpa); /** * struct cxl_root_decoder - Static platform CXL address decoder * @res: host / parent resource for region allocations * @region_id: region id for next region provisioning event * @calc_hb: which host bridge covers the n'th position by granularity + * @hpa_to_spa: translate CXL host-physical-address to Platform system-physical-address * @platform_data: platform specific configuration data * @range_lock: sync region autodiscovery by address range * @qos_class: QoS performance class cookie @@ -449,6 +451,7 @@ struct cxl_root_decoder { struct resource *res; 
atomic_t region_id; cxl_calc_hb_fn calc_hb; + cxl_hpa_to_spa_fn hpa_to_spa; void *platform_data; struct mutex range_lock; int qos_class; -- cgit From 82a3e3a235633aa0575fac9507d648dd80f3437f Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 2 Jul 2024 22:29:51 -0700 Subject: cxl/region: Verify target positions using the ordered target list When a root decoder is configured the interleave target list is read from the BIOS populated CFMWS structure. Per the CXL spec 3.1 Table 9-22 the target list is in interleave order. The CXL driver populates its decoder target list in the same order and stores it in 'struct cxl_switch_decoder' field "@target: active ordered target list in current decoder configuration" Given the promise of an ordered list, the driver can stop duplicating the work of BIOS and simply check target positions against the ordered list during region configuration. The simplified check against the ordered list is presented here. A follow-on patch will remove the unused code. For Modulo arithmetic this is not a fix, only a simplification. For XOR arithmetic this is a fix for HB IW of 3,6,12. Fixes: f9db85bfec0d ("cxl/acpi: Support CXL XOR Interleave Math (CXIMS)") Signed-off-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/35d08d3aba08fee0f9b86ab1cef0c25116ca8a55.1719980933.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 23abd0f7b856..2772828ca6ca 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1559,10 +1559,13 @@ static int cxl_region_attach_position(struct cxl_region *cxlr, const struct cxl_dport *dport, int pos) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; + struct cxl_decoder *cxld = &cxlsd->cxld; + int iw = cxld->interleave_ways; struct cxl_port *iter; int rc; - if (cxlrd->calc_hb(cxlrd, pos) != dport) { + if (dport != cxlrd->cxlsd.target[pos % iw]) { dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), dev_name(&cxlrd->cxlsd.cxld.dev)); -- cgit From 8f55ada796565ce801418bf579f31a6a522d0337 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 2 Jul 2024 22:29:52 -0700 Subject: cxl: Remove defunct code calculating host bridge target positions The CXL Spec 3.1 Table 9-22 requires that the BIOS populate the CFMWS target list in interleave target order. This means the calculations the CXL driver added to determine positions when XOR math is in use, along with the entire XOR vs Modulo call back setup is not needed. A prior patch added a common method to verify positions. Remove the now unused code related to the cxl_calc_hb_fn. 
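Conceptually, the simplified check relies only on the ordered list: position pos is served by target[pos % interleave_ways]. A trivial sketch of that mapping, with placeholder host-bridge names:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical ordered target list published by BIOS for iw = 4. */
        const char *target[] = { "HB0", "HB1", "HB2", "HB3" };
        int iw = 4;

        for (int pos = 0; pos < 8; pos++)
            printf("pos %d -> %s\n", pos, target[pos % iw]);
        return 0;
    }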
Signed-off-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/2e2c32a2d0f1007e920b58712d15edad2e48d857.1719980933.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 60 ++----------------------------------------------- drivers/cxl/core/port.c | 20 +---------------- drivers/cxl/cxl.h | 8 +------ 3 files changed, 4 insertions(+), 84 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 6b6ae9c81368..921cee3bb980 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -22,57 +22,6 @@ static const guid_t acpi_cxl_qtg_id_guid = GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071, 0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52); -/* - * Find a targets entry (n) in the host bridge interleave list. - * CXL Specification 3.0 Table 9-22 - */ -static int cxl_xor_calc_n(u64 hpa, struct cxl_cxims_data *cximsd, int iw, - int ig) -{ - int i = 0, n = 0; - u8 eiw; - - /* IW: 2,4,6,8,12,16 begin building 'n' using xormaps */ - if (iw != 3) { - for (i = 0; i < cximsd->nr_maps; i++) - n |= (hweight64(hpa & cximsd->xormaps[i]) & 1) << i; - } - /* IW: 3,6,12 add a modulo calculation to 'n' */ - if (!is_power_of_2(iw)) { - if (ways_to_eiw(iw, &eiw)) - return -1; - hpa &= GENMASK_ULL(51, eiw + ig); - n |= do_div(hpa, 3) << i; - } - return n; -} - -static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos) -{ - struct cxl_cxims_data *cximsd = cxlrd->platform_data; - struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; - struct cxl_decoder *cxld = &cxlsd->cxld; - int ig = cxld->interleave_granularity; - int iw = cxld->interleave_ways; - int n = 0; - u64 hpa; - - if (dev_WARN_ONCE(&cxld->dev, - cxld->interleave_ways != cxlsd->nr_targets, - "misconfigured root decoder\n")) - return NULL; - - hpa = cxlrd->res->start + pos * ig; - - /* Entry (n) is 0 for no interleave (iw == 1) */ - if (iw != 1) - n = cxl_xor_calc_n(hpa, cximsd, iw, ig); - - if (n < 0) - return NULL; - - return cxlrd->cxlsd.target[n]; -} static u64 cxl_xor_hpa_to_spa(struct cxl_root_decoder *cxlrd, u64 hpa) { @@ -398,7 +347,6 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, struct cxl_port *root_port = ctx->root_port; struct cxl_cxims_context cxims_ctx; struct device *dev = ctx->dev; - cxl_calc_hb_fn cxl_calc_hb; struct cxl_decoder *cxld; unsigned int ways, i, ig; int rc; @@ -426,13 +374,9 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, if (rc) return rc; - if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_MODULO) - cxl_calc_hb = cxl_hb_modulo; - else - cxl_calc_hb = cxl_hb_xor; - struct cxl_root_decoder *cxlrd __free(put_cxlrd) = - cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb); + cxl_root_decoder_alloc(root_port, ways); + if (IS_ERR(cxlrd)) return PTR_ERR(cxlrd); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 887ed6e358fb..bc9e18d02598 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1733,21 +1733,6 @@ static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, return 0; } -struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos) -{ - struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; - struct cxl_decoder *cxld = &cxlsd->cxld; - int iw; - - iw = cxld->interleave_ways; - if (dev_WARN_ONCE(&cxld->dev, iw != cxlsd->nr_targets, - "misconfigured root decoder\n")) - return NULL; - - return cxlrd->cxlsd.target[pos % iw]; -} -EXPORT_SYMBOL_NS_GPL(cxl_hb_modulo, CXL); - static struct lock_class_key cxl_decoder_key; /** @@ -1807,7 +1792,6 
@@ static int cxl_switch_decoder_init(struct cxl_port *port, * cxl_root_decoder_alloc - Allocate a root level decoder * @port: owning CXL root of this decoder * @nr_targets: static number of downstream targets - * @calc_hb: which host bridge covers the n'th position by granularity * * Return: A new cxl decoder to be registered by cxl_decoder_add(). A * 'CXL root' decoder is one that decodes from a top-level / static platform @@ -1815,8 +1799,7 @@ static int cxl_switch_decoder_init(struct cxl_port *port, * topology. */ struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, - unsigned int nr_targets, - cxl_calc_hb_fn calc_hb) + unsigned int nr_targets) { struct cxl_root_decoder *cxlrd; struct cxl_switch_decoder *cxlsd; @@ -1838,7 +1821,6 @@ struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, return ERR_PTR(rc); } - cxlrd->calc_hb = calc_hb; mutex_init(&cxlrd->range_lock); cxld = &cxlsd->cxld; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index dfc4f27d195c..e27a0cd4f107 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -432,15 +432,12 @@ struct cxl_switch_decoder { }; struct cxl_root_decoder; -typedef struct cxl_dport *(*cxl_calc_hb_fn)(struct cxl_root_decoder *cxlrd, - int pos); typedef u64 (*cxl_hpa_to_spa_fn)(struct cxl_root_decoder *cxlrd, u64 hpa); /** * struct cxl_root_decoder - Static platform CXL address decoder * @res: host / parent resource for region allocations * @region_id: region id for next region provisioning event - * @calc_hb: which host bridge covers the n'th position by granularity * @hpa_to_spa: translate CXL host-physical-address to Platform system-physical-address * @platform_data: platform specific configuration data * @range_lock: sync region autodiscovery by address range @@ -450,7 +447,6 @@ typedef u64 (*cxl_hpa_to_spa_fn)(struct cxl_root_decoder *cxlrd, u64 hpa); struct cxl_root_decoder { struct resource *res; atomic_t region_id; - cxl_calc_hb_fn calc_hb; cxl_hpa_to_spa_fn hpa_to_spa; void *platform_data; struct mutex range_lock; @@ -775,9 +771,7 @@ bool is_root_decoder(struct device *dev); bool is_switch_decoder(struct device *dev); bool is_endpoint_decoder(struct device *dev); struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, - unsigned int nr_targets, - cxl_calc_hb_fn calc_hb); -struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos); + unsigned int nr_targets); struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, unsigned int nr_targets); int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map); -- cgit From 745d9f4a31defec731119ee8aad8ba9f2536dd9a Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Thu, 29 Feb 2024 23:42:36 +0300 Subject: ubi: eba: properly rollback inside self_check_eba In case of a memory allocation failure in the volumes loop we can only process the already allocated scan_eba and fm_eba array elements on the error path - others are still uninitialized. Found by Linux Verification Center (linuxtesting.org). 
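The error-path rule this fix restores can be shown with a minimal user-space sketch (not the kernel code; alloc_pairs() and the a/b arrays are hypothetical stand-ins for scan_eba/fm_eba): when an allocation fails mid-loop, free the partner allocated in the same iteration, then walk back only over iterations that fully completed.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the paired allocations in self_check_eba(). */
static int alloc_pairs(int **a, int **b, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		a[i] = malloc(16 * sizeof(int));
		if (!a[i])
			goto out_free;
		b[i] = malloc(16 * sizeof(int));
		if (!b[i]) {
			free(a[i]);	/* partner from this iteration */
			goto out_free;
		}
	}
	return 0;

out_free:
	while (--i >= 0) {	/* only fully initialized iterations */
		free(a[i]);
		free(b[i]);
	}
	return -1;
}

int main(void)
{
	int *a[4], *b[4];
	int i;

	if (alloc_pairs(a, b, 4) == 0)
		for (i = 0; i < 4; i++) {
			free(a[i]);
			free(b[i]);
		}
	return 0;
}

A plain "for (i = 0; i < n; i++)" over all n slots on the error path would touch uninitialized pointers, which is exactly what the change below avoids.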
Fixes: 00abf3041590 ("UBI: Add self_check_eba()") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/eba.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index e5ac3cd0bbae..c7ba7a15c9f7 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -1564,6 +1564,7 @@ int self_check_eba(struct ubi_device *ubi, struct ubi_attach_info *ai_fastmap, GFP_KERNEL); if (!fm_eba[i]) { ret = -ENOMEM; + kfree(scan_eba[i]); goto out_free; } @@ -1599,7 +1600,7 @@ int self_check_eba(struct ubi_device *ubi, struct ubi_attach_info *ai_fastmap, } out_free: - for (i = 0; i < num_volumes; i++) { + while (--i >= 0) { if (!ubi->volumes[i]) continue; -- cgit From 299af26eb46374295a3289be990e536b75c9376a Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 16:35:38 -0300 Subject: mtd: ubi: make ubi_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the ubi_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Zhihao Cheng Reviewed-by: Miquel Raynal Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/build.c | 2 +- drivers/mtd/ubi/ubi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index a7e3a6246c0e..952c80269f57 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -112,7 +112,7 @@ static struct attribute *ubi_class_attrs[] = { ATTRIBUTE_GROUPS(ubi_class); /* Root UBI "class" object (corresponds to '//class/ubi/') */ -struct class ubi_class = { +const struct class ubi_class = { .name = UBI_NAME_STR, .class_groups = ubi_class_groups, }; diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 32009a24869e..1aead55090d0 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -814,7 +814,7 @@ extern struct kmem_cache *ubi_wl_entry_slab; extern const struct file_operations ubi_ctrl_cdev_operations; extern const struct file_operations ubi_cdev_operations; extern const struct file_operations ubi_vol_cdev_operations; -extern struct class ubi_class; +extern const struct class ubi_class; extern struct mutex ubi_devices_mutex; extern struct blocking_notifier_head ubi_notifiers; -- cgit From 02096a0cf150fc8dc1cfe39afcd768d06b70e722 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 13 Mar 2024 09:46:52 +0100 Subject: mtd: ubi: avoid expensive do_div() on 32-bit machines The use of do_div() in ubi_nvmem_reg_read() makes calling it on 32-bit machines rather expensive. Since the 'from' variable is known to be a 32-bit quantity, it is clearly never needed and can be optimized into a regular division operation. 
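A small user-space illustration of the arithmetic (the LEB size below is just an example value): once 'from' is kept as a 32-bit quantity, an ordinary '/' and '%' pair gives the same lnum/offs split that do_div() produced, without pulling in the 64-by-32 division helper on 32-bit machines.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t from = 300000;		/* NVMEM offset, a 32-bit quantity */
	uint32_t leb_size = 126976;	/* example usable LEB size */

	/* Same results the old do_div(lnum, leb_size) call produced,
	 * computed with plain 32-bit arithmetic. */
	uint32_t lnum = from / leb_size;
	uint32_t offs = from % leb_size;

	printf("lnum=%u offs=%u\n", lnum, offs);	/* lnum=2 offs=46048 */
	return 0;
}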
Fixes: b8a77b9a5f9c ("mtd: ubi: fix NVMEM over UBI volumes on 32-bit systems") Fixes: 3ce485803da1 ("mtd: ubi: provide NVMEM layer over UBI volumes") Signed-off-by: Arnd Bergmann Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/nvmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/ubi/nvmem.c b/drivers/mtd/ubi/nvmem.c index 8aeb9c428e51..a94a1a9aaec1 100644 --- a/drivers/mtd/ubi/nvmem.c +++ b/drivers/mtd/ubi/nvmem.c @@ -6,7 +6,6 @@ /* UBI NVMEM provider */ #include "ubi.h" #include -#include /* List of all NVMEM devices */ static LIST_HEAD(nvmem_devices); @@ -27,14 +26,15 @@ static int ubi_nvmem_reg_read(void *priv, unsigned int from, struct ubi_nvmem *unv = priv; struct ubi_volume_desc *desc; uint32_t offs; - uint64_t lnum = from; + uint32_t lnum; int err = 0; desc = ubi_open_volume(unv->ubi_num, unv->vol_id, UBI_READONLY); if (IS_ERR(desc)) return PTR_ERR(desc); - offs = do_div(lnum, unv->usable_leb_size); + offs = from % unv->usable_leb_size; + lnum = from / unv->usable_leb_size; while (bytes_left) { to_read = unv->usable_leb_size - offs; -- cgit From 02eb1846b77d2896903e94b966ba632cf48d39e0 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:43 +0800 Subject: ubifs: Fix unattached xattr inode if powercut happens after deleting When powercut happens after deleting file, the xattr inode could be alone existing in TNC but its' xattr entry cannot be found in TNC. File inode and xattr inode are added into orphan list after deleting file, file inode's nlink is 0 but xattr inode's nlink is not 0 (PS: zero nlink xattr inode is written on disk in evicting process by ubifs_jnl_write_inode). So, following process could happen: 1. touch file 2. setxattr(file) 3. unlink file // inode(nlink=0), xattr inode(nlink=1) are added into orphan list 4. commit // write inode inum and xattr inum into orphan area 5. powercut 6. mount do_kill_orphans // inode(nlink=0) is deleted from TNC by ubifs_tnc_remove_range, // xattr entry is deleted too. // xattr inode(nlink=1) is not deleted from TNC Finally we could see following error while debugging UBIFS: UBIFS error (ubi0:0 pid 1093): dbg_check_filesystem [ubifs]: inode 66 nlink is 1, but calculated nlink is 0 UBIFS (ubi0:0): dump of the inode 66 sitting in LEB 12:2128 node_type 0 (inode node) group_type 1 (in node group) len 197 key (66, inode) size 37 nlink 1 flags 0x20 xattr_cnt 0 xattr_size 0 xattr_names 0 data len 37 Fix it by removing entire inode with it's xattrs while replaying orphan, just replace function ubifs_tnc_remove_range by ubifs_tnc_remove_ino. 
Fixes: ee1438ce5dc4 ("ubifs: Check link count of inodes when killing orphans.") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218661 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/orphan.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 4909321d84cf..ddeb125e6930 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -691,12 +691,12 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; for (i = 0; i < n; i++) { - union ubifs_key key1, key2; + union ubifs_key key; inum = le64_to_cpu(orph->inos[i]); - ino_key_init(c, &key1, inum); - err = ubifs_tnc_lookup(c, &key1, ino); + ino_key_init(c, &key, inum); + err = ubifs_tnc_lookup(c, &key, ino); if (err && err != -ENOENT) goto out_free; @@ -708,10 +708,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, dbg_rcvry("deleting orphaned inode %lu", (unsigned long)inum); - lowest_ino_key(c, &key1, inum); - highest_ino_key(c, &key2, inum); - - err = ubifs_tnc_remove_range(c, &key1, &key2); + err = ubifs_tnc_remove_ino(c, inum); if (err) goto out_ro; } -- cgit From 354c1796639a743b49bd13549ec0ec6ca121b85e Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:44 +0800 Subject: ubifs: Don't add xattr inode into orphan area Now, the entire inode with its' xattrs are removed while replaying orphan nodes. There is no need to add xattr inodes into orphan area, which is based on the fact that xattr entries won't be cleared from disk before deleting xattr inodes, in another words, current logic can make sure that xattr inode be deleted in any cases even UBIFS not record xattr inode into orphan area. Let's looking for possible paths that could clear xattr entries from disk but leave the xattr inode on TNC: 1. unlink/tmpfile -> ubifs_jnl_update: inode(nlink=0) is written into bud LEB and added into orphan list, then: a. powercut: ubifs_tnc_remove_ino(xattr entry/inode can be found from TNC and being deleted) is invoked in replaying journal. b. commit + powercut: inode is written into orphan area, and ubifs_tnc_remove_ino is invoked in replaying orphan nodes. c. evicting + powercut: xattr inode(nlink=0) is written on disk, xattr is removed from TNC, gc could clear xattr entries from disk. ubifs_tnc_remove_ino will apply on inode and xattr inode in replaying journal, so lost xattr entries will make no influence. d. evicting + commit + powercut: xattr inode/entry are removed from index tree(on disk) by ubifs_jnl_write_inode, xattr inode is cleared from orphan area by ubifs_jnl_write_inode + commit. e. commit + evicting + powercut: inode is written into orphan area, then equivalent to c. 2. remove xattr -> ubifs_jnl_delete_xattr: xattr entry(inum=0) and xattr inode(nlink=0) is written into bud LEB, xattr entry/inode are removed from TNC, then: a. powercut: gc could clear xattr entries from disk, which won't affect deleting xattr entry from TNC. ubifs_tnc_remove_ino will apply on xattr inode in replaying journal, ubifs_tnc_remove_nm will apply on xattr entry in replaying journal. b. commit + powercut: xattr entry/inode are removed from index tree (on disk). Tracking xattr inode in orphan list is imported by commit 988bec41318f3f ("ubifs: orphan: Handle xattrs like files"), it aims to fix the similar problem described in commit 7959cf3a7506d4a ("ubifs: journal: Handle xattrs like files"). 
Actually, the problem only exist in journal case but not the orphan case. So, we can remove the orphan tracking for xattr inodes. Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/orphan.c | 85 +++++++++---------------------------------------------- fs/ubifs/ubifs.h | 3 -- 2 files changed, 14 insertions(+), 74 deletions(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index ddeb125e6930..88fbf331ad8c 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -42,24 +42,30 @@ static int dbg_check_orphans(struct ubifs_info *c); -static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum, - struct ubifs_orphan *parent_orphan) +/** + * ubifs_add_orphan - add an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Add an orphan. This function is called when an inodes link count drops to + * zero. + */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) { struct ubifs_orphan *orphan, *o; struct rb_node **p, *parent = NULL; orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS); if (!orphan) - return ERR_PTR(-ENOMEM); + return -ENOMEM; orphan->inum = inum; orphan->new = 1; - INIT_LIST_HEAD(&orphan->child_list); spin_lock(&c->orphan_lock); if (c->tot_orphans >= c->max_orphans) { spin_unlock(&c->orphan_lock); kfree(orphan); - return ERR_PTR(-ENFILE); + return -ENFILE; } p = &c->orph_tree.rb_node; while (*p) { @@ -73,7 +79,7 @@ static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum, ubifs_err(c, "orphaned twice"); spin_unlock(&c->orphan_lock); kfree(orphan); - return ERR_PTR(-EINVAL); + return -EINVAL; } } c->tot_orphans += 1; @@ -83,14 +89,9 @@ static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum, list_add_tail(&orphan->list, &c->orph_list); list_add_tail(&orphan->new_list, &c->orph_new); - if (parent_orphan) { - list_add_tail(&orphan->child_list, - &parent_orphan->child_list); - } - spin_unlock(&c->orphan_lock); dbg_gen("ino %lu", (unsigned long)inum); - return orphan; + return 0; } static struct ubifs_orphan *lookup_orphan(struct ubifs_info *c, ino_t inum) @@ -144,59 +145,6 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) __orphan_drop(c, orph); } -/** - * ubifs_add_orphan - add an orphan. - * @c: UBIFS file-system description object - * @inum: orphan inode number - * - * Add an orphan. This function is called when an inodes link count drops to - * zero. - */ -int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) -{ - int err = 0; - ino_t xattr_inum; - union ubifs_key key; - struct ubifs_dent_node *xent, *pxent = NULL; - struct fscrypt_name nm = {0}; - struct ubifs_orphan *xattr_orphan; - struct ubifs_orphan *orphan; - - orphan = orphan_add(c, inum, NULL); - if (IS_ERR(orphan)) - return PTR_ERR(orphan); - - lowest_xent_key(c, &key, inum); - while (1) { - xent = ubifs_tnc_next_ent(c, &key, &nm); - if (IS_ERR(xent)) { - err = PTR_ERR(xent); - if (err == -ENOENT) - break; - kfree(pxent); - return err; - } - - fname_name(&nm) = xent->name; - fname_len(&nm) = le16_to_cpu(xent->nlen); - xattr_inum = le64_to_cpu(xent->inum); - - xattr_orphan = orphan_add(c, xattr_inum, orphan); - if (IS_ERR(xattr_orphan)) { - kfree(pxent); - kfree(xent); - return PTR_ERR(xattr_orphan); - } - - kfree(pxent); - pxent = xent; - key_read(c, &xent->key, &key); - } - kfree(pxent); - - return 0; -} - /** * ubifs_delete_orphan - delete an orphan. 
* @c: UBIFS file-system description object @@ -206,7 +154,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) */ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) { - struct ubifs_orphan *orph, *child_orph, *tmp_o; + struct ubifs_orphan *orph; spin_lock(&c->orphan_lock); @@ -219,11 +167,6 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) return; } - list_for_each_entry_safe(child_orph, tmp_o, &orph->child_list, child_list) { - list_del(&child_orph->child_list); - orphan_delete(c, child_orph); - } - orphan_delete(c, orph); spin_unlock(&c->orphan_lock); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 1f3ea879d93a..821c6ae99e62 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -923,8 +923,6 @@ struct ubifs_budget_req { * @rb: rb-tree node of rb-tree of orphans sorted by inode number * @list: list head of list of orphans in order added * @new_list: list head of list of orphans added since the last commit - * @child_list: list of xattr children if this orphan hosts xattrs, list head - * if this orphan is a xattr, not used otherwise. * @cnext: next orphan to commit * @dnext: next orphan to delete * @inum: inode number @@ -936,7 +934,6 @@ struct ubifs_orphan { struct rb_node rb; struct list_head list; struct list_head new_list; - struct list_head child_list; struct ubifs_orphan *cnext; struct ubifs_orphan *dnext; ino_t inum; -- cgit From 7bed61a1cf166b5c113047fc8f60ff22dcb04893 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:45 +0800 Subject: Revert "ubifs: ubifs_symlink: Fix memleak of inode->i_link in error path" This reverts commit 6379b44cdcd67f5f5d986b73953e99700591edfa. Commit 1e022216dcd2 ("ubifs: ubifs_symlink: Fix memleak of inode->i_link in error path") is applied again in commit 6379b44cdcd6 ("ubifs: ubifs_symlink: Fix memleak of inode->i_link in error path"), which changed ubifs_mknod (It won't become a real problem). Just revert it. Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index eac0fef801f1..551148de66cd 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1133,8 +1133,6 @@ out_cancel: dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); out_inode: - /* Free inode->i_link before inode is marked as bad. */ - fscrypt_free_inode(inode); make_bad_inode(inode); iput(inode); out_fname: -- cgit From 6376d7503b02796cc6109198d555ddebfb4f5f81 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:46 +0800 Subject: ubifs: Remove insert_dead_orphan from replaying orphan process UBIFS will do commit at the end of mounting process(rw mode), dead orphans(added by insert_dead_orphan in replaying orphan) are deleted by ubifs_orphan_end_commit(). The only reason why dead orphans are added into orphan list is that old orpans may be lost when powercut happens in ubifs_orphan_end_commit(): ubifs_orphan_end_commit // TNC(updated by orphans) is not written yet if (c->cmt_orphans != 0) commit_orphans consolidate // traverse orphan list write_orph_nodes // rewrite all orphans by ubifs_leb_change // If dead orphans are not in list, they will be lost when powercut // happens, then TNC won't be updated by old orphans in next mounting. Luckily, the condition 'c->cmt_orphans != 0' will never be true in mounting process, there can't be new orphans added into orphan list before mounting returned, but commit will be done at the end of mounting. 
Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/orphan.c | 49 ------------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 88fbf331ad8c..6e843e8fc3db 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -513,51 +513,6 @@ int ubifs_clear_orphans(struct ubifs_info *c) return 0; } -/** - * insert_dead_orphan - insert an orphan. - * @c: UBIFS file-system description object - * @inum: orphan inode number - * - * This function is a helper to the 'do_kill_orphans()' function. The orphan - * must be kept until the next commit, so it is added to the rb-tree and the - * deletion list. - */ -static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) -{ - struct ubifs_orphan *orphan, *o; - struct rb_node **p, *parent = NULL; - - orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL); - if (!orphan) - return -ENOMEM; - orphan->inum = inum; - - p = &c->orph_tree.rb_node; - while (*p) { - parent = *p; - o = rb_entry(parent, struct ubifs_orphan, rb); - if (inum < o->inum) - p = &(*p)->rb_left; - else if (inum > o->inum) - p = &(*p)->rb_right; - else { - /* Already added - no problem */ - kfree(orphan); - return 0; - } - } - c->tot_orphans += 1; - rb_link_node(&orphan->rb, parent, p); - rb_insert_color(&orphan->rb, &c->orph_tree); - list_add_tail(&orphan->list, &c->orph_list); - orphan->del = 1; - orphan->dnext = c->orph_dnext; - c->orph_dnext = orphan; - dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, - c->new_orphans, c->tot_orphans); - return 0; -} - /** * do_kill_orphans - remove orphan inodes from the index. * @c: UBIFS file-system description object @@ -655,10 +610,6 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (err) goto out_ro; } - - err = insert_dead_orphan(c, inum); - if (err) - goto out_free; } *last_cmt_no = cmt_no; -- cgit From 7efc34b53b3950ad4fc98b5d25210bae80ab0fde Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:47 +0800 Subject: ubifs: Fix adding orphan entry twice for the same inode The tmpfile could be added into orphan list twice, first time is creation, the second time is removing after it is linked. The orphan entry could be added twice for tmpfile if following sequence is satisfied: ubifs_tmpfile ubifs_jnl_update ubifs_add_orphan // first time to add orphan entry P1 P2 ubifs_link do_commit ubifs_orphan_start_commit orphan->cmt = 1 ubifs_delete_orphan orphan_delete if (orph->cmt) orph->del = 1; // orphan entry is not deleted from tree return ubifs_unlink ubifs_jnl_update ubifs_add_orphan orphan_add // found old orphan entry, second time to add orphan entry ubifs_err(c, "orphaned twice") return -EINVAL // unlink failed! ubifs_orphan_end_commit erase_deleted // delete old orphan entry rb_erase(&orphan->rb, &c->orph_tree) Fix it by removing orphan entry from orphan tree in advance, rather than remove it from orphan tree in committing process. 
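The design the fix moves to is easy to see in a user-space sketch (a plain list stands in for the rb-tree, and all names here are hypothetical): a deferred deletion still removes the entry from the lookup structure immediately, so re-adding the same inode number during the commit window succeeds instead of hitting the duplicate check.

#include <stdio.h>
#include <stdlib.h>

struct orphan {
	unsigned long inum;
	struct orphan *next;	/* lookup index linkage (rb-tree analogue) */
	struct orphan *dnext;	/* deferred-deletion list (orph_dnext analogue) */
};

static struct orphan *index_head;
static struct orphan *deferred_head;

static struct orphan *lookup(unsigned long inum)
{
	struct orphan *o;

	for (o = index_head; o; o = o->next)
		if (o->inum == inum)
			return o;
	return NULL;
}

static int add_orphan(unsigned long inum)
{
	struct orphan *o;

	if (lookup(inum))
		return -1;	/* "orphaned twice" */
	o = calloc(1, sizeof(*o));
	if (!o)
		return -1;
	o->inum = inum;
	o->next = index_head;
	index_head = o;
	return 0;
}

static void delete_orphan(unsigned long inum)
{
	struct orphan **p = &index_head, *o;

	while (*p && (*p)->inum != inum)
		p = &(*p)->next;
	if (!*p)
		return;
	o = *p;
	*p = o->next;			/* drop from the index right away */
	o->dnext = deferred_head;	/* free the memory only at commit end */
	deferred_head = o;
}

static void commit_end(void)
{
	while (deferred_head) {
		struct orphan *o = deferred_head;

		deferred_head = o->dnext;
		free(o);
	}
}

int main(void)
{
	add_orphan(42);				/* tmpfile creation */
	delete_orphan(42);			/* link() drops the orphan (deferred) */
	printf("re-add: %d\n", add_orphan(42));	/* 0, no "orphaned twice" */
	commit_end();
	delete_orphan(42);			/* later unlink() of the linked file */
	commit_end();
	return 0;
}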
Fixes: 32fe905c17f0 ("ubifs: Fix O_TMPFILE corner case in ubifs_link()") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218672 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/orphan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 6e843e8fc3db..37d206097112 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -136,6 +136,7 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) if (orph->cmt) { orph->del = 1; + rb_erase(&orph->rb, &c->orph_tree); orph->dnext = c->orph_dnext; c->orph_dnext = orph; dbg_gen("delete later ino %lu", (unsigned long)orph->inum); @@ -461,7 +462,6 @@ static void erase_deleted(struct ubifs_info *c) dnext = orphan->dnext; ubifs_assert(c, !orphan->new); ubifs_assert(c, orphan->del); - rb_erase(&orphan->rb, &c->orph_tree); list_del(&orphan->list); c->tot_orphans -= 1; dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum); -- cgit From 9f5ecacfcee2bda21b100399a9130248d55c2a0c Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:48 +0800 Subject: ubifs: Move ui->data initialization after initializing security Host inode and its' xattr will be written on disk after initializing security when creating symlink or dev, then the host inode and its dentry will be written again in ubifs_jnl_update. There is no need to write inode data in the security initialization pass, just move the ui->data initialization after initializing security. Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 551148de66cd..848988f2b7dc 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1102,16 +1102,18 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, goto out_fname; } + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) { + kfree(dev); + goto out_inode; + } + init_special_inode(inode, inode->i_mode, rdev); inode->i_size = ubifs_inode(inode)->ui_size = devlen; ui = ubifs_inode(inode); ui->data = dev; ui->data_len = devlen; - err = ubifs_init_security(dir, inode, &dentry->d_name); - if (err) - goto out_inode; - mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; @@ -1184,6 +1186,10 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out_fname; } + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + ui = ubifs_inode(inode); ui->data = kmalloc(disk_link.len, GFP_NOFS); if (!ui->data) { @@ -1209,10 +1215,6 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, ui->data_len = disk_link.len - 1; inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1; - err = ubifs_init_security(dir, inode, &dentry->d_name); - if (err) - goto out_inode; - mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; -- cgit From b25e6a5f78e84598ec62552ef342a58a23b6f7c4 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:49 +0800 Subject: ubifs: Fix space leak when powercut happens in linking tmpfile There is a potential space leak problem when powercut happens in linking tmpfile, in which case, inode node (with nlink=0) and its' data nodes can be found from tnc (on flash), but there are no dentries related to the inode, so the file is invisible but takes free space. 
Detailed process is shown as: ubifs_tmpfile ubifs_jnl_update // Add bud A into log area ubifs_add_orphan // Add inode into orphan list P1 P2 ubifs_link ubifs_delete_orphan // Delete inode from orphan list, then inode won't // be written into orphan area, there is no chance // to delete inode by replaying orphan. commit // bud A won't be replayed in next mounting >> powercut << ubifs_jnl_update // Link inode to dentry The root cause is that orphan entry deletion and journal writing(for link) are interrupted by commit, which makes the two operations are not atomic. Fix it by doing ubifs_delete_orphan under the protection of c->commit_sem within ubifs_jnl_update. This is also a preparation to support all creating new files by orphan inode. v1 is https://lore.kernel.org/linux-mtd/20200701093227.674945-1-chengzhihao1@huawei.com/ Fixes: 32fe905c17f0 ("ubifs: Fix O_TMPFILE corner case in ubifs_link()") Link: https://bugzilla.kernel.org/show_bug.cgi?id=208405 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 22 ++++++++-------------- fs/ubifs/journal.c | 6 +++++- fs/ubifs/ubifs.h | 2 +- fs/ubifs/xattr.c | 2 +- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 848988f2b7dc..fe16443243ab 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -325,7 +325,7 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -479,7 +479,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, mutex_unlock(&ui->ui_mutex); lock_2_inodes(dir, inode); - err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -760,10 +760,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, lock_2_inodes(dir, inode); - /* Handle O_TMPFILE corner case, it is allowed to link a O_TMPFILE. 
*/ - if (inode->i_nlink == 0) - ubifs_delete_orphan(c, inode->i_ino); - inc_nlink(inode); ihold(inode); inode_set_ctime_current(inode); @@ -771,7 +767,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, inode->i_nlink == 1); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -785,8 +781,6 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; drop_nlink(inode); - if (inode->i_nlink == 0) - ubifs_add_orphan(c, inode->i_ino); unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); iput(inode); @@ -846,7 +840,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -950,7 +944,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -1025,7 +1019,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); if (err) { ubifs_err(c, "cannot create directory, error %d", err); goto out_cancel; @@ -1119,7 +1113,7 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -1220,7 +1214,7 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 74aee92433d7..2b4b05c2d9b2 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -643,6 +643,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) * @inode: inode to update * @deletion: indicates a directory entry deletion i.e unlink or rmdir * @xent: non-zero if the directory entry is an extended attribute entry + * @delete_orphan: indicates an orphan entry deletion for @inode * * This function updates an inode by writing a directory entry (or extended * attribute entry), the inode itself, and the parent directory inode (or the @@ -664,7 +665,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, - int deletion, int xent) + int deletion, int xent, int delete_orphan) { int err, dlen, 
ilen, len, lnum, ino_offs, dent_offs, orphan_added = 0; int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); @@ -806,6 +807,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, if (err) goto out_ro; + if (delete_orphan) + ubifs_delete_orphan(c, inode->i_ino); + finish_reservation(c); spin_lock(&ui->ui_lock); ui->synced_i_size = ui->ui_size; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 821c6ae99e62..d0ed78345617 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1800,7 +1800,7 @@ int ubifs_consolidate_log(struct ubifs_info *c); /* journal.c */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, - int deletion, int xent); + int deletion, int xent, int delete_orphan); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 0847db521984..f734588b224a 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -149,7 +149,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (strcmp(fname_name(nm), UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0) host_ui->flags |= UBIFS_CRYPT_FL; - err = ubifs_jnl_update(c, host, nm, inode, 0, 1); + err = ubifs_jnl_update(c, host, nm, inode, 0, 1, 0); if (err) goto out_cancel; ubifs_set_inode_flags(host); -- cgit From 3af2d3a8c56fe7dc24f60c4df0ab85b7ac941902 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:50 +0800 Subject: ubifs: Fix unattached inode when powercut happens in creating For selinux or encryption scenarios, UBIFS could become inconsistent while creating new files in powercut case. Encryption/selinux related xattrs will be created before creating file dentry, which makes creation process is not atomic, details are shown as: Encryption case: ubifs_create ubifs_new_inode fscrypt_set_context ubifs_xattr_set create_xattr ubifs_jnl_update // Disk: xentry xinode inode(LAST_OF_NODE_GROUP) >> power cut << ubifs_jnl_update // Disk: dentry inode parent_inode(LAST_OF_NODE_GROUP) Selinux case: ubifs_create ubifs_new_inode ubifs_init_security security_inode_init_security ubifs_xattr_set create_xattr ubifs_jnl_update // Disk: xentry xinode inode(LAST_OF_NODE_GROUP) >> power cut << ubifs_jnl_update // Disk: dentry inode parent_inode(LAST_OF_NODE_GROUP) Above process will make chk_fs failed in next mounting: UBIFS error (ubi0:0 pid 7995): dbg_check_filesystem [ubifs]: inode 66 nlink is 1, but calculated nlink is 0 Fix it by allocating orphan inode for each non-xattr file creation, then removing orphan list in journal writing process, which ensures that both xattr and dentry be effective in atomic when powercut happens. 
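The intended ordering can be modelled with a toy user-space simulation (the struct and helpers below are invented for illustration and are not UBIFS structures): the inode starts life as a zero-nlink orphan, the step that writes the dentry also clears the orphan, and replay after a simulated powercut discards anything still marked orphan, so the selinux/encryption xattrs written early never leave a dangling inode behind.

#include <stdbool.h>
#include <stdio.h>

/* Invented model, not UBIFS data structures. */
struct toy_inode {
	int inum;
	int nlink;
	bool on_disk;		/* inode (and its xattrs) written to the journal */
	bool has_dentry;
	bool orphan;		/* recorded in the orphan area */
};

static void write_xattrs(struct toy_inode *ino)
{
	ino->on_disk = true;	/* xattr + inode reach the disk first */
}

static void journal_update(struct toy_inode *ino)
{
	ino->has_dentry = true;	/* dentry written ... */
	ino->nlink = 1;
	ino->orphan = false;	/* ... and the orphan cleared in the same step */
}

static void replay_orphans(struct toy_inode *ino)
{
	if (ino->orphan && ino->on_disk)
		ino->on_disk = false;	/* recovery removes the whole inode */
}

int main(void)
{
	struct toy_inode a = { .inum = 65, .nlink = 0, .orphan = true };
	struct toy_inode b = { .inum = 66, .nlink = 0, .orphan = true };

	write_xattrs(&a);
	/* >> powercut before the dentry is written << */
	replay_orphans(&a);
	printf("crashed create: on_disk=%d\n", a.on_disk);	/* 0, no leak */

	write_xattrs(&b);
	journal_update(&b);
	printf("normal create: nlink=%d orphan=%d\n", b.nlink, b.orphan);
	return 0;
}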
Fixes: d7f0b70d30ff ("UBIFS: Add security.* XATTR support for the UBIFS") Fixes: d475a507457b ("ubifs: Add skeleton for fscrypto") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218309 Suggested-by: Zhang Yi Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 59 ++++++++++++++++++++++++++++++++++++------------------ fs/ubifs/journal.c | 14 ++++++++----- fs/ubifs/ubifs.h | 4 ++-- 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index fe16443243ab..c77ea57fe696 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -71,8 +71,13 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @is_xattr: whether the inode is xattr inode * * This function finds an unused inode number, allocates new inode and - * initializes it. Returns new inode in case of success and an error code in - * case of failure. + * initializes it. Non-xattr new inode may be written with xattrs(selinux/ + * encryption) before writing dentry, which could cause inconsistent problem + * when powercut happens between two operations. To deal with it, non-xattr + * new inode is initialized with zero-nlink and added into orphan list, caller + * should make sure that inode is relinked later, and make sure that orphan + * removing and journal writing into an committing atomic operation. Returns + * new inode in case of success and an error code in case of failure. */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode, bool is_xattr) @@ -163,9 +168,25 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, ui->creat_sqnum = ++c->max_sqnum; spin_unlock(&c->cnt_lock); + if (!is_xattr) { + set_nlink(inode, 0); + err = ubifs_add_orphan(c, inode->i_ino); + if (err) { + ubifs_err(c, "ubifs_add_orphan failed: %i", err); + goto out_iput; + } + down_read(&c->commit_sem); + ui->del_cmtno = c->cmt_no; + up_read(&c->commit_sem); + } + if (encrypted) { err = fscrypt_set_context(inode, NULL); if (err) { + if (!is_xattr) { + set_nlink(inode, 1); + ubifs_delete_orphan(c, inode->i_ino); + } ubifs_err(c, "fscrypt_set_context failed: %i", err); goto out_iput; } @@ -320,12 +341,13 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, if (err) goto out_inode; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -340,8 +362,8 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -386,7 +408,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) return inode; out_inode: - make_bad_inode(inode); iput(inode); out_free: ubifs_err(c, "cannot create whiteout file, error %d", err); @@ -470,6 +491,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (err) goto out_inode; + set_nlink(inode, 1); mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); d_tmpfile(file, inode); @@ -479,7 +501,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, mutex_unlock(&ui->ui_mutex); lock_2_inodes(dir, inode); - err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0); + err = ubifs_jnl_update(c, 
dir, &nm, inode, 1, 0, 1); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -492,7 +514,6 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, out_cancel: unlock_2_inodes(dir, inode); out_inode: - make_bad_inode(inode); if (!instantiated) iput(inode); out_budg: @@ -1011,6 +1032,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (err) goto out_inode; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); insert_inode_hash(inode); inc_nlink(inode); @@ -1019,7 +1041,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) { ubifs_err(c, "cannot create directory, error %d", err); goto out_cancel; @@ -1036,8 +1058,8 @@ out_cancel: dir_ui->ui_size = dir->i_size; drop_nlink(dir); mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -1107,13 +1129,14 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, ui = ubifs_inode(inode); ui->data = dev; ui->data_len = devlen; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -1128,8 +1151,8 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -1208,13 +1231,14 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, */ ui->data_len = disk_link.len - 1; inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -1228,10 +1252,10 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: /* Free inode->i_link before inode is marked as bad. */ fscrypt_free_inode(inode); - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -1399,14 +1423,10 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, */ err = ubifs_budget_space(c, &wht_req); if (err) { - /* - * Whiteout inode can not be written on flash by - * ubifs_jnl_write_inode(), because it's neither - * dirty nor zero-nlink. - */ iput(whiteout); goto out_release; } + set_nlink(whiteout, 1); /* Add the old_dentry size to the old_dir size. 
*/ old_sz -= CALC_DENT_SIZE(fname_len(&old_nm)); @@ -1485,7 +1505,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, } err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, - new_inode, &new_nm, whiteout, sync); + new_inode, &new_nm, whiteout, sync, !!whiteout); if (err) goto out_cancel; @@ -1538,6 +1558,7 @@ out_cancel: unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); if (whiteout) { ubifs_release_budget(c, &wht_req); + set_nlink(whiteout, 0); iput(whiteout); } out_release: diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 2b4b05c2d9b2..517f5de76b49 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -643,7 +643,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) * @inode: inode to update * @deletion: indicates a directory entry deletion i.e unlink or rmdir * @xent: non-zero if the directory entry is an extended attribute entry - * @delete_orphan: indicates an orphan entry deletion for @inode + * @in_orphan: indicates whether the @inode is in orphan list * * This function updates an inode by writing a directory entry (or extended * attribute entry), the inode itself, and the parent directory inode (or the @@ -665,7 +665,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, - int deletion, int xent, int delete_orphan) + int deletion, int xent, int in_orphan) { int err, dlen, ilen, len, lnum, ino_offs, dent_offs, orphan_added = 0; int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); @@ -751,7 +751,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, if (err) goto out_release; - if (last_reference) { + if (last_reference && !in_orphan) { err = ubifs_add_orphan(c, inode->i_ino); if (err) { release_head(c, BASEHD); @@ -807,7 +807,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, if (err) goto out_ro; - if (delete_orphan) + if (in_orphan && inode->i_nlink) ubifs_delete_orphan(c, inode->i_ino); finish_reservation(c); @@ -1340,6 +1340,7 @@ out_free: * @new_nm: new name of the new directory entry * @whiteout: whiteout inode * @sync: non-zero if the write-buffer has to be synchronized + * @delete_orphan: indicates an orphan entry deletion for @whiteout * * This function implements the re-name operation which may involve writing up * to 4 inodes(new inode, whiteout inode, old and new parent directory inodes) @@ -1352,7 +1353,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *new_dir, const struct inode *new_inode, const struct fscrypt_name *new_nm, - const struct inode *whiteout, int sync) + const struct inode *whiteout, int sync, int delete_orphan) { void *p; union ubifs_key key; @@ -1569,6 +1570,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, goto out_ro; } + if (delete_orphan) + ubifs_delete_orphan(c, whiteout->i_ino); + finish_reservation(c); if (new_inode) { mark_inode_clean(c, new_ui); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d0ed78345617..b2c6554fa857 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1800,7 +1800,7 @@ int ubifs_consolidate_log(struct ubifs_info *c); /* journal.c */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, - int deletion, int xent, int delete_orphan); + int deletion, int xent, int in_orphan); int 
ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); @@ -1817,7 +1817,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *new_dir, const struct inode *new_inode, const struct fscrypt_name *new_nm, - const struct inode *whiteout, int sync); + const struct inode *whiteout, int sync, int delete_orphan); int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, loff_t old_size, loff_t new_size); int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, -- cgit From 06776df740660e27c351d89f314d05b2720ba41f Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:51 +0800 Subject: ubifs: dbg_orphan_check: Fix missed key type checking When selinux/encryption is enabled, xattr entry node is added into TNC before host inode when creating new file. So it is possible to find xattr entry without host inode from TNC. Orphan debug checking is called by ubifs_orphan_end_commit(), at that time, the commit semaphore is already unlock, so the new creation won't be blocked. Fixes: d7f0b70d30ff ("UBIFS: Add security.* XATTR support for the UBIFS") Fixes: d475a507457b ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/orphan.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 37d206097112..fb957d963ba6 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -816,8 +816,12 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, inum = key_inum(c, &zbr->key); if (inum != ci->last_ino) { - /* Lowest node type is the inode node, so it comes first */ - if (key_type(c, &zbr->key) != UBIFS_INO_KEY) + /* + * Lowest node type is the inode node or xattr entry(when + * selinux/encryption is enabled), so it comes first + */ + if (key_type(c, &zbr->key) != UBIFS_INO_KEY && + key_type(c, &zbr->key) != UBIFS_XENT_KEY) ubifs_err(c, "found orphan node ino %lu, type %d", (unsigned long)inum, key_type(c, &zbr->key)); ci->last_ino = inum; -- cgit From 72f3d3daddd740f744a24cd7ef8c27bd0cd5489d Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 11 Apr 2024 00:42:41 +0200 Subject: mtd: ubi: Restore missing cleanup on ubi_init() failure path We need to clean-up debugfs and ubiblock if we fail after initialising them. Signed-off-by: Ben Hutchings Fixes: 927c145208b0 ("mtd: ubi: attach from device tree") Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/build.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 952c80269f57..30be4ed68fad 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -1372,7 +1372,7 @@ static int __init ubi_init(void) /* See comment above re-ubi_is_module(). 
*/ if (ubi_is_module()) - goto out_slab; + goto out_debugfs; } register_mtd_user(&ubi_mtd_notifier); @@ -1387,6 +1387,9 @@ static int __init ubi_init(void) out_mtd_notifier: unregister_mtd_user(&ubi_mtd_notifier); + ubiblock_exit(); +out_debugfs: + ubi_debugfs_exit(); out_slab: kmem_cache_destroy(ubi_wl_entry_slab); out_dev_unreg: -- cgit From 7037c96d8c42965eb93e6b96cf921399c283d431 Mon Sep 17 00:00:00 2001 From: ZhaoLong Wang Date: Thu, 18 Apr 2024 15:07:04 +0800 Subject: ubifs: correct UBIFS_DFS_DIR_LEN macro definition and improve code clarity The UBIFS_DFS_DIR_LEN macro, which defines the maximum length of the UBIFS debugfs directory name, has an incorrect formula and misleading comments. The current formula is (3 + 1 + 2*2 + 1), which assumes that both UBI device number and volume ID are limited to 2 characters. However, UBI device number ranges from 0 to 31 (2 characters), and volume ID ranges from 0 to 127 (up to 3 characters). Although the current code works due to the cancellation of mathematical errors (9 + 1 = 10, which matches the correct UBIFS_DFS_DIR_LEN value), it can lead to confusion and potential issues in the future. This patch aims to improve the code clarity and maintainability by making the following changes: 1. Corrects the UBIFS_DFS_DIR_LEN macro definition to (3 + 1 + 2 + 3 + 1), accommodating the maximum lengths of both UBI device number and volume ID, plus the separators and null terminator. 2. Updates the snprintf calls to use UBIFS_DFS_DIR_LEN instead of UBIFS_DFS_DIR_LEN + 1, removing the unnecessary +1. 3. Modifies the error checks to compare against UBIFS_DFS_DIR_LEN using >= instead of >, aligning with the corrected macro definition. 4. Removes the redundant +1 in the dfs_dir_name array definitions in ubi.h and debug.h. While these changes do not affect the runtime behavior, they make the code more readable, maintainable, and less prone to future errors. v2->v3: - Removes the duplicated UBIFS_DFS_DIR_LEN and UBIFS_DFS_DIR_NAME macro definitions in ubifs.h, as they are already defined in debug.h. 
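The sizing argument is easy to check in isolation. The snippet below is plain user-space C with the macro values from the patch (names shortened locally); it also shows why truncation must be tested with '>=': snprintf() returns the length it would have written, excluding the terminating NUL byte.

#include <stdio.h>

#define DFS_DIR_NAME "ubi%d_%d"
/* "ubi" + "_" + 2 digits for the UBI device number (0..31)
 * + 3 digits for the volume id (0..127) + trailing NUL byte */
#define DFS_DIR_LEN  (3 + 1 + 2 + 3 + 1)

int main(void)
{
	char name[DFS_DIR_LEN];
	int n;

	/* Worst case: "ubi31_127" is 9 characters + NUL = 10 bytes. */
	n = snprintf(name, DFS_DIR_LEN, DFS_DIR_NAME, 31, 127);

	/* snprintf() returns the untruncated length without the NUL,
	 * so truncation is detected with >=, not >. */
	if (n >= DFS_DIR_LEN)
		printf("truncated\n");
	else
		printf("ok: \"%s\" (n=%d)\n", name, n);	/* ok: "ubi31_127" (n=9) */
	return 0;
}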
Signed-off-by: ZhaoLong Wang Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/debug.c | 4 ++-- drivers/mtd/ubi/ubi.h | 2 +- fs/ubifs/debug.c | 4 ++-- fs/ubifs/debug.h | 7 ++++--- fs/ubifs/sysfs.c | 6 +++--- fs/ubifs/ubifs.h | 7 ------- 6 files changed, 12 insertions(+), 18 deletions(-) diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c index d57f52bd2ff3..9ec3b8b6a0aa 100644 --- a/drivers/mtd/ubi/debug.c +++ b/drivers/mtd/ubi/debug.c @@ -598,9 +598,9 @@ int ubi_debugfs_init_dev(struct ubi_device *ubi) if (!IS_ENABLED(CONFIG_DEBUG_FS)) return 0; - n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN + 1, UBI_DFS_DIR_NAME, + n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN, UBI_DFS_DIR_NAME, ubi->ubi_num); - if (n > UBI_DFS_DIR_LEN) { + if (n >= UBI_DFS_DIR_LEN) { /* The array size is too small */ return -EINVAL; } diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 1aead55090d0..1c9e874e8ede 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -420,7 +420,7 @@ struct ubi_debug_info { unsigned int power_cut_min; unsigned int power_cut_max; unsigned int emulate_failures; - char dfs_dir_name[UBI_DFS_DIR_LEN + 1]; + char dfs_dir_name[UBI_DFS_DIR_LEN]; struct dentry *dfs_dir; struct dentry *dfs_chk_gen; struct dentry *dfs_chk_io; diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index ac77ac1fd73e..d91cec93d968 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2827,9 +2827,9 @@ void dbg_debugfs_init_fs(struct ubifs_info *c) const char *fname; struct ubifs_debug_info *d = c->dbg; - n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); - if (n > UBIFS_DFS_DIR_LEN) { + if (n >= UBIFS_DFS_DIR_LEN) { /* The array size is too small */ return; } diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index ed966108da80..d425861e6b82 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -19,10 +19,11 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, /* * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi" - * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte. + * + 1 for "_" and 2 for UBI device numbers and 3 for volume number and 1 for + * the trailing zero byte. */ #define UBIFS_DFS_DIR_NAME "ubi%d_%d" -#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1) +#define UBIFS_DFS_DIR_LEN (3 + 1 + 2 + 3 + 1) /** * ubifs_debug_info - per-FS debugging information. 
@@ -103,7 +104,7 @@ struct ubifs_debug_info { unsigned int chk_fs:1; unsigned int tst_rcvry:1; - char dfs_dir_name[UBIFS_DFS_DIR_LEN + 1]; + char dfs_dir_name[UBIFS_DFS_DIR_LEN]; struct dentry *dfs_dir; struct dentry *dfs_dump_lprops; struct dentry *dfs_dump_budg; diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c index 1c958148bb87..aae32222f11b 100644 --- a/fs/ubifs/sysfs.c +++ b/fs/ubifs/sysfs.c @@ -91,17 +91,17 @@ static struct kset ubifs_kset = { int ubifs_sysfs_register(struct ubifs_info *c) { int ret, n; - char dfs_dir_name[UBIFS_DFS_DIR_LEN+1]; + char dfs_dir_name[UBIFS_DFS_DIR_LEN]; c->stats = kzalloc(sizeof(struct ubifs_stats_info), GFP_KERNEL); if (!c->stats) { ret = -ENOMEM; goto out_last; } - n = snprintf(dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + n = snprintf(dfs_dir_name, UBIFS_DFS_DIR_LEN, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); - if (n > UBIFS_DFS_DIR_LEN) { + if (n >= UBIFS_DFS_DIR_LEN) { /* The array size is too small */ ret = -EINVAL; goto out_free; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index b2c6554fa857..d69a5a42d693 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -157,13 +157,6 @@ #define UBIFS_HMAC_ARR_SZ 0 #endif -/* - * The UBIFS sysfs directory name pattern and maximum name length (3 for "ubi" - * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte. - */ -#define UBIFS_DFS_DIR_NAME "ubi%d_%d" -#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1) - /* * Lockdep classes for UBIFS inode @ui_mutex. */ -- cgit From 39986148bc2ab4436ec169c8f4db76f6cc83705d Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 2 May 2024 13:16:55 -0700 Subject: ubifs: fix kernel-doc warnings make C=1 reports the following kernel-doc warnings: fs/ubifs/compress.c:103: warning: Function parameter or struct member 'c' not described in 'ubifs_compress' fs/ubifs/compress.c:155: warning: Function parameter or struct member 'c' not described in 'ubifs_decompress' fs/ubifs/find.c:353: warning: Excess function parameter 'data' description in 'scan_for_free_cb' fs/ubifs/find.c:353: warning: Function parameter or struct member 'arg' not described in 'scan_for_free_cb' fs/ubifs/find.c:594: warning: Excess function parameter 'data' description in 'scan_for_idx_cb' fs/ubifs/find.c:594: warning: Function parameter or struct member 'arg' not described in 'scan_for_idx_cb' fs/ubifs/find.c:786: warning: Excess function parameter 'data' description in 'scan_dirty_idx_cb' fs/ubifs/find.c:786: warning: Function parameter or struct member 'arg' not described in 'scan_dirty_idx_cb' fs/ubifs/find.c:86: warning: Excess function parameter 'data' description in 'scan_for_dirty_cb' fs/ubifs/find.c:86: warning: Function parameter or struct member 'arg' not described in 'scan_for_dirty_cb' fs/ubifs/journal.c:369: warning: expecting prototype for wake_up_reservation(). Prototype was for add_or_start_queue() instead fs/ubifs/lprops.c:1018: warning: Excess function parameter 'lst' description in 'scan_check_cb' fs/ubifs/lprops.c:1018: warning: Function parameter or struct member 'arg' not described in 'scan_check_cb' fs/ubifs/lpt.c:1938: warning: Function parameter or struct member 'ptr' not described in 'lpt_scan_node' fs/ubifs/replay.c:60: warning: Function parameter or struct member 'hash' not described in 'replay_entry' Fix them. 
Signed-off-by: Jeff Johnson Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/compress.c | 2 ++ fs/ubifs/find.c | 8 ++++---- fs/ubifs/journal.c | 2 +- fs/ubifs/lprops.c | 2 +- fs/ubifs/lpt.c | 1 + fs/ubifs/replay.c | 1 + 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 75461777c466..0b48cbab8a3d 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -82,6 +82,7 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /** * ubifs_compress - compress data. + * @c: UBIFS file-system description object * @in_buf: data to compress * @in_len: length of the data to compress * @out_buf: output buffer where compressed data should be stored @@ -140,6 +141,7 @@ no_compr: /** * ubifs_decompress - decompress data. + * @c: UBIFS file-system description object * @in_buf: data to decompress * @in_len: length of the data to decompress * @out_buf: output buffer where decompressed data should diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 6ebf3c04ac5f..643718906b9f 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -73,7 +73,7 @@ static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -340,7 +340,7 @@ out: * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -581,7 +581,7 @@ out: * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -773,7 +773,7 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 517f5de76b49..4a35f9e8f668 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -359,7 +359,7 @@ static void wake_up_reservation(struct ubifs_info *c) } /** - * wake_up_reservation - add current task in queue or start queuing. + * add_or_start_queue - add current task in queue or start queuing. 
* @c: UBIFS file-system description object * * This function starts queuing if queuing is not started, otherwise adds diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index a11c3dab7e16..8788740ec57f 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1005,7 +1005,7 @@ out: * @c: the UBIFS file-system description object * @lp: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @lst: lprops statistics to update + * @arg: lprops statistics to update * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 778a22bf9a92..441d0beca4cf 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1918,6 +1918,7 @@ out_err: * @pnode: where to keep a pnode * @cnode: where to keep a cnode * @in_tree: is the node in the tree in memory + * @ptr: union of node pointers * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in * the tree * @ptr.pnode: ditto for pnode diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 17da28d6247a..a950c5f2560e 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -29,6 +29,7 @@ * @lnum: logical eraseblock number of the node * @offs: node offset * @len: node length + * @hash: node hash * @deletion: non-zero if this entry corresponds to a node deletion * @sqnum: node sequence number * @list: links the replay list -- cgit From 4f9d406c8c90dc17470cf63342c16f66ec2d978e Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 23 May 2024 01:10:35 +0800 Subject: ubi: block: fix null-pointer-dereference in ubiblock_create() Similar to commit adbf4c4954e3 ("ubi: block: fix memleak in ubiblock_create()"), 'dev->gd' is not assigned but dereferenced if blk_mq_alloc_tag_set() fails, and leading to a null-pointer-dereference. Fix it by using pr_err() and variable 'dev' to print error log. Additionally, the log in the error handle path of idr_alloc() has been improved by using pr_err(), too. Before initializing device name, using dev_err() will print error log with 'null' instead of the actual device name, like this: block (null): ... ~~~~~~ It is unclear. Using pr_err() can print more details of the device. The improved log is: ubiblock0_0: ... 
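The crash pattern and the fix can be illustrated with a small userspace sketch (hypothetical names; the real driver uses dev_err()/pr_err() and struct gendisk): logging through a handle that has not been assigned yet dereferences NULL, so the error path logs with identifiers that are already valid.

#include <stdio.h>

struct ubiblock_sketch { void *gd; int ubi_num; int vol_id; };	/* gd assigned later */

static int alloc_tag_set(void) { return -1; }	/* simulate blk_mq_alloc_tag_set() failing */

static int create_sketch(struct ubiblock_sketch *dev)
{
	if (alloc_tag_set()) {
		/* dev->gd is still NULL here, so anything analogous to
		 * dev_err(disk_to_dev(dev->gd), ...) would dereference it.
		 * Log with the numbers that are already known instead. */
		fprintf(stderr, "ubiblock%d_%d: blk_mq_alloc_tag_set failed\n",
			dev->ubi_num, dev->vol_id);
		return -1;
	}
	return 0;
}

int main(void)
{
	struct ubiblock_sketch dev = { .gd = 0, .ubi_num = 0, .vol_id = 0 };

	return create_sketch(&dev) ? 1 : 0;
}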
Fixes: 77567b25ab9f ("ubi: use blk_mq_alloc_disk and blk_cleanup_disk") Reported-by: Dan Carpenter Signed-off-by: Li Nan Reviewed-by: Zhihao Cheng Reviewed-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/block.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index f82e3423acb9..bf7308e8ec2f 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -390,7 +390,8 @@ int ubiblock_create(struct ubi_volume_info *vi) ret = blk_mq_alloc_tag_set(&dev->tag_set); if (ret) { - dev_err(disk_to_dev(dev->gd), "blk_mq_alloc_tag_set failed"); + pr_err("ubiblock%d_%d: blk_mq_alloc_tag_set failed\n", + dev->ubi_num, dev->vol_id); goto out_free_dev; } @@ -407,8 +408,8 @@ int ubiblock_create(struct ubi_volume_info *vi) gd->minors = 1; gd->first_minor = idr_alloc(&ubiblock_minor_idr, dev, 0, 0, GFP_KERNEL); if (gd->first_minor < 0) { - dev_err(disk_to_dev(gd), - "block: dynamic minor allocation failed"); + pr_err("ubiblock%d_%d: block: dynamic minor allocation failed\n", + dev->ubi_num, dev->vol_id); ret = -ENODEV; goto out_cleanup_disk; } -- cgit From 25e79a7f2c9e54872fb2c8932bb582a500284505 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Tue, 4 Jun 2024 19:32:07 +0800 Subject: ubifs: Fix inconsistent inode size when powercut happens during appendant writing UBIFS always make sure that the data length won't beyond the inode size by writing inode before writing page(See ubifs_writepage.). After commit c35acef383f4a2f2cfc30("ubifs: Convert ubifs_writepage to use a folio"), the rule is broken in one case: Given a file with size 3, then write 4096 from the offset 0, following process will make inode size be smaller than file data length after powercut & recovery: P1 P2 ubifs_writepage len = folio_size(folio) // 4096 if (folio_pos(folio) + len <= i_size) // condition 1: 0 + 4096 <= 4096 //(i_size is updated as 4096 in ubifs_write_end) if (folio_pos(folio) >= synced_i_size) // condition 2: 0 >= 3, false write_inode // Skipped, because condition 2 is false do_writepage(folio, len) // write one page do_commit // data node won't be replayed in next mounting >> Powercut << So, inode size(4096) is not updated into disk, we will get following error messages in next mounting(chk_fs = 1): check_leaf [ubifs]: data node at LEB 14:2048 is not within inode size 3 dbg_walk_index [ubifs]: leaf checking function returned error -22, for leaf at LEB 14:2048 Fix it by modifying condition 2 as original comparison(Compare the page index of synced_i_size with current page index). Fixes: c35acef383f4 ("ubifs: Convert ubifs_writepage to use a folio") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218934 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index a1f46919934c..68e104423a48 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1027,7 +1027,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, /* Is the folio fully inside i_size? 
*/ if (folio_pos(folio) + len <= i_size) { - if (folio_pos(folio) >= synced_i_size) { + if (folio_pos(folio) + len > synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_redirty; -- cgit From 054fd15984454f031611d6c63675fc578aad0cb1 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Thu, 20 Jun 2024 16:19:26 +0800 Subject: ubifs: add check for crypto_shash_tfm_digest Add check for the return value of crypto_shash_tfm_digest() and return the error if it fails in order to catch the error. Fixes: 817aa094842d ("ubifs: support offline signed images") Signed-off-by: Chen Ni Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/master.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 7adc37c10b6a..a148760fa49e 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -67,10 +67,13 @@ static int mst_node_check_hash(const struct ubifs_info *c, { u8 calc[UBIFS_MAX_HASH_LEN]; const void *node = mst; + int ret; - crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch), + ret = crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch), UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), calc); + if (ret) + return ret; if (ubifs_check_hash(c, expected, calc)) return -EPERM; -- cgit From 7164122e25b18806f5dce68c8a0bdaa9e4f902a5 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 15 Jul 2024 14:41:12 +0100 Subject: regulator: renesas-usb-vbus-regulator: Update the default As the "rzg2l-usb-vbus-regulator" platform device is only created by drivers/reset/reset-rzg2l-usbphy-ctrl.c, update the default stricter by replacing ARCH_RZG2L->RESET_RZG2L_USBPHY_CTRL. Signed-off-by: Biju Das Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/all/CAMuHMdX5ayWbLEEa6nAipECVB6H9eCpRg21pu3zYrTdiER0F+Q@mail.gmail.com/ Link: https://patch.msgid.link/20240715134120.312610-1-biju.das.jz@bp.renesas.com Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 0281a9a6f4ce..5be44606ac15 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -1639,7 +1639,7 @@ config REGULATOR_RZG2L_VBCTRL depends on ARCH_RZG2L || COMPILE_TEST depends on OF select REGMAP_MMIO - default ARCH_RZG2L + default RESET_RZG2L_USBPHY_CTRL help Support for VBUS regulators implemented on Renesas RZ/G2L SoCs. -- cgit From 7c78fdbace0ff9bd2a2f1206182c054462b006af Mon Sep 17 00:00:00 2001 From: Maíra Canal Date: Sun, 14 Jul 2024 11:49:11 -0300 Subject: drm/v3d: Add V3D tech revision to the device information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The V3D tech revision can be a useful information when configuring jobs. Therefore, expose it in the `struct v3d_dev` with the V3D tech version. 
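The revision read-out is a plain register field extraction. A userspace sketch of the pattern (the mask and shift values below are made up for illustration; the real ones come from the V3D register headers):

#include <stdio.h>
#include <stdint.h>

#define IDENT3_IPREV_MASK	0x0000ff00u	/* assumed layout, for illustration */
#define IDENT3_IPREV_SHIFT	8

static uint32_t get_field(uint32_t reg, uint32_t mask, unsigned int shift)
{
	return (reg & mask) >> shift;
}

int main(void)
{
	uint32_t ident3 = 0x00000600u;	/* pretend V3D_HUB_IDENT3 readback */

	printf("tech revision: %u\n",
	       (unsigned int)get_field(ident3, IDENT3_IPREV_MASK, IDENT3_IPREV_SHIFT));
	return 0;
}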
Signed-off-by: Maíra Canal Reviewed-by: Iago Toral Quiroga Link: https://patchwork.freedesktop.org/patch/msgid/20240714145243.1223131-1-mcanal@igalia.com --- drivers/gpu/drm/v3d/v3d_drv.c | 5 ++++- drivers/gpu/drm/v3d/v3d_drv.h | 8 +++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_drv.c b/drivers/gpu/drm/v3d/v3d_drv.c index a47f00b443d3..5982941d933b 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.c +++ b/drivers/gpu/drm/v3d/v3d_drv.c @@ -265,7 +265,7 @@ static int v3d_platform_drm_probe(struct platform_device *pdev) struct v3d_dev *v3d; int ret; u32 mmu_debug; - u32 ident1; + u32 ident1, ident3; u64 mask; v3d = devm_drm_dev_alloc(dev, &v3d_drm_driver, struct v3d_dev, drm); @@ -298,6 +298,9 @@ static int v3d_platform_drm_probe(struct platform_device *pdev) v3d->cores = V3D_GET_FIELD(ident1, V3D_HUB_IDENT1_NCORES); WARN_ON(v3d->cores > 1); /* multicore not yet implemented */ + ident3 = V3D_READ(V3D_HUB_IDENT3); + v3d->rev = V3D_GET_FIELD(ident3, V3D_HUB_IDENT3_IPREV); + if (v3d->ver >= 71) v3d->max_counters = V3D_V71_NUM_PERFCOUNTERS; else if (v3d->ver >= 42) diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h index 099b962bdfde..49089eefb7c7 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.h +++ b/drivers/gpu/drm/v3d/v3d_drv.h @@ -98,10 +98,12 @@ struct v3d_perfmon { struct v3d_dev { struct drm_device drm; - /* Short representation (e.g. 33, 41) of the V3D tech version - * and revision. - */ + /* Short representation (e.g. 33, 41) of the V3D tech version */ int ver; + + /* Short representation (e.g. 5, 6) of the V3D tech revision */ + int rev; + bool single_irq_line; /* Different revisions of V3D have different total number of performance -- cgit From 1fe1c66274fb80cc1ac42e0d38917d53adc7d7a3 Mon Sep 17 00:00:00 2001 From: Maíra Canal Date: Sun, 14 Jul 2024 11:49:12 -0300 Subject: drm/v3d: Fix Indirect Dispatch configuration for V3D 7.1.6 and later MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `args->cfg[4]` is configured in Indirect Dispatch using the number of batches. Currently, for all V3D tech versions, `args->cfg[4]` equals the number of batches subtracted by 1. But, for V3D 7.1.6 and later, we must not subtract 1 from the number of batches. Implement the fix by checking the V3D tech version and revision. Fixes several `dEQP-VK.synchronization*` CTS tests related to Indirect Dispatch. 
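The batch-count rule described above can be checked in isolation. The sketch below mirrors the arithmetic from the patch with made-up workgroup numbers; "ver" is the combined tech version (71 meaning 7.1) and "rev" the revision exposed by the previous patch.

#include <stdio.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static uint32_t csd_cfg4(int ver, int rev, uint32_t wg_size, const uint32_t wg_counts[3])
{
	uint32_t num_batches = DIV_ROUND_UP(wg_size, 16) *
			       (wg_counts[0] * wg_counts[1] * wg_counts[2]);

	/* V3D 7.1.6 and later take the batch count as-is; older parts want N - 1 */
	if (ver < 71 || (ver == 71 && rev < 6))
		return num_batches - 1;
	return num_batches;
}

int main(void)
{
	const uint32_t wg[3] = { 4, 2, 1 };

	printf("v7.1.5: cfg[4] = %u\n", (unsigned int)csd_cfg4(71, 5, 64, wg));	/* 31 */
	printf("v7.1.6: cfg[4] = %u\n", (unsigned int)csd_cfg4(71, 6, 64, wg));	/* 32 */
	return 0;
}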
Fixes: 18b8413b25b7 ("drm/v3d: Create a CPU job extension for a indirect CSD job") Signed-off-by: Maíra Canal Reviewed-by: Iago Toral Quiroga Link: https://patchwork.freedesktop.org/patch/msgid/20240714145243.1223131-2-mcanal@igalia.com --- drivers/gpu/drm/v3d/v3d_sched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 03df37a3acf5..271a6d0f5aca 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -331,7 +331,8 @@ v3d_rewrite_csd_job_wg_counts_from_indirect(struct v3d_cpu_job *job) struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]); struct v3d_bo *indirect = to_v3d_bo(indirect_csd->indirect); struct drm_v3d_submit_csd *args = &indirect_csd->job->args; - u32 *wg_counts; + struct v3d_dev *v3d = job->base.v3d; + u32 num_batches, *wg_counts; v3d_get_bo_vaddr(bo); v3d_get_bo_vaddr(indirect); @@ -344,8 +345,17 @@ v3d_rewrite_csd_job_wg_counts_from_indirect(struct v3d_cpu_job *job) args->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT; args->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; args->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; - args->cfg[4] = DIV_ROUND_UP(indirect_csd->wg_size, 16) * - (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; + + num_batches = DIV_ROUND_UP(indirect_csd->wg_size, 16) * + (wg_counts[0] * wg_counts[1] * wg_counts[2]); + + /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ + if (v3d->ver < 71 || (v3d->ver == 71 && v3d->rev < 6)) + args->cfg[4] = num_batches - 1; + else + args->cfg[4] = num_batches; + + WARN_ON(args->cfg[4] == ~0); for (int i = 0; i < 3; i++) { /* 0xffffffff indicates that the uniform rewrite is not needed */ -- cgit From 2634f745eac25a33f032df32cf98fca8538a534a Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Mon, 15 Jul 2024 18:16:53 +0300 Subject: ASoC: SOF: imx8m: Fix DSP control regmap retrieval According to Documentation/devicetree/bindings/dsp/fsl,dsp.yaml fsl,dsp-ctrl is a phandle to syscon block so we need to use correct function to retrieve it. Currently there is no SOF DSP DTS merged into mainline so there is no need to support the old way of retrieving the dsp control node. 
Fixes: 9ba23717b292 ("ASoC: SOF: imx8m: Implement DSP start") Signed-off-by: Daniel Baluta Link: https://patch.msgid.link/20240715151653.114751-1-daniel.baluta@oss.nxp.com Signed-off-by: Mark Brown --- sound/soc/sof/imx/imx8m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sof/imx/imx8m.c b/sound/soc/sof/imx/imx8m.c index 1c7019c3cbd3..cdd1e79ef9f6 100644 --- a/sound/soc/sof/imx/imx8m.c +++ b/sound/soc/sof/imx/imx8m.c @@ -234,7 +234,7 @@ static int imx8m_probe(struct snd_sof_dev *sdev) /* set default mailbox offset for FW ready message */ sdev->dsp_box.offset = MBOX_OFFSET; - priv->regmap = syscon_regmap_lookup_by_compatible("fsl,dsp-ctrl"); + priv->regmap = syscon_regmap_lookup_by_phandle(np, "fsl,dsp-ctrl"); if (IS_ERR(priv->regmap)) { dev_err(sdev->dev, "cannot find dsp-ctrl registers"); ret = PTR_ERR(priv->regmap); -- cgit From 502a582b8dd897d9282db47c0911d5320ef2e6b9 Mon Sep 17 00:00:00 2001 From: Naga Sureshkumar Relli Date: Mon, 15 Jul 2024 12:13:52 +0100 Subject: spi: microchip-core: fix the issues in the isr It is possible for the TXDONE interrupt be raised if the tx FIFO becomes temporarily empty while transmitting, resulting in recursive calls to mchp_corespi_write_fifo() and therefore a garbage message might be transmitted depending on when the interrupt is triggered. Moving all of the tx FIFO writes out of the TXDONE portion of the interrupt handler avoids this problem. Most of rest of the TXDONE portion of the handler is problematic too. Only reading the rx FIFO (and finalising the transfer) when the TXDONE interrupt is raised can cause the transfer to stall, if the final bytes of rx data are not available in the rx FIFO when the final TXDONE interrupt is raised. The transfer should be finalised regardless of which interrupt is raised, provided that all tx data has been set and all rx data received. The first issue was encountered "in the wild", the second is theoretical. Fixes: 9ac8d17694b6 ("spi: add support for microchip fpga spi controllers") Signed-off-by: Naga Sureshkumar Relli Signed-off-by: Conor Dooley Link: https://patch.msgid.link/20240715-candied-deforest-585685ef3c8a@wendy Signed-off-by: Mark Brown --- drivers/spi/spi-microchip-core.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c index 6246254e1dff..c8076333d179 100644 --- a/drivers/spi/spi-microchip-core.c +++ b/drivers/spi/spi-microchip-core.c @@ -383,21 +383,18 @@ static irqreturn_t mchp_corespi_interrupt(int irq, void *dev_id) if (intfield == 0) return IRQ_NONE; - if (intfield & INT_TXDONE) { + if (intfield & INT_TXDONE) mchp_corespi_write(spi, REG_INT_CLEAR, INT_TXDONE); + if (intfield & INT_RXRDY) { + mchp_corespi_write(spi, REG_INT_CLEAR, INT_RXRDY); + if (spi->rx_len) mchp_corespi_read_fifo(spi); - - if (spi->tx_len) - mchp_corespi_write_fifo(spi); - - if (!spi->rx_len) - finalise = true; } - if (intfield & INT_RXRDY) - mchp_corespi_write(spi, REG_INT_CLEAR, INT_RXRDY); + if (!spi->rx_len && !spi->tx_len) + finalise = true; if (intfield & INT_RX_CHANNEL_OVERFLOW) { mchp_corespi_write(spi, REG_INT_CLEAR, INT_RX_CHANNEL_OVERFLOW); @@ -482,8 +479,9 @@ static int mchp_corespi_transfer_one(struct spi_controller *host, mchp_corespi_set_xfer_size(spi, (spi->tx_len > FIFO_DEPTH) ? 
FIFO_DEPTH : spi->tx_len); - if (spi->tx_len) + while (spi->tx_len) mchp_corespi_write_fifo(spi); + return 1; } -- cgit From 22fd98c107c792e35db7abe45298bc3a29bf4723 Mon Sep 17 00:00:00 2001 From: Steve Wilkins Date: Mon, 15 Jul 2024 12:13:53 +0100 Subject: spi: microchip-core: defer asserting chip select until just before write to TX FIFO Setting up many of the registers for a new SPI transfer requires the SPI controller to be disabled after set_cs() has been called to assert the chip select line. However, disabling the controller results in the SCLK and MOSI output pins being tristate, which can cause clock transitions to be seen by a slave device whilst SS is active. To fix this, the CS is only set to inactive inline, whilst setting it active is deferred until all registers are set up and the any controller disables have been completed. Fixes: 9ac8d17694b6 ("spi: add support for microchip fpga spi controllers") Signed-off-by: Steve Wilkins Signed-off-by: Conor Dooley Link: https://patch.msgid.link/20240715-sanitizer-recant-dd96b7a97048@wendy Signed-off-by: Mark Brown --- drivers/spi/spi-microchip-core.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c index c8076333d179..a78900db2397 100644 --- a/drivers/spi/spi-microchip-core.c +++ b/drivers/spi/spi-microchip-core.c @@ -103,6 +103,7 @@ struct mchp_corespi { u8 *rx_buf; u32 clk_gen; /* divider for spi output clock generated by the controller */ u32 clk_mode; + u32 pending_slave_select; int irq; int tx_len; int rx_len; @@ -249,8 +250,18 @@ static void mchp_corespi_set_cs(struct spi_device *spi, bool disable) reg = mchp_corespi_read(corespi, REG_SLAVE_SELECT); reg &= ~BIT(spi_get_chipselect(spi, 0)); reg |= !disable << spi_get_chipselect(spi, 0); + corespi->pending_slave_select = reg; - mchp_corespi_write(corespi, REG_SLAVE_SELECT, reg); + /* + * Only deassert chip select immediately. Writing to some registers + * requires the controller to be disabled, which results in the + * output pins being tristated and can cause the SCLK and MOSI lines + * to transition. Therefore asserting the chip select is deferred + * until just before writing to the TX FIFO, to ensure the device + * doesn't see any spurious clock transitions whilst CS is enabled. + */ + if (((spi->mode & SPI_CS_HIGH) == 0) == disable) + mchp_corespi_write(corespi, REG_SLAVE_SELECT, reg); } static int mchp_corespi_setup(struct spi_device *spi) @@ -269,6 +280,7 @@ static int mchp_corespi_setup(struct spi_device *spi) if (spi->mode & SPI_CS_HIGH) { reg = mchp_corespi_read(corespi, REG_SLAVE_SELECT); reg |= BIT(spi_get_chipselect(spi, 0)); + corespi->pending_slave_select = reg; mchp_corespi_write(corespi, REG_SLAVE_SELECT, reg); } return 0; @@ -310,7 +322,8 @@ static void mchp_corespi_init(struct spi_controller *host, struct mchp_corespi * * select is relinquished to the hardware. SSELOUT is enabled too so we * can deal with active high targets. */ - mchp_corespi_write(spi, REG_SLAVE_SELECT, SSELOUT | SSEL_DIRECT); + spi->pending_slave_select = SSELOUT | SSEL_DIRECT; + mchp_corespi_write(spi, REG_SLAVE_SELECT, spi->pending_slave_select); control = mchp_corespi_read(spi, REG_CONTROL); @@ -479,6 +492,8 @@ static int mchp_corespi_transfer_one(struct spi_controller *host, mchp_corespi_set_xfer_size(spi, (spi->tx_len > FIFO_DEPTH) ? 
FIFO_DEPTH : spi->tx_len); + mchp_corespi_write(spi, REG_SLAVE_SELECT, spi->pending_slave_select); + while (spi->tx_len) mchp_corespi_write_fifo(spi); -- cgit From de9850b5c606b754dd7861678d6e2874b96b04f8 Mon Sep 17 00:00:00 2001 From: Steve Wilkins Date: Mon, 15 Jul 2024 12:13:54 +0100 Subject: spi: microchip-core: only disable SPI controller when register value change requires it Setting up many of the registers for a new SPI transfer involves unconditionally disabling the SPI controller, writing the register value and re-enabling the controller. This is being done for registers even when the value is unchanged and is also done for registers that don't require the controller to be disabled for the change to take effect. Make an effort to detect changes to the register values, and only disables the controller if the new register value is different and disabling the controller is required. This stops the controller being repeated disabled and the bus going tristate before every transfer. Fixes: 9ac8d17694b6 ("spi: add support for microchip fpga spi controllers") Signed-off-by: Steve Wilkins Co-developed-by: Conor Dooley Signed-off-by: Conor Dooley Link: https://patch.msgid.link/20240715-depict-twirl-7e592eeabaad@wendy Signed-off-by: Mark Brown --- drivers/spi/spi-microchip-core.c | 79 +++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c index a78900db2397..c69d45e5bff2 100644 --- a/drivers/spi/spi-microchip-core.c +++ b/drivers/spi/spi-microchip-core.c @@ -75,6 +75,7 @@ #define REG_CONTROL (0x00) #define REG_FRAME_SIZE (0x04) +#define FRAME_SIZE_MASK GENMASK(5, 0) #define REG_STATUS (0x08) #define REG_INT_CLEAR (0x0c) #define REG_RX_DATA (0x10) @@ -89,6 +90,7 @@ #define REG_RIS (0x24) #define REG_CONTROL2 (0x28) #define REG_COMMAND (0x2c) +#define COMMAND_CLRFRAMECNT BIT(4) #define REG_PKTSIZE (0x30) #define REG_CMD_SIZE (0x34) #define REG_HWSTATUS (0x38) @@ -149,62 +151,59 @@ static inline void mchp_corespi_read_fifo(struct mchp_corespi *spi) static void mchp_corespi_enable_ints(struct mchp_corespi *spi) { - u32 control, mask = INT_ENABLE_MASK; - - mchp_corespi_disable(spi); - - control = mchp_corespi_read(spi, REG_CONTROL); - - control |= mask; - mchp_corespi_write(spi, REG_CONTROL, control); + u32 control = mchp_corespi_read(spi, REG_CONTROL); - control |= CONTROL_ENABLE; + control |= INT_ENABLE_MASK; mchp_corespi_write(spi, REG_CONTROL, control); } static void mchp_corespi_disable_ints(struct mchp_corespi *spi) { - u32 control, mask = INT_ENABLE_MASK; - - mchp_corespi_disable(spi); - - control = mchp_corespi_read(spi, REG_CONTROL); - control &= ~mask; - mchp_corespi_write(spi, REG_CONTROL, control); + u32 control = mchp_corespi_read(spi, REG_CONTROL); - control |= CONTROL_ENABLE; + control &= ~INT_ENABLE_MASK; mchp_corespi_write(spi, REG_CONTROL, control); } static inline void mchp_corespi_set_xfer_size(struct mchp_corespi *spi, int len) { u32 control; - u16 lenpart; + u32 lenpart; + u32 frames = mchp_corespi_read(spi, REG_FRAMESUP); /* - * Disable the SPI controller. Writes to transfer length have - * no effect when the controller is enabled. + * Writing to FRAMECNT in REG_CONTROL will reset the frame count, taking + * a shortcut requires an explicit clear. 
*/ - mchp_corespi_disable(spi); + if (frames == len) { + mchp_corespi_write(spi, REG_COMMAND, COMMAND_CLRFRAMECNT); + return; + } /* * The lower 16 bits of the frame count are stored in the control reg * for legacy reasons, but the upper 16 written to a different register: * FRAMESUP. While both the upper and lower bits can be *READ* from the - * FRAMESUP register, writing to the lower 16 bits is a NOP + * FRAMESUP register, writing to the lower 16 bits is (supposedly) a NOP. + * + * The driver used to disable the controller while modifying the frame + * count, and mask off the lower 16 bits of len while writing to + * FRAMES_UP. When the driver was changed to disable the controller as + * infrequently as possible, it was discovered that the logic of + * lenpart = len & 0xffff_0000 + * write(REG_FRAMESUP, lenpart) + * would actually write zeros into the lower 16 bits on an mpfs250t-es, + * despite documentation stating these bits were read-only. + * Writing len unmasked into FRAMES_UP ensures those bits aren't zeroed + * on an mpfs250t-es and will be a NOP for the lower 16 bits on hardware + * that matches the documentation. */ lenpart = len & 0xffff; - control = mchp_corespi_read(spi, REG_CONTROL); control &= ~CONTROL_FRAMECNT_MASK; control |= lenpart << CONTROL_FRAMECNT_SHIFT; mchp_corespi_write(spi, REG_CONTROL, control); - - lenpart = len & 0xffff0000; - mchp_corespi_write(spi, REG_FRAMESUP, lenpart); - - control |= CONTROL_ENABLE; - mchp_corespi_write(spi, REG_CONTROL, control); + mchp_corespi_write(spi, REG_FRAMESUP, len); } static inline void mchp_corespi_write_fifo(struct mchp_corespi *spi) @@ -227,17 +226,22 @@ static inline void mchp_corespi_write_fifo(struct mchp_corespi *spi) static inline void mchp_corespi_set_framesize(struct mchp_corespi *spi, int bt) { + u32 frame_size = mchp_corespi_read(spi, REG_FRAME_SIZE); u32 control; + if ((frame_size & FRAME_SIZE_MASK) == bt) + return; + /* * Disable the SPI controller. Writes to the frame size have * no effect when the controller is enabled. */ - mchp_corespi_disable(spi); + control = mchp_corespi_read(spi, REG_CONTROL); + control &= ~CONTROL_ENABLE; + mchp_corespi_write(spi, REG_CONTROL, control); mchp_corespi_write(spi, REG_FRAME_SIZE, bt); - control = mchp_corespi_read(spi, REG_CONTROL); control |= CONTROL_ENABLE; mchp_corespi_write(spi, REG_CONTROL, control); } @@ -337,8 +341,6 @@ static inline void mchp_corespi_set_clk_gen(struct mchp_corespi *spi) { u32 control; - mchp_corespi_disable(spi); - control = mchp_corespi_read(spi, REG_CONTROL); if (spi->clk_mode) control |= CONTROL_CLKMODE; @@ -347,12 +349,12 @@ static inline void mchp_corespi_set_clk_gen(struct mchp_corespi *spi) mchp_corespi_write(spi, REG_CLK_GEN, spi->clk_gen); mchp_corespi_write(spi, REG_CONTROL, control); - mchp_corespi_write(spi, REG_CONTROL, control | CONTROL_ENABLE); } static inline void mchp_corespi_set_mode(struct mchp_corespi *spi, unsigned int mode) { - u32 control, mode_val; + u32 mode_val; + u32 control = mchp_corespi_read(spi, REG_CONTROL); switch (mode & SPI_MODE_X_MASK) { case SPI_MODE_0: @@ -370,12 +372,13 @@ static inline void mchp_corespi_set_mode(struct mchp_corespi *spi, unsigned int } /* - * Disable the SPI controller. Writes to the frame size have + * Disable the SPI controller. Writes to the frame protocol have * no effect when the controller is enabled. 
*/ - mchp_corespi_disable(spi); - control = mchp_corespi_read(spi, REG_CONTROL); + control &= ~CONTROL_ENABLE; + mchp_corespi_write(spi, REG_CONTROL, control); + control &= ~(SPI_MODE_X_MASK << MODE_X_MASK_SHIFT); control |= mode_val; -- cgit From 3a5e76283672efddf47cea39ccfe9f5735cc91d5 Mon Sep 17 00:00:00 2001 From: Steve Wilkins Date: Mon, 15 Jul 2024 12:13:55 +0100 Subject: spi: microchip-core: fix init function not setting the master and motorola modes mchp_corespi_init() reads the CONTROL register, sets the master and motorola bits, but doesn't write the value back to the register. The function also doesn't ensure the controller is disabled at the start, which may present a problem if the controller was used by an earlier boot stage as some settings (including the mode) can only be modified while the controller is disabled. Fixes: 9ac8d17694b6 ("spi: add support for microchip fpga spi controllers") Signed-off-by: Steve Wilkins Signed-off-by: Conor Dooley Link: https://patch.msgid.link/20240715-designing-thus-05f7c26e1da7@wendy Signed-off-by: Mark Brown --- drivers/spi/spi-microchip-core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c index c69d45e5bff2..aa72b9dd8956 100644 --- a/drivers/spi/spi-microchip-core.c +++ b/drivers/spi/spi-microchip-core.c @@ -295,17 +295,13 @@ static void mchp_corespi_init(struct spi_controller *host, struct mchp_corespi * unsigned long clk_hz; u32 control = mchp_corespi_read(spi, REG_CONTROL); - control |= CONTROL_MASTER; + control &= ~CONTROL_ENABLE; + mchp_corespi_write(spi, REG_CONTROL, control); + control |= CONTROL_MASTER; control &= ~CONTROL_MODE_MASK; control |= MOTOROLA_MODE; - mchp_corespi_set_framesize(spi, DEFAULT_FRAMESIZE); - - /* max. possible spi clock rate is the apb clock rate */ - clk_hz = clk_get_rate(spi->clk); - host->max_speed_hz = clk_hz; - /* * The controller must be configured so that it doesn't remove Chip * Select until the entire message has been transferred, even if at @@ -314,11 +310,16 @@ static void mchp_corespi_init(struct spi_controller *host, struct mchp_corespi * * BIGFIFO mode is also enabled, which sets the fifo depth to 32 frames * for the 8 bit transfers that this driver uses. */ - control = mchp_corespi_read(spi, REG_CONTROL); control |= CONTROL_SPS | CONTROL_BIGFIFO; mchp_corespi_write(spi, REG_CONTROL, control); + mchp_corespi_set_framesize(spi, DEFAULT_FRAMESIZE); + + /* max. possible spi clock rate is the apb clock rate */ + clk_hz = clk_get_rate(spi->clk); + host->max_speed_hz = clk_hz; + mchp_corespi_enable_ints(spi); /* -- cgit From 9cf71eb0faef4bff01df4264841b8465382d7927 Mon Sep 17 00:00:00 2001 From: Steve Wilkins Date: Mon, 15 Jul 2024 12:13:56 +0100 Subject: spi: microchip-core: ensure TX and RX FIFOs are empty at start of a transfer While transmitting with rx_len == 0, the RX FIFO is not going to be emptied in the interrupt handler. A subsequent transfer could then read crap from the previous transfer out of the RX FIFO into the start RX buffer. The core provides a register that will empty the RX and TX FIFOs, so do that before each transfer. 
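The flush-before-transfer rule is easy to sketch. The COMMAND bit values below are the ones added by the patch; the register write is faked so the sketch runs in userspace.

#include <stdio.h>
#include <stdint.h>

#define COMMAND_TXFIFORST	(1u << 3)
#define COMMAND_RXFIFORST	(1u << 2)

static uint32_t command_reg;	/* stand-in for REG_COMMAND */

static void write_command(uint32_t val) { command_reg = val; }

static void start_transfer(void)
{
	/* Anything a previous tx-only transfer left in the RX FIFO would
	 * otherwise be read back as the start of this transfer's RX data,
	 * so both FIFOs are reset before the transfer begins. */
	write_command(COMMAND_RXFIFORST | COMMAND_TXFIFORST);
}

int main(void)
{
	start_transfer();
	printf("REG_COMMAND <- 0x%02x\n", (unsigned int)command_reg);
	return 0;
}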
Fixes: 9ac8d17694b6 ("spi: add support for microchip fpga spi controllers") Signed-off-by: Steve Wilkins Signed-off-by: Conor Dooley Link: https://patch.msgid.link/20240715-flammable-provoke-459226d08e70@wendy Signed-off-by: Mark Brown --- drivers/spi/spi-microchip-core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c index aa72b9dd8956..057092506bd5 100644 --- a/drivers/spi/spi-microchip-core.c +++ b/drivers/spi/spi-microchip-core.c @@ -91,6 +91,8 @@ #define REG_CONTROL2 (0x28) #define REG_COMMAND (0x2c) #define COMMAND_CLRFRAMECNT BIT(4) +#define COMMAND_TXFIFORST BIT(3) +#define COMMAND_RXFIFORST BIT(2) #define REG_PKTSIZE (0x30) #define REG_CMD_SIZE (0x34) #define REG_HWSTATUS (0x38) @@ -496,6 +498,8 @@ static int mchp_corespi_transfer_one(struct spi_controller *host, mchp_corespi_set_xfer_size(spi, (spi->tx_len > FIFO_DEPTH) ? FIFO_DEPTH : spi->tx_len); + mchp_corespi_write(spi, REG_COMMAND, COMMAND_RXFIFORST | COMMAND_TXFIFORST); + mchp_corespi_write(spi, REG_SLAVE_SELECT, spi->pending_slave_select); while (spi->tx_len) -- cgit From 87232ea8a5caf8d050f8ea7acd210a2cfcbe6309 Mon Sep 17 00:00:00 2001 From: Steve Wilkins Date: Mon, 15 Jul 2024 12:13:57 +0100 Subject: spi: microchip-core: add support for word sizes of 1 to 32 bits The current implementation only supports a word size of 8 bits, which limits the devices it can be used with. Add support for any word size between 1 and 32 bits, as supported by the hardware. Signed-off-by: Steve Wilkins Signed-off-by: Conor Dooley Link: https://patch.msgid.link/20240715-cogwheel-uniquely-0d4ef518b809@wendy Signed-off-by: Mark Brown --- drivers/spi/spi-microchip-core.c | 53 +++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c index 057092506bd5..7c1a9a985373 100644 --- a/drivers/spi/spi-microchip-core.c +++ b/drivers/spi/spi-microchip-core.c @@ -111,7 +111,7 @@ struct mchp_corespi { int irq; int tx_len; int rx_len; - int pending; + int n_bytes; }; static inline u32 mchp_corespi_read(struct mchp_corespi *spi, unsigned int reg) @@ -135,20 +135,23 @@ static inline void mchp_corespi_disable(struct mchp_corespi *spi) static inline void mchp_corespi_read_fifo(struct mchp_corespi *spi) { - u8 data; - int fifo_max, i = 0; + while (spi->rx_len >= spi->n_bytes && !(mchp_corespi_read(spi, REG_STATUS) & STATUS_RXFIFO_EMPTY)) { + u32 data = mchp_corespi_read(spi, REG_RX_DATA); - fifo_max = min(spi->rx_len, FIFO_DEPTH); + spi->rx_len -= spi->n_bytes; - while ((i < fifo_max) && !(mchp_corespi_read(spi, REG_STATUS) & STATUS_RXFIFO_EMPTY)) { - data = mchp_corespi_read(spi, REG_RX_DATA); + if (!spi->rx_buf) + continue; - if (spi->rx_buf) - *spi->rx_buf++ = data; - i++; + if (spi->n_bytes == 4) + *((u32 *)spi->rx_buf) = data; + else if (spi->n_bytes == 2) + *((u16 *)spi->rx_buf) = data; + else + *spi->rx_buf = data; + + spi->rx_buf += spi->n_bytes; } - spi->rx_len -= i; - spi->pending -= i; } static void mchp_corespi_enable_ints(struct mchp_corespi *spi) @@ -210,20 +213,28 @@ static inline void mchp_corespi_set_xfer_size(struct mchp_corespi *spi, int len) static inline void mchp_corespi_write_fifo(struct mchp_corespi *spi) { - u8 byte; int fifo_max, i = 0; - fifo_max = min(spi->tx_len, FIFO_DEPTH); + fifo_max = DIV_ROUND_UP(min(spi->tx_len, FIFO_DEPTH), spi->n_bytes); mchp_corespi_set_xfer_size(spi, fifo_max); while ((i < fifo_max) && !(mchp_corespi_read(spi, REG_STATUS) & 
STATUS_TXFIFO_FULL)) { - byte = spi->tx_buf ? *spi->tx_buf++ : 0xaa; - mchp_corespi_write(spi, REG_TX_DATA, byte); + u32 word; + + if (spi->n_bytes == 4) + word = spi->tx_buf ? *((u32 *)spi->tx_buf) : 0xaa; + else if (spi->n_bytes == 2) + word = spi->tx_buf ? *((u16 *)spi->tx_buf) : 0xaa; + else + word = spi->tx_buf ? *spi->tx_buf : 0xaa; + + mchp_corespi_write(spi, REG_TX_DATA, word); + if (spi->tx_buf) + spi->tx_buf += spi->n_bytes; i++; } - spi->tx_len -= i; - spi->pending += i; + spi->tx_len -= i * spi->n_bytes; } static inline void mchp_corespi_set_framesize(struct mchp_corespi *spi, int bt) @@ -493,10 +504,9 @@ static int mchp_corespi_transfer_one(struct spi_controller *host, spi->rx_buf = xfer->rx_buf; spi->tx_len = xfer->len; spi->rx_len = xfer->len; - spi->pending = 0; + spi->n_bytes = roundup_pow_of_two(DIV_ROUND_UP(xfer->bits_per_word, BITS_PER_BYTE)); - mchp_corespi_set_xfer_size(spi, (spi->tx_len > FIFO_DEPTH) - ? FIFO_DEPTH : spi->tx_len); + mchp_corespi_set_framesize(spi, xfer->bits_per_word); mchp_corespi_write(spi, REG_COMMAND, COMMAND_RXFIFORST | COMMAND_TXFIFORST); @@ -514,7 +524,6 @@ static int mchp_corespi_prepare_message(struct spi_controller *host, struct spi_device *spi_dev = msg->spi; struct mchp_corespi *spi = spi_controller_get_devdata(host); - mchp_corespi_set_framesize(spi, DEFAULT_FRAMESIZE); mchp_corespi_set_mode(spi, spi_dev->mode); return 0; @@ -542,7 +551,7 @@ static int mchp_corespi_probe(struct platform_device *pdev) host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; host->use_gpio_descriptors = true; host->setup = mchp_corespi_setup; - host->bits_per_word_mask = SPI_BPW_MASK(8); + host->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 32); host->transfer_one = mchp_corespi_transfer_one; host->prepare_message = mchp_corespi_prepare_message; host->set_cs = mchp_corespi_set_cs; -- cgit From ab091ec536cb7b271983c0c063b17f62f3591583 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Mon, 15 Jul 2024 17:31:44 +0800 Subject: nvme/pci: Add APST quirk for Lenovo N60z laptop There is a hardware power-saving problem with the Lenovo N60z board. When turn it on and leave it for 10 hours, there is a 20% chance that a nvme disk will not wake up until reboot. Link: https://lore.kernel.org/all/2B5581C46AC6E335+9c7a81f1-05fb-4fd0-9fbb-108757c21628@uniontech.com Signed-off-by: hmy Signed-off-by: Wentao Guan Signed-off-by: WangYuli Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 21f8e1b9801f..bc24a85e96ce 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2968,6 +2968,13 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND; } + /* + * NVMe SSD drops off the PCIe bus after system idle + * for 10 hours on a Lenovo N60z board. + */ + if (dmi_match(DMI_BOARD_NAME, "LXKT-ZXEG-N6")) + return NVME_QUIRK_NO_APST; + return 0; } -- cgit From 1a7812b25e69f6c63c52d3bc93c0970ab7ad9660 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 13 Jul 2024 15:43:17 +0200 Subject: nvme-fabrics: Use seq_putc() in __nvmf_concat_opt_tokens() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single characters should be put into a sequence. Thus use the corresponding function “seq_putc”. This issue was transformed by using the Coccinelle software. 
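A userspace analogue of the change, for readers unfamiliar with the seq_file helpers: single characters go through the single-character primitive rather than a one-character string (fputc/fputs here stand in for seq_putc/seq_puts).

#include <stdio.h>

static void concat_tokens(FILE *out, const char *const *tokens, int n)
{
	for (int i = 0; i < n; i++) {
		fputc(',', out);	/* the fputs(",", out) equivalent before the change */
		fputs(tokens[i], out);
	}
	fputc('\n', out);
}

int main(void)
{
	const char *toks[] = { "transport", "nqn", "traddr" };

	concat_tokens(stdout, toks, 3);
	return 0;
}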
Signed-off-by: Markus Elfring Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 44e342a46f39..f5f545fa0103 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -1403,10 +1403,10 @@ static void __nvmf_concat_opt_tokens(struct seq_file *seq_file) tok = &opt_tokens[idx]; if (tok->token == NVMF_OPT_ERR) continue; - seq_puts(seq_file, ","); + seq_putc(seq_file, ','); seq_puts(seq_file, tok->pattern); } - seq_puts(seq_file, "\n"); + seq_putc(seq_file, '\n'); } static int nvmf_dev_show(struct seq_file *seq_file, void *private) -- cgit From 88c918d1ee2c88cc5be3aa67ce048414fee471ff Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Thu, 11 Jul 2024 14:40:53 +0300 Subject: nvme: remove redundant bdev local variable Use disk directly instead of getting it from bdev->bd_disk. Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/sysfs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 3c55f7edd181..ba05faaac562 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -233,13 +233,12 @@ static ssize_t nuse_show(struct device *dev, struct device_attribute *attr, { struct nvme_ns_head *head = dev_to_ns_head(dev); struct gendisk *disk = dev_to_disk(dev); - struct block_device *bdev = disk->part0; int ret; - if (nvme_disk_is_ns_head(bdev->bd_disk)) + if (nvme_disk_is_ns_head(disk)) ret = ns_head_update_nuse(head); else - ret = ns_update_nuse(bdev->bd_disk->private_data); + ret = ns_update_nuse(disk->private_data); if (ret) return ret; -- cgit From 92fc2c469eb26060384e9b2cd4cb0cc228aba582 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 11 Jul 2024 15:59:52 -0700 Subject: nvme-pci: Fix the instructions for disabling power management pcie_aspm=off tells the kernel not to modify the ASPM configuration. This setting does not guarantee that ASPM (Active State Power Management) is disabled. Hence add pcie_port_pm=off. This disables power management for all PCIe ports. This patch has been tested on a workstation with a Samsung SSD 970 EVO Plus NVMe SSD. 
Fixes: 4641a8e6e145 ("nvme-pci: add trouble shooting steps for timeouts") Cc: Keith Busch Cc: Christoph Hellwig Cc: Chaitanya Kulkarni Signed-off-by: Bart Van Assche Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bc24a85e96ce..8087b054da66 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1309,7 +1309,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) dev_warn(dev->ctrl.device, "Does your device have a faulty power saving mode enabled?\n"); dev_warn(dev->ctrl.device, - "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off\" and report a bug\n"); + "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off pcie_port_pm=off\" and report a bug\n"); } static enum blk_eh_timer_return nvme_timeout(struct request *req) -- cgit From 28814be8823002eca06d857d4bce70eb4c6fccd3 Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai Date: Wed, 10 Jul 2024 20:20:47 +0000 Subject: drm/amd: Bump KMS_DRIVER_MINOR version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increase the KMS minor version to indicate GFX12 DCC support since this contains a major change in how DCC is managed across IPs like GFX, DCN etc. This will be used mainly by userspace like Mesa to figure out DCC support on GFX12 hardware. v2: fix version number (Alex) Signed-off-by: Aurabindo Pillai Acked-by: Alex Deucher Reviewed-by: Marek Olšák Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 78089f2f79f5..094498a0964b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -116,9 +116,10 @@ * - 3.55.0 - Add AMDGPU_INFO_GPUVM_FAULT query * - 3.56.0 - Update IB start address and size alignment for decode and encode * - 3.57.0 - Compute tunneling on GFX10+ + * - 3.58.0 - Add GFX12 DCC support */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 57 +#define KMS_DRIVER_MINOR 58 #define KMS_DRIVER_PATCHLEVEL 0 /* -- cgit From 7bbae44cf1bda02537a84cd8ad75bd81694acfc7 Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai Date: Mon, 15 Jul 2024 15:02:20 -0400 Subject: drm/amd/display: fix doc entry for bb_from_dmub Fixes the warning: Function parameter or struct member 'bb_from_dmub' not described in 'amdgpu_display_manager' Reported-by: Stephen Rothwell Signed-off-by: Aurabindo Pillai Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index 5fd1b6b44577..369159c29bbe 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -587,7 +587,9 @@ struct amdgpu_display_manager { */ struct mutex dpia_aux_lock; - /* + /** + * @bb_from_dmub: + * * Bounding box data read from dmub during early initialization for DCN4+ */ struct dml2_soc_bb *bb_from_dmub; -- cgit From d13e2a6e95e6b87f571c837c71a3d05691def9bb Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Mon, 8 Jul 2024 22:00:24 +0300 Subject: drm/i915/dp: Reset intel_dp->link_trained before retraining the link MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Regularly retraining a link during an atomic commit happens with the given pipe/link already disabled and hence intel_dp->link_trained being false. Ensure this also for retraining a DP SST link via direct calls to the link training functions (vs. an actual commit as for DP MST). So far nothing depended on this, however the next patch will depend on link_trained==false for changing the LTTPR mode to non-transparent. Cc: # v5.15+ Cc: Ville Syrjälä Reviewed-by: Ankit Nautiyal Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20240708190029.271247-2-imre.deak@intel.com (cherry picked from commit a4d5ce61765c08ab364aa4b327f6739b646e6cfa) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_dp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 3903f6ead6e6..59f11af3b0a1 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -5314,6 +5314,8 @@ static int intel_dp_retrain_link(struct intel_encoder *encoder, const struct intel_crtc_state *crtc_state = to_intel_crtc_state(crtc->base.state); + intel_dp->link_trained = false; + intel_dp_check_frl_training(intel_dp); intel_dp_pcon_dsc_configure(intel_dp, crtc_state); intel_dp_start_link_train(NULL, intel_dp, crtc_state); -- cgit From 509580fad7323b6a5da27e8365cd488f3b57210e Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Mon, 8 Jul 2024 22:00:25 +0300 Subject: drm/i915/dp: Don't switch the LTTPR mode on an active link MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switching to transparent mode leads to a loss of link synchronization, so prevent doing this on an active link. This happened at least on an Intel N100 system / DELL UD22 dock, the LTTPR residing either on the host or the dock. To fix the issue, keep the current mode on an active link, adjusting the LTTPR count accordingly (resetting it to 0 in transparent mode). v2: Adjust code comment during link training about reiniting the LTTPRs. 
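The decision described above reduces to a small rule: on an active link the LTTPR mode must not be switched, and the LTTPR count is only kept when the PHYs are already in non-transparent mode. A hypothetical helper, not the i915 code, just to make the cases explicit:

#include <stdbool.h>
#include <stdio.h>

static int lttpr_count_for_training(bool link_active, bool transparent_mode,
				    int detected_count)
{
	if (link_active) {
		/* keep whatever mode the link was trained with */
		if (detected_count < 0 || transparent_mode)
			return 0;		/* transparent: report no LTTPRs */
		return detected_count;		/* non-transparent: keep the count */
	}
	/* inactive link: free to (re)program non-transparent mode */
	return detected_count > 0 ? detected_count : 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       lttpr_count_for_training(true, true, 2),		/* 0 */
	       lttpr_count_for_training(true, false, 2),	/* 2 */
	       lttpr_count_for_training(false, false, 2));	/* 2 */
	return 0;
}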
(Ville) Fixes: 7b2a4ab8b0ef ("drm/i915: Switch to LTTPR transparent mode link training") Reported-and-tested-by: Gareth Yu Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/10902 Cc: # v5.15+ Cc: Ville Syrjälä Reviewed-by: Ville Syrjälä Reviewed-by: Ankit Nautiyal Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20240708190029.271247-3-imre.deak@intel.com (cherry picked from commit 211ad49cf8ccfdc798a719b4d1e000d0a8a9e588) Signed-off-by: Tvrtko Ursulin --- .../gpu/drm/i915/display/intel_dp_link_training.c | 55 +++++++++++++++++++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_link_training.c b/drivers/gpu/drm/i915/display/intel_dp_link_training.c index 1bc4ef84ff3b..d044c8e36bb3 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_link_training.c +++ b/drivers/gpu/drm/i915/display/intel_dp_link_training.c @@ -117,10 +117,24 @@ intel_dp_set_lttpr_transparent_mode(struct intel_dp *intel_dp, bool enable) return drm_dp_dpcd_write(&intel_dp->aux, DP_PHY_REPEATER_MODE, &val, 1) == 1; } -static int intel_dp_init_lttpr(struct intel_dp *intel_dp, const u8 dpcd[DP_RECEIVER_CAP_SIZE]) +static bool intel_dp_lttpr_transparent_mode_enabled(struct intel_dp *intel_dp) +{ + return intel_dp->lttpr_common_caps[DP_PHY_REPEATER_MODE - + DP_LT_TUNABLE_PHY_REPEATER_FIELD_DATA_STRUCTURE_REV] == + DP_PHY_REPEATER_MODE_TRANSPARENT; +} + +/* + * Read the LTTPR common capabilities and switch the LTTPR PHYs to + * non-transparent mode if this is supported. Preserve the + * transparent/non-transparent mode on an active link. + * + * Return the number of detected LTTPRs in non-transparent mode or 0 if the + * LTTPRs are in transparent mode or the detection failed. + */ +static int intel_dp_init_lttpr_phys(struct intel_dp *intel_dp, const u8 dpcd[DP_RECEIVER_CAP_SIZE]) { int lttpr_count; - int i; if (!intel_dp_read_lttpr_common_caps(intel_dp, dpcd)) return 0; @@ -134,6 +148,19 @@ static int intel_dp_init_lttpr(struct intel_dp *intel_dp, const u8 dpcd[DP_RECEI if (lttpr_count == 0) return 0; + /* + * Don't change the mode on an active link, to prevent a loss of link + * synchronization. See DP Standard v2.0 3.6.7. about the LTTPR + * resetting its internal state when the mode is changed from + * non-transparent to transparent. + */ + if (intel_dp->link_trained) { + if (lttpr_count < 0 || intel_dp_lttpr_transparent_mode_enabled(intel_dp)) + goto out_reset_lttpr_count; + + return lttpr_count; + } + /* * See DP Standard v2.0 3.6.6.1. 
about the explicit disabling of * non-transparent mode and the disable->enable non-transparent mode @@ -154,11 +181,25 @@ static int intel_dp_init_lttpr(struct intel_dp *intel_dp, const u8 dpcd[DP_RECEI "Switching to LTTPR non-transparent LT mode failed, fall-back to transparent mode\n"); intel_dp_set_lttpr_transparent_mode(intel_dp, true); - intel_dp_reset_lttpr_count(intel_dp); - return 0; + goto out_reset_lttpr_count; } + return lttpr_count; + +out_reset_lttpr_count: + intel_dp_reset_lttpr_count(intel_dp); + + return 0; +} + +static int intel_dp_init_lttpr(struct intel_dp *intel_dp, const u8 dpcd[DP_RECEIVER_CAP_SIZE]) +{ + int lttpr_count; + int i; + + lttpr_count = intel_dp_init_lttpr_phys(intel_dp, dpcd); + for (i = 0; i < lttpr_count; i++) intel_dp_read_lttpr_phy_caps(intel_dp, dpcd, DP_PHY_LTTPR(i)); @@ -1482,10 +1523,10 @@ void intel_dp_start_link_train(struct intel_atomic_state *state, struct intel_digital_port *dig_port = dp_to_dig_port(intel_dp); struct intel_encoder *encoder = &dig_port->base; bool passed; - /* - * TODO: Reiniting LTTPRs here won't be needed once proper connector - * HW state readout is added. + * Reinit the LTTPRs here to ensure that they are switched to + * non-transparent mode. During an earlier LTTPR detection this + * could've been prevented by an active link. */ int lttpr_count = intel_dp_init_lttpr_and_dprx_caps(intel_dp); -- cgit From 9ee3f0d8c9999eb1ef2866e86f8d57d996fc0348 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 16 Jul 2024 10:45:30 +0200 Subject: ASOC: SOF: Intel: hda-loader: only wait for HDaudio IOC for IPC4 devices Multiple users report a regression bisected to commit d5263dbbd8af ("ASoC: SOF: Intel: don't ignore IOC interrupts for non-audio transfers"). The firmware version is the likely suspect, as these users relied on SOF 2.0 while Intel only tested with the 2.2 release. Rather than completely disable the wait_for_completion(), which can help us gather timing information on the different stages of the boot process, the simplest course of action is to just disable it for older IPC versions which are no longer under active development. 
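The shape of the fix is a version gate around the wait, sketched below with stubbed helpers (the enum and timeout are stand-ins, not the SOF definitions): only IPC4 waits for the DMA IOC completion, while every IPC version still polls the ROM status afterwards.

#include <stdio.h>

enum ipc_type { IPC_TYPE_3, IPC_TYPE_4 };

static int wait_ioc(unsigned int timeout_ms) { (void)timeout_ms; return 1; }	/* 0 == timed out */
static int poll_fw_entered(void) { return 0; }

static int copy_fw(enum ipc_type ipc)
{
	if (ipc == IPC_TYPE_4 && !wait_ioc(500))
		return -1;		/* the driver returns -ETIMEDOUT here */
	return poll_fw_entered();	/* FW_ENTERED status check, all IPC types */
}

int main(void)
{
	printf("ipc3: %d, ipc4: %d\n", copy_fw(IPC_TYPE_3), copy_fw(IPC_TYPE_4));
	return 0;
}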
Closes: https://github.com/thesofproject/linux/issues/5072 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218961 Fixes: d5263dbbd8af ("ASoC: SOF: Intel: don't ignore IOC interrupts for non-audio transfers") Tested-by: Mike Krinkin Tested-by: Todd Brandt Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Link: https://patch.msgid.link/20240716084530.300829-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-loader.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sound/soc/sof/intel/hda-loader.c b/sound/soc/sof/intel/hda-loader.c index b8b914eaf7e0..75f6240cf3e1 100644 --- a/sound/soc/sof/intel/hda-loader.c +++ b/sound/soc/sof/intel/hda-loader.c @@ -310,15 +310,19 @@ int hda_cl_copy_fw(struct snd_sof_dev *sdev, struct hdac_ext_stream *hext_stream return ret; } - /* Wait for completion of transfer */ - time_left = wait_for_completion_timeout(&hda_stream->ioc, - msecs_to_jiffies(HDA_CL_DMA_IOC_TIMEOUT_MS)); - - if (!time_left) { - dev_err(sdev->dev, "Code loader DMA did not complete\n"); - return -ETIMEDOUT; + if (sdev->pdata->ipc_type == SOF_IPC_TYPE_4) { + /* Wait for completion of transfer */ + time_left = wait_for_completion_timeout(&hda_stream->ioc, + msecs_to_jiffies(HDA_CL_DMA_IOC_TIMEOUT_MS)); + + if (!time_left) { + dev_err(sdev->dev, "Code loader DMA did not complete\n"); + return -ETIMEDOUT; + } + dev_dbg(sdev->dev, "Code loader DMA done\n"); } - dev_dbg(sdev->dev, "Code loader DMA done, waiting for FW_ENTERED status\n"); + + dev_dbg(sdev->dev, "waiting for FW_ENTERED status\n"); status = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, chip->rom_status_reg, reg, -- cgit From 6f6a23d42bdfbbfe1b41a23e2c78319a0cc65db3 Mon Sep 17 00:00:00 2001 From: Curtis Malainey Date: Tue, 16 Jul 2024 10:40:12 +0200 Subject: ASoC: Intel: Fix RT5650 SSP lookup Commit 8efcd4864652 ("ASoC: Intel: sof_rt5682: use common module for sof_card_private initialization") migrated the pin assignment in the context struct up to soc-acpi-intel-ssp-common.c. This uses a lookup table to see if a device has a amp/codec before assigning the pin. The issue here arises when combination parts that serve both (with 2 ports) are used. sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:1f.3/adl_rt5682_def/SSP0-Codec' CPU: 1 PID: 2079 Comm: udevd Tainted: G U 6.6.36-03391-g744739e00023 #1 3be1a2880a0970f65545a957db7d08ef4b3e2c0d Hardware name: Google Anraggar/Anraggar, BIOS Google_Anraggar.15217.552.0 05/07/2024 Call Trace: dump_stack_lvl+0x69/0xa0 sysfs_warn_dup+0x5b/0x70 sysfs_create_dir_ns+0xb0/0x100 kobject_add_internal+0x133/0x3c0 kobject_add+0x66/0xb0 ? device_add+0x65/0x780 device_add+0x164/0x780 snd_soc_add_pcm_runtimes+0x2fa/0x800 snd_soc_bind_card+0x35e/0xc20 devm_snd_soc_register_card+0x48/0x90 platform_probe+0x7b/0xb0 really_probe+0xf7/0x2a0 ... kobject: kobject_add_internal failed for SSP0-Codec with -EEXIST, don't try to register things with the same name in the same directory. The issue is that the ALC5650 was only defined in the codec table and not the amp table which left the pin unassigned but the dai link was still created by the machine driver. Also patch the suffix filename code for the topology to prevent double suffix names as a result of this change. 
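The core of the fix is the extra "is the amp actually a separate part?" check. A minimal sketch of that predicate (enum values invented for the example):

#include <stdbool.h>
#include <stdio.h>

enum codec_id { CODEC_NONE, CODEC_RT1019P, CODEC_RT5650 };

static bool amp_name_valid(enum codec_id amp_type, enum codec_id codec_type)
{
	/* only append an amp suffix when an amp was detected and it is not
	 * the same monolithic part already serving as the codec */
	return amp_type != CODEC_NONE && amp_type != codec_type;
}

int main(void)
{
	printf("%d\n", amp_name_valid(CODEC_RT1019P, CODEC_RT5650));	/* 1: add suffix */
	printf("%d\n", amp_name_valid(CODEC_RT5650, CODEC_RT5650));	/* 0: monolithic part */
	return 0;
}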
Fixes: 8efcd4864652 ("ASoC: Intel: sof_rt5682: use common module for sof_card_private initialization") Reviewed-by: Bard Liao Signed-off-by: Curtis Malainey Signed-off-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20240716084012.299257-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/common/soc-acpi-intel-ssp-common.c | 9 +++++++++ sound/soc/sof/intel/hda.c | 17 +++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/sound/soc/intel/common/soc-acpi-intel-ssp-common.c b/sound/soc/intel/common/soc-acpi-intel-ssp-common.c index 75d0b931d895..de7a3f7f47f1 100644 --- a/sound/soc/intel/common/soc-acpi-intel-ssp-common.c +++ b/sound/soc/intel/common/soc-acpi-intel-ssp-common.c @@ -64,6 +64,15 @@ static const struct codec_map amps[] = { CODEC_MAP_ENTRY("RT1015P", "rt1015", RT1015P_ACPI_HID, CODEC_RT1015P), CODEC_MAP_ENTRY("RT1019P", "rt1019", RT1019P_ACPI_HID, CODEC_RT1019P), CODEC_MAP_ENTRY("RT1308", "rt1308", RT1308_ACPI_HID, CODEC_RT1308), + + /* + * Monolithic components + * + * Only put components that can serve as both the amp and the codec below this line. + * This will ensure that if the part is used just as a codec and there is an amp as well + * then the amp will be selected properly. + */ + CODEC_MAP_ENTRY("RT5650", "rt5650", RT5650_ACPI_HID, CODEC_RT5650), }; enum snd_soc_acpi_intel_codec diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c index daf364f773dd..5a40b8fbbbd3 100644 --- a/sound/soc/sof/intel/hda.c +++ b/sound/soc/sof/intel/hda.c @@ -1307,9 +1307,10 @@ struct snd_soc_acpi_mach *hda_machine_select(struct snd_sof_dev *sdev) const struct sof_dev_desc *desc = sof_pdata->desc; struct hdac_bus *bus = sof_to_bus(sdev); struct snd_soc_acpi_mach *mach = NULL; - enum snd_soc_acpi_intel_codec codec_type; + enum snd_soc_acpi_intel_codec codec_type, amp_type; const char *tplg_filename; const char *tplg_suffix; + bool amp_name_valid; /* Try I2S or DMIC if it is supported */ if (interface_mask & (BIT(SOF_DAI_INTEL_SSP) | BIT(SOF_DAI_INTEL_DMIC))) @@ -1413,15 +1414,16 @@ struct snd_soc_acpi_mach *hda_machine_select(struct snd_sof_dev *sdev) } } - codec_type = snd_soc_acpi_intel_detect_amp_type(sdev->dev); + amp_type = snd_soc_acpi_intel_detect_amp_type(sdev->dev); + codec_type = snd_soc_acpi_intel_detect_codec_type(sdev->dev); + amp_name_valid = amp_type != CODEC_NONE && amp_type != codec_type; - if (tplg_fixup && - mach->tplg_quirk_mask & SND_SOC_ACPI_TPLG_INTEL_AMP_NAME && - codec_type != CODEC_NONE) { - tplg_suffix = snd_soc_acpi_intel_get_amp_tplg_suffix(codec_type); + if (tplg_fixup && amp_name_valid && + mach->tplg_quirk_mask & SND_SOC_ACPI_TPLG_INTEL_AMP_NAME) { + tplg_suffix = snd_soc_acpi_intel_get_amp_tplg_suffix(amp_type); if (!tplg_suffix) { dev_err(sdev->dev, "no tplg suffix found, amp %d\n", - codec_type); + amp_type); return NULL; } @@ -1436,7 +1438,6 @@ struct snd_soc_acpi_mach *hda_machine_select(struct snd_sof_dev *sdev) add_extension = true; } - codec_type = snd_soc_acpi_intel_detect_codec_type(sdev->dev); if (tplg_fixup && mach->tplg_quirk_mask & SND_SOC_ACPI_TPLG_INTEL_CODEC_NAME && -- cgit From 415fb383ec2bbe4d4bb1a1b5593cd67eb058f1de Mon Sep 17 00:00:00 2001 From: Francis Pravin Date: Mon, 15 Jul 2024 06:31:57 +0530 Subject: nvme-core: choose PIF from QPIF if QPIFS supports and PIF is QTYPE As per TP4141a: "If the Qualified Protection Information Format Support(QPIFS) bit is set to 1 and the Protection Information Format(PIF) field is set to 11b (i.e., Qualified Type), then the pif 
is as defined in the Qualified Protection Information Format (QPIF) field." So, choose PIF from QPIF if QPIFS supports and PIF is QTYPE. Signed-off-by: Francis Pravin Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 8 +++++++- include/linux/nvme.h | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 448b8239bc99..e78ef31eeef0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1875,12 +1875,18 @@ static void nvme_configure_pi_elbas(struct nvme_ns_head *head, struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm) { u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]); + u8 guard_type; /* no support for storage tag formats right now */ if (nvme_elbaf_sts(elbaf)) return; - head->guard_type = nvme_elbaf_guard_type(elbaf); + guard_type = nvme_elbaf_guard_type(elbaf); + if ((nvm->pic & NVME_ID_NS_NVM_QPIFS) && + guard_type == NVME_NVM_NS_QTYPE_GUARD) + guard_type = nvme_elbaf_qualified_guard_type(elbaf); + + head->guard_type = guard_type; switch (head->guard_type) { case NVME_NVM_NS_64B_GUARD: head->pi_size = sizeof(struct crc64_pi_tuple); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 57e27e48c913..5c2290f0d5af 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -483,6 +483,9 @@ enum { NVME_ID_NS_NVM_STS_MASK = 0x7f, NVME_ID_NS_NVM_GUARD_SHIFT = 7, NVME_ID_NS_NVM_GUARD_MASK = 0x3, + NVME_ID_NS_NVM_QPIF_SHIFT = 9, + NVME_ID_NS_NVM_QPIF_MASK = 0xf, + NVME_ID_NS_NVM_QPIFS = 1 << 3, }; static inline __u8 nvme_elbaf_sts(__u32 elbaf) @@ -495,6 +498,11 @@ static inline __u8 nvme_elbaf_guard_type(__u32 elbaf) return (elbaf >> NVME_ID_NS_NVM_GUARD_SHIFT) & NVME_ID_NS_NVM_GUARD_MASK; } +static inline __u8 nvme_elbaf_qualified_guard_type(__u32 elbaf) +{ + return (elbaf >> NVME_ID_NS_NVM_QPIF_SHIFT) & NVME_ID_NS_NVM_QPIF_MASK; +} + struct nvme_id_ctrl_nvm { __u8 vsl; __u8 wzsl; @@ -574,6 +582,7 @@ enum { NVME_NVM_NS_16B_GUARD = 0, NVME_NVM_NS_32B_GUARD = 1, NVME_NVM_NS_64B_GUARD = 2, + NVME_NVM_NS_QTYPE_GUARD = 3, }; static inline __u8 nvme_lbaf_index(__u8 flbas) -- cgit From ecfa23c8df7ef3ea2a429dfe039341bf792e95b4 Mon Sep 17 00:00:00 2001 From: Boyuan Zhang Date: Thu, 11 Jul 2024 16:19:54 -0400 Subject: drm/amdgpu/vcn: identify unified queue in sw init Determine whether VCN using unified queue in sw_init, instead of calling functions later on. 
v2: fix coding style Signed-off-by: Boyuan Zhang Acked-by: Alex Deucher Reviewed-by: Ruijing Dong Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 39 +++++++++++++-------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 1 + 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index 8d65b096db90..69b2c2503d55 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -147,6 +147,10 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev) } } + /* from vcn4 and above, only unified queue is used */ + adev->vcn.using_unified_queue = + amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0); + hdr = (const struct common_firmware_header *)adev->vcn.fw[0]->data; adev->vcn.fw_version = le32_to_cpu(hdr->ucode_version); @@ -275,18 +279,6 @@ int amdgpu_vcn_sw_fini(struct amdgpu_device *adev) return 0; } -/* from vcn4 and above, only unified queue is used */ -static bool amdgpu_vcn_using_unified_queue(struct amdgpu_ring *ring) -{ - struct amdgpu_device *adev = ring->adev; - bool ret = false; - - if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0)) - ret = true; - - return ret; -} - bool amdgpu_vcn_is_disabled_vcn(struct amdgpu_device *adev, enum vcn_ring_type type, uint32_t vcn_instance) { bool ret = false; @@ -724,12 +716,11 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring, struct amdgpu_job *job; struct amdgpu_ib *ib; uint64_t addr = AMDGPU_GPU_PAGE_ALIGN(ib_msg->gpu_addr); - bool sq = amdgpu_vcn_using_unified_queue(ring); uint32_t *ib_checksum; uint32_t ib_pack_in_dw; int i, r; - if (sq) + if (adev->vcn.using_unified_queue) ib_size_dw += 8; r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, @@ -742,7 +733,7 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring, ib->length_dw = 0; /* single queue headers */ - if (sq) { + if (adev->vcn.using_unified_queue) { ib_pack_in_dw = sizeof(struct amdgpu_vcn_decode_buffer) / sizeof(uint32_t) + 4 + 2; /* engine info + decoding ib in dw */ ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, ib_pack_in_dw, false); @@ -761,7 +752,7 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring, for (i = ib->length_dw; i < ib_size_dw; ++i) ib->ptr[i] = 0x0; - if (sq) + if (adev->vcn.using_unified_queue) amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, ib_pack_in_dw); r = amdgpu_job_submit_direct(job, ring, &f); @@ -851,15 +842,15 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand struct dma_fence **fence) { unsigned int ib_size_dw = 16; + struct amdgpu_device *adev = ring->adev; struct amdgpu_job *job; struct amdgpu_ib *ib; struct dma_fence *f = NULL; uint32_t *ib_checksum = NULL; uint64_t addr; - bool sq = amdgpu_vcn_using_unified_queue(ring); int i, r; - if (sq) + if (adev->vcn.using_unified_queue) ib_size_dw += 8; r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, @@ -873,7 +864,7 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand ib->length_dw = 0; - if (sq) + if (adev->vcn.using_unified_queue) ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, 0x11, true); ib->ptr[ib->length_dw++] = 0x00000018; @@ -895,7 +886,7 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand for (i = ib->length_dw; i < ib_size_dw; ++i) ib->ptr[i] = 0x0; - if (sq) + if (adev->vcn.using_unified_queue) amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, 0x11); r = 
amdgpu_job_submit_direct(job, ring, &f); @@ -918,15 +909,15 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han struct dma_fence **fence) { unsigned int ib_size_dw = 16; + struct amdgpu_device *adev = ring->adev; struct amdgpu_job *job; struct amdgpu_ib *ib; struct dma_fence *f = NULL; uint32_t *ib_checksum = NULL; uint64_t addr; - bool sq = amdgpu_vcn_using_unified_queue(ring); int i, r; - if (sq) + if (adev->vcn.using_unified_queue) ib_size_dw += 8; r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, @@ -940,7 +931,7 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han ib->length_dw = 0; - if (sq) + if (adev->vcn.using_unified_queue) ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, 0x11, true); ib->ptr[ib->length_dw++] = 0x00000018; @@ -962,7 +953,7 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han for (i = ib->length_dw; i < ib_size_dw; ++i) ib->ptr[i] = 0x0; - if (sq) + if (adev->vcn.using_unified_queue) amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, 0x11); r = amdgpu_job_submit_direct(job, ring, &f); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index 9f06def236fd..1a5439abd1a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -329,6 +329,7 @@ struct amdgpu_vcn { uint16_t inst_mask; uint8_t num_inst_per_aid; + bool using_unified_queue; }; struct amdgpu_fw_shared_rb_ptrs_struct { -- cgit From 7d75ef3736a025db441be652c8cc8e84044a215f Mon Sep 17 00:00:00 2001 From: Boyuan Zhang Date: Wed, 10 Jul 2024 16:17:12 -0400 Subject: drm/amdgpu/vcn: not pause dpg for unified queue For unified queue, DPG pause for encoding is done inside VCN firmware, so there is no need to pause dpg based on ring type in kernel. For VCN3 and below, pausing DPG for encoding in kernel is still needed. 
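Put differently, the kernel-side pause logic gains one extra gate: it runs only when DPG power gating is supported and the instance is not using the unified queue, since firmware owns the pause in the unified-queue case. A stand-alone sketch of that combined condition, with stub fields standing in for the real amdgpu structures and flags:

#include <stdbool.h>
#include <stdio.h>

#define FAKE_PG_SUPPORT_VCN_DPG  (1u << 0)   /* stand-in for AMD_PG_SUPPORT_VCN_DPG */

struct fake_dev {
	unsigned int pg_flags;
	bool using_unified_queue;
};

/*
 * Kernel-side DPG pause is only needed when DPG is supported AND the
 * firmware is not already handling it (i.e. no unified queue).
 */
static bool need_kernel_dpg_pause(const struct fake_dev *dev)
{
	return (dev->pg_flags & FAKE_PG_SUPPORT_VCN_DPG) &&
	       !dev->using_unified_queue;
}

int main(void)
{
	struct fake_dev vcn3 = { .pg_flags = FAKE_PG_SUPPORT_VCN_DPG,
				 .using_unified_queue = false };
	struct fake_dev vcn4 = { .pg_flags = FAKE_PG_SUPPORT_VCN_DPG,
				 .using_unified_queue = true };

	printf("VCN3: pause in kernel? %d\n", need_kernel_dpg_pause(&vcn3)); /* 1 */
	printf("VCN4: pause in kernel? %d\n", need_kernel_dpg_pause(&vcn4)); /* 0 */
	return 0;
}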
v2: add more comments v3: update commit message Signed-off-by: Boyuan Zhang Acked-by: Alex Deucher Reviewed-by: Ruijing Dong Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index 69b2c2503d55..43f44cc201cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -389,7 +389,9 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work) for (i = 0; i < adev->vcn.num_enc_rings; ++i) fence[j] += amdgpu_fence_count_emitted(&adev->vcn.inst[j].ring_enc[i]); - if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) { + /* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */ + if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG && + !adev->vcn.using_unified_queue) { struct dpg_pause_state new_state; if (fence[j] || @@ -435,7 +437,9 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring) amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN, AMD_PG_STATE_UNGATE); - if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) { + /* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */ + if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG && + !adev->vcn.using_unified_queue) { struct dpg_pause_state new_state; if (ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC) { @@ -461,8 +465,12 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring) void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring) { + struct amdgpu_device *adev = ring->adev; + + /* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */ if (ring->adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG && - ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC) + ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC && + !adev->vcn.using_unified_queue) atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt); atomic_dec(&ring->adev->vcn.total_submission_cnt); -- cgit From 6e169c7e0f842c48c7bf683fb789dbf5a8b1dfd8 Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Mon, 15 Jul 2024 13:53:17 -0600 Subject: drm/amd/display: Add doc entry for program_3dlut_size Fixes the warning: Function parameter or struct member 'program_3dlut_size' not described in 'mpc_funcs' Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/dri-devel/20240715090445.7e9387ec@canb.auug.org.au/ Reviewed-by: Aurabindo Pillai Signed-off-by: Alex Hung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h b/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h index 40a9b3471208..3a89cc0cffc1 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h @@ -1039,6 +1039,20 @@ struct mpc_funcs { */ void (*program_lut_mode)(struct mpc *mpc, const enum MCM_LUT_ID id, const enum MCM_LUT_XABLE xable, bool lut_bank_a, int mpcc_id); + /** + * @program_3dlut_size: + * + * Program 3D LUT size. + * + * Parameters: + * - [in/out] mpc - MPC context. 
+ * - [in] is_17x17x17 - is 3dlut 17x17x17 + * - [in] mpcc_id + * + * Return: + * + * void + */ void (*program_3dlut_size)(struct mpc *mpc, bool is_17x17x17, int mpcc_id); }; -- cgit From 0e2c796b49735ee141fbff355b9d02e0189c3c65 Mon Sep 17 00:00:00 2001 From: Roman Li Date: Mon, 15 Jul 2024 16:45:46 -0400 Subject: drm/amd/display: Add function banner for idle_workqueue [Why] htmldocs warning: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h: warning: Function parameter or struct member 'idle_workqueue' not described in 'amdgpu_display_manager'. [How] Add comment section for idle_workqueue with param description. Reported-by: Stephen Rothwell Link: https://lore.kernel.org/dri-devel/20240715090211.736a9b4d@canb.auug.org.au/ Signed-off-by: Roman Li Reviewed-by: Aurabindo Pillai Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index 369159c29bbe..2d7755e2b6c3 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -137,6 +137,13 @@ struct vblank_control_work { bool enable; }; +/** + * struct idle_workqueue - Work data for periodic action in idle + * @work: Kernel work data for the work event + * @dm: amdgpu display manager device + * @enable: true if idle worker is enabled + * @running: true if idle worker is running + */ struct idle_workqueue { struct work_struct work; struct amdgpu_display_manager *dm; @@ -502,6 +509,12 @@ struct amdgpu_display_manager { * Deferred work for vblank control events. */ struct workqueue_struct *vblank_control_workqueue; + + /** + * @idle_workqueue: + * + * Periodic work for idle events. + */ struct idle_workqueue *idle_workqueue; struct drm_atomic_state *cached_state; -- cgit From b3fb79cda5688a44a423c27b791f5456d801e49c Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Fri, 12 Apr 2024 13:46:03 +0800 Subject: drm/amdgpu: add mutex to protect ras shared memory Add mutex to protect ras shared memory. v2: Add TA_RAS_COMMAND__TRIGGER_ERROR command call status check. 
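The underlying issue is a single shared command buffer that must not be filled by two callers at once, so the patch funnels every command through one helper that holds a lock for the whole fill-buffer, invoke, read-back sequence. A userspace-flavoured sketch of that shape (a pthread mutex standing in for the kernel mutex, with made-up command and status fields, not the PSP/TA interface itself):

#include <pthread.h>
#include <string.h>
#include <stdio.h>

/* Toy stand-in for the TA shared memory block. */
struct fake_shared_mem {
	unsigned int cmd_id;
	unsigned int status;
	char payload[64];
};

static struct fake_shared_mem shared_buf;
static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;

/* Pretend "firmware" call: just reports success. */
static void fake_invoke(struct fake_shared_mem *buf)
{
	buf->status = 0;
}

/*
 * All command helpers go through one function that owns the lock for the
 * whole fill -> invoke -> read-back sequence, so concurrent callers can
 * never interleave their use of the shared buffer.
 */
static int fake_send_cmd(unsigned int cmd_id, const char *in, unsigned int *status_out)
{
	int ret = 0;

	pthread_mutex_lock(&shared_lock);
	memset(&shared_buf, 0, sizeof(shared_buf));
	shared_buf.cmd_id = cmd_id;
	snprintf(shared_buf.payload, sizeof(shared_buf.payload), "%s", in);

	fake_invoke(&shared_buf);

	if (shared_buf.status)
		ret = -1;                     /* command failed */
	else if (status_out)
		*status_out = shared_buf.status;  /* copy result out while still locked */
	pthread_mutex_unlock(&shared_lock);

	return ret;
}

int main(void)
{
	unsigned int status = 0;

	if (!fake_send_cmd(1 /* e.g. a "trigger error" command */, "hello", &status))
		printf("ok, status=%u\n", status);
	return 0;
}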
Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 123 +++++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 2 + 3 files changed, 86 insertions(+), 40 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 800cc7a148b2..7cdff355cedb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1591,6 +1591,68 @@ static void psp_ras_ta_check_status(struct psp_context *psp) } } +static int psp_ras_send_cmd(struct psp_context *psp, + enum ras_command cmd_id, void *in, void *out) +{ + struct ta_ras_shared_memory *ras_cmd; + uint32_t cmd = cmd_id; + int ret = 0; + + if (!in) + return -EINVAL; + + mutex_lock(&psp->ras_context.mutex); + ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; + memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); + + switch (cmd) { + case TA_RAS_COMMAND__ENABLE_FEATURES: + case TA_RAS_COMMAND__DISABLE_FEATURES: + memcpy(&ras_cmd->ras_in_message, + in, sizeof(ras_cmd->ras_in_message)); + break; + case TA_RAS_COMMAND__TRIGGER_ERROR: + memcpy(&ras_cmd->ras_in_message.trigger_error, + in, sizeof(ras_cmd->ras_in_message.trigger_error)); + break; + case TA_RAS_COMMAND__QUERY_ADDRESS: + memcpy(&ras_cmd->ras_in_message.address, + in, sizeof(ras_cmd->ras_in_message.address)); + break; + default: + dev_err(psp->adev->dev, "Invalid ras cmd id: %u\n", cmd); + ret = -EINVAL; + goto err_out; + } + + ras_cmd->cmd_id = cmd; + ret = psp_ras_invoke(psp, ras_cmd->cmd_id); + + switch (cmd) { + case TA_RAS_COMMAND__TRIGGER_ERROR: + if (ret || psp->cmd_buf_mem->resp.status) + ret = -EINVAL; + else if (out) + memcpy(out, &ras_cmd->ras_status, sizeof(ras_cmd->ras_status)); + break; + case TA_RAS_COMMAND__QUERY_ADDRESS: + if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status) + ret = -EINVAL; + else if (out) + memcpy(out, + &ras_cmd->ras_out_message.address, + sizeof(ras_cmd->ras_out_message.address)); + break; + default: + break; + } + +err_out: + mutex_unlock(&psp->ras_context.mutex); + + return ret; +} + int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id) { struct ta_ras_shared_memory *ras_cmd; @@ -1632,23 +1694,15 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id) int psp_ras_enable_features(struct psp_context *psp, union ta_ras_cmd_input *info, bool enable) { - struct ta_ras_shared_memory *ras_cmd; + enum ras_command cmd_id; int ret; - if (!psp->ras_context.context.initialized) + if (!psp->ras_context.context.initialized || !info) return -EINVAL; - ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; - memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); - - if (enable) - ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES; - else - ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES; - - ras_cmd->ras_in_message = *info; - - ret = psp_ras_invoke(psp, ras_cmd->cmd_id); + cmd_id = enable ? 
+ TA_RAS_COMMAND__ENABLE_FEATURES : TA_RAS_COMMAND__DISABLE_FEATURES; + ret = psp_ras_send_cmd(psp, cmd_id, info, NULL); if (ret) return -EINVAL; @@ -1672,6 +1726,8 @@ int psp_ras_terminate(struct psp_context *psp) psp->ras_context.context.initialized = false; + mutex_destroy(&psp->ras_context.mutex); + return ret; } @@ -1756,9 +1812,10 @@ int psp_ras_initialize(struct psp_context *psp) ret = psp_ta_load(psp, &psp->ras_context.context); - if (!ret && !ras_cmd->ras_status) + if (!ret && !ras_cmd->ras_status) { psp->ras_context.context.initialized = true; - else { + mutex_init(&psp->ras_context.mutex); + } else { if (ras_cmd->ras_status) dev_warn(adev->dev, "RAS Init Status: 0x%X\n", ras_cmd->ras_status); @@ -1772,12 +1829,12 @@ int psp_ras_initialize(struct psp_context *psp) int psp_ras_trigger_error(struct psp_context *psp, struct ta_ras_trigger_error_input *info, uint32_t instance_mask) { - struct ta_ras_shared_memory *ras_cmd; struct amdgpu_device *adev = psp->adev; int ret; uint32_t dev_mask; + uint32_t ras_status = 0; - if (!psp->ras_context.context.initialized) + if (!psp->ras_context.context.initialized || !info) return -EINVAL; switch (info->block_id) { @@ -1801,13 +1858,8 @@ int psp_ras_trigger_error(struct psp_context *psp, dev_mask &= AMDGPU_RAS_INST_MASK; info->sub_block_index |= dev_mask; - ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; - memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); - - ras_cmd->cmd_id = TA_RAS_COMMAND__TRIGGER_ERROR; - ras_cmd->ras_in_message.trigger_error = *info; - - ret = psp_ras_invoke(psp, ras_cmd->cmd_id); + ret = psp_ras_send_cmd(psp, + TA_RAS_COMMAND__TRIGGER_ERROR, info, &ras_status); if (ret) return -EINVAL; @@ -1817,9 +1869,9 @@ int psp_ras_trigger_error(struct psp_context *psp, if (amdgpu_ras_intr_triggered()) return 0; - if (ras_cmd->ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED) + if (ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED) return -EACCES; - else if (ras_cmd->ras_status) + else if (ras_status) return -EINVAL; return 0; @@ -1829,25 +1881,16 @@ int psp_ras_query_address(struct psp_context *psp, struct ta_ras_query_address_input *addr_in, struct ta_ras_query_address_output *addr_out) { - struct ta_ras_shared_memory *ras_cmd; int ret; - if (!psp->ras_context.context.initialized) - return -EINVAL; - - ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; - memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); - - ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS; - ras_cmd->ras_in_message.address = *addr_in; - - ret = psp_ras_invoke(psp, ras_cmd->cmd_id); - if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status) + if (!psp->ras_context.context.initialized || + !addr_in || !addr_out) return -EINVAL; - *addr_out = ras_cmd->ras_out_message.address; + ret = psp_ras_send_cmd(psp, + TA_RAS_COMMAND__QUERY_ADDRESS, addr_in, addr_out); - return 0; + return ret; } // ras end diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 3635303e6548..74a96516c913 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -200,6 +200,7 @@ struct psp_xgmi_context { struct psp_ras_context { struct ta_context context; struct amdgpu_ras *ras; + struct mutex mutex; }; #define MEM_TRAIN_SYSTEM_SIGNATURE 0x54534942 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c index 8e8afbd237bc..0c856005df6b 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c @@ -348,6 +348,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size context->session_id = ta_id; + mutex_lock(&psp->ras_context.mutex); ret = prep_ta_mem_context(&context->mem_context, shared_buf, shared_buf_len); if (ret) goto err_free_shared_buf; @@ -366,6 +367,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size ret = -EFAULT; err_free_shared_buf: + mutex_unlock(&psp->ras_context.mutex); kfree(shared_buf); return ret; -- cgit From 2fdc99b96ea86c178eb14bc948dac93feffc8936 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Fri, 12 Jul 2024 11:05:07 +0800 Subject: drm/amd/pm: early return if disabling DPMS for GFX IP v11.5.2 This was intended to add support for GFX IP v11.5.2, but it needs to be applied to all GFX11 and subsequent APUs. Therefore the code should be revised to accommodate this. Signed-off-by: Tim Huang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index fb8643d25d1b..9d7454b3c314 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -1924,20 +1924,12 @@ static int smu_disable_dpms(struct smu_context *smu) } /* - * For SMU 13.0.4/11 and 14.0.0, PMFW will handle the features disablement properly + * For GFX11 and subsequent APUs, PMFW will handle the features disablement properly * for gpu reset and S0i3 cases. Driver involvement is unnecessary. */ - if (amdgpu_in_reset(adev) || adev->in_s0ix) { - switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { - case IP_VERSION(13, 0, 4): - case IP_VERSION(13, 0, 11): - case IP_VERSION(14, 0, 0): - case IP_VERSION(14, 0, 1): - return 0; - default: - break; - } - } + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) >= 11 && + smu->is_apu && (amdgpu_in_reset(adev) || adev->in_s0ix)) + return 0; /* * For gpu reset, runpm and hibernation through BACO, -- cgit From 5ae8fb971201d281cc594dba58787406e46c696a Mon Sep 17 00:00:00 2001 From: Li Ma Date: Wed, 10 Jul 2024 17:29:59 +0800 Subject: drm/amd/swsmu: enable Pstates profile levels for SMU v14.0.4 Enables following UMD stable Pstates profile levels of power_dpm_force_performance_level for SMU v14.0.4. 
- profile_peak - profile_min_mclk - profile_min_sclk - profile_standard Signed-off-by: Li Ma Reviewed-by: Tim Huang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c index 5d47d58944f6..8798ebfcea83 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c @@ -69,6 +69,9 @@ #define SMU_14_0_0_UMD_PSTATE_SOCCLK 678 #define SMU_14_0_0_UMD_PSTATE_FCLK 1800 +#define SMU_14_0_4_UMD_PSTATE_GFXCLK 938 +#define SMU_14_0_4_UMD_PSTATE_SOCCLK 938 + #define FEATURE_MASK(feature) (1ULL << feature) #define SMC_DPM_FEATURE ( \ FEATURE_MASK(FEATURE_CCLK_DPM_BIT) | \ @@ -1296,19 +1299,28 @@ static int smu_v14_0_common_get_dpm_profile_freq(struct smu_context *smu, switch (clk_type) { case SMU_GFXCLK: case SMU_SCLK: - clk_limit = SMU_14_0_0_UMD_PSTATE_GFXCLK; + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 4)) + clk_limit = SMU_14_0_4_UMD_PSTATE_GFXCLK; + else + clk_limit = SMU_14_0_0_UMD_PSTATE_GFXCLK; if (level == AMD_DPM_FORCED_LEVEL_PROFILE_PEAK) smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, NULL, &clk_limit); else if (level == AMD_DPM_FORCED_LEVEL_PROFILE_MIN_SCLK) smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, &clk_limit, NULL); break; case SMU_SOCCLK: - clk_limit = SMU_14_0_0_UMD_PSTATE_SOCCLK; + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 4)) + clk_limit = SMU_14_0_4_UMD_PSTATE_SOCCLK; + else + clk_limit = SMU_14_0_0_UMD_PSTATE_SOCCLK; if (level == AMD_DPM_FORCED_LEVEL_PROFILE_PEAK) smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SOCCLK, NULL, &clk_limit); break; case SMU_FCLK: - clk_limit = SMU_14_0_0_UMD_PSTATE_FCLK; + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 4)) + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, NULL, &clk_limit); + else + clk_limit = SMU_14_0_0_UMD_PSTATE_FCLK; if (level == AMD_DPM_FORCED_LEVEL_PROFILE_PEAK) smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, NULL, &clk_limit); else if (level == AMD_DPM_FORCED_LEVEL_PROFILE_MIN_MCLK) -- cgit From 4cf300f604fe894e4bd734f87fa4502faf1b8af3 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 15 Jul 2024 14:57:49 -0600 Subject: drm/amd/display: Move DIO documentation to the right place When building the kernel-doc, it complains with the below warning: ./drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h:1: warning: no structured comments found ./drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h:1: warning: no structured comments found This warning was caused by the wrong use of the ':export:' and the lack of function documentation in the file pointed under the ':internal:'. This commit addresses those issues by relocating the overview documentation to the correct C file, removing the ':export:' options, and adding two simple kernel-doc to ensure that ':internal:' does not have any warning. 
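For reference, the two kernel-doc comment shapes involved look roughly like this; a generic, made-up example (example_can_use_hw is not a real function) rather than the DIO code itself. The "DOC:" block is what an rst ":doc: overview" directive extracts, and the per-function comment is what ":internal:" picks up:

#include <stdbool.h>

/**
 * DOC: overview
 *
 * A short prose description of the block lives in a DOC: section; the
 * Sphinx ".. kernel-doc::" directive with ":doc: overview" extracts it.
 */

/**
 * example_can_use_hw - Check whether the (made-up) hardware path is usable
 * @enabled: true if the encoder was set up by the caller
 *
 * Return: true when the hardware path can be used, false otherwise.
 */
bool example_can_use_hw(bool enabled)
{
	return enabled;
}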
Cc: Alex Deucher Acked-by: Alex Deucher Reported-by: Stephen Rothwell Link: https://lore.kernel.org/dri-devel/20240715085918.68f5ecc9@canb.auug.org.au/ Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- Documentation/gpu/amdgpu/display/dcn-blocks.rst | 7 ++---- .../drm/amd/display/dc/link/hwss/link_hwss_dio.c | 29 ++++++++++++++++++++++ .../drm/amd/display/dc/link/hwss/link_hwss_dio.h | 9 ------- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/Documentation/gpu/amdgpu/display/dcn-blocks.rst b/Documentation/gpu/amdgpu/display/dcn-blocks.rst index a3fbd3ea028b..118aeb9fd2b4 100644 --- a/Documentation/gpu/amdgpu/display/dcn-blocks.rst +++ b/Documentation/gpu/amdgpu/display/dcn-blocks.rst @@ -68,11 +68,8 @@ OPP DIO --- -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h +.. kernel-doc:: drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c :doc: overview -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h - :export: - -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h +.. kernel-doc:: drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c :internal: diff --git a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c index 50459d7a0f85..b76737b7b9e4 100644 --- a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c +++ b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c @@ -26,6 +26,16 @@ #include "core_types.h" #include "link_enc_cfg.h" +/** + * DOC: overview + * + * Display Input Output (DIO), is the display input and output unit in DCN. It + * includes output encoders to support different display output, like + * DisplayPort, HDMI, DVI interface, and others. It also includes the control + * and status channels for these interfaces. + */ + + void set_dio_throttled_vcp_size(struct pipe_ctx *pipe_ctx, struct fixed31_32 throttled_vcp_size) { @@ -254,12 +264,31 @@ static const struct link_hwss dio_link_hwss = { }, }; +/** + * can_use_dio_link_hwss - Check if the link_hwss is accessible + * + * @link: Reference a link struct containing one or more sinks and the + * connective status. + * @link_res: Mappable hardware resource used to enable a link. + * + * Returns: + * Return true if the link encoder is accessible from link. + */ bool can_use_dio_link_hwss(const struct dc_link *link, const struct link_resource *link_res) { return link->link_enc != NULL; } +/** + * get_dio_link_hwss - Return link_hwss reference + * + * This function behaves like a get function to return the link_hwss populated + * in the link_hwss_dio.c file. + * + * Returns: + * Return the reference to the filled struct of link_hwss. + */ const struct link_hwss *get_dio_link_hwss(void) { return &dio_link_hwss; diff --git a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h index a1f72fe378ee..45f0e091fcb0 100644 --- a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h +++ b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h @@ -23,15 +23,6 @@ * */ -/** - * DOC: overview - * - * Display Input Output (DIO), is the display input and output unit in DCN. It - * includes output encoders to support different display output, like - * DisplayPort, HDMI, DVI interface, and others. It also includes the control - * and status channels for these interfaces. 
- */ - #ifndef __LINK_HWSS_DIO_H__ #define __LINK_HWSS_DIO_H__ -- cgit From 9947dbf984dd79d644a5468a47a76d9fef9f0884 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 15 Jul 2024 15:10:32 -0600 Subject: Documentation/gpu: Remove ':export:' option from DCN documentation This commit reduces, but does not fix, all the occurrences and some of the documentation warnings related to the 'no structured comments.' This was caused by the wrong use of the ':export:' option in the DCN kernel-doc, so this commit drops the usage of those options. Acked-by: Alex Deucher Reported-by: Stephen Rothwell Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- Documentation/gpu/amdgpu/display/dcn-blocks.rst | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/Documentation/gpu/amdgpu/display/dcn-blocks.rst b/Documentation/gpu/amdgpu/display/dcn-blocks.rst index 118aeb9fd2b4..582a5fee7f29 100644 --- a/Documentation/gpu/amdgpu/display/dcn-blocks.rst +++ b/Documentation/gpu/amdgpu/display/dcn-blocks.rst @@ -11,9 +11,6 @@ DCHUBBUB .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h :doc: overview -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h - :export: - .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h :internal: @@ -23,9 +20,6 @@ HUBP .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h :doc: overview -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h - :export: - .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h :internal: @@ -35,9 +29,6 @@ DPP .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h :doc: overview -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h - :export: - .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h :internal: @@ -47,9 +38,6 @@ MPC .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h :doc: overview -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h - :export: - .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h :internal: @@ -59,9 +47,6 @@ OPP .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/opp.h :doc: overview -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/opp.h - :export: - .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/opp.h :internal: -- cgit From 1200bce4de8798ff73d1a0011e84b5ff68ffc25c Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 15 Jul 2024 15:58:40 -0600 Subject: Documentation/gpu: Adjust DCN documentation paths When building the kernel-doc, it has the following complaints: Documentation/gpu/amdgpu/display/dcn-blocks:23: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h:3: WARNING: Duplicate C declaration, also defined at gpu/amdgpu/display/dcn-blocks:3. Declaration is '.. c:struct:: surface_flip_registers'. Documentation/gpu/amdgpu/display/dcn-blocks:35: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h:3: WARNING: Duplicate C declaration, also defined at gpu/amdgpu/display/dcn-blocks:3. Declaration is '.. c:struct:: surface_flip_registers'. This error happened due to a copy-and-paste where the same file path was duplicated multiple times to a different set of blocks. This commit addresses this issue by using the correct file path. 
Cc: Alex Deucher Acked-by: Alex Deucher Reported-by: Stephen Rothwell Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- Documentation/gpu/amdgpu/display/dcn-blocks.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/gpu/amdgpu/display/dcn-blocks.rst b/Documentation/gpu/amdgpu/display/dcn-blocks.rst index 582a5fee7f29..00bc0607e98c 100644 --- a/Documentation/gpu/amdgpu/display/dcn-blocks.rst +++ b/Documentation/gpu/amdgpu/display/dcn-blocks.rst @@ -8,10 +8,10 @@ and the code documentation when it is automatically generated. DCHUBBUB -------- -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h +.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dchubbub.h :doc: overview -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h +.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dchubbub.h :internal: HUBP @@ -26,10 +26,10 @@ HUBP DPP --- -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h +.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h :doc: overview -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h +.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h :internal: MPC -- cgit From d938ec1a12a22a4eedff319aa41cba48e9c5e544 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 15 Jul 2024 16:16:12 -0600 Subject: drm/amd/display: Add simple struct doc to remove doc build warning This commit is a part of a series that addresses the following build warning for opp: ./drivers/gpu/drm/amd/display/dc/inc/hw/opp.h:1: warning: no structured comments found ./drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h:1: warning: no structured comments found This commit fixes this issue by adding a simple kernel-doc to a struct in the opp.h and the dpp.h files. Cc: Alex Deucher Acked-by: Alex Deucher Reported-by: Stephen Rothwell Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h | 22 +++++++++++++++++----- drivers/gpu/drm/amd/display/dc/inc/hw/opp.h | 15 +++++++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h index 9ac7fc717a92..0150f2581ee4 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h @@ -147,16 +147,28 @@ struct cnv_color_keyer_params { int color_keyer_blue_high; }; -/* new for dcn2: set the 8bit alpha values based on the 2 bit alpha - *ALPHA_2BIT_LUT. ALPHA_2BIT_LUT0 default: 0b00000000 - *ALPHA_2BIT_LUT. ALPHA_2BIT_LUT1 default: 0b01010101 - *ALPHA_2BIT_LUT. ALPHA_2BIT_LUT2 default: 0b10101010 - *ALPHA_2BIT_LUT. ALPHA_2BIT_LUT3 default: 0b11111111 +/** + * struct cnv_alpha_2bit_lut - Set the 8bit alpha values based on the 2 bit alpha */ struct cnv_alpha_2bit_lut { + /** + * @lut0: ALPHA_2BIT_LUT. ALPHA_2BIT_LUT0. Default: 0b00000000 + */ int lut0; + + /** + * @lut1: ALPHA_2BIT_LUT. ALPHA_2BIT_LUT1. Default: 0b01010101 + */ int lut1; + + /** + * @lut2: ALPHA_2BIT_LUT. ALPHA_2BIT_LUT2. Default: 0b10101010 + */ int lut2; + + /** + * @lut3: ALPHA_2BIT_LUT. ALPHA_2BIT_LUT3. 
Default: 0b11111111 + */ int lut3; }; diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h b/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h index 127fb1a51654..747679cb4944 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h @@ -205,9 +205,24 @@ struct gamma_coefficients { struct fixed31_32 user_brightness; }; +/** + * struct pwl_float_data - Fixed point RGB color + */ struct pwl_float_data { + /** + * @r: Component Red. + */ struct fixed31_32 r; + + /** + * @g: Component Green. + */ + struct fixed31_32 g; + + /** + * @b: Component Blue. + */ struct fixed31_32 b; }; -- cgit From 91ca34cba3b6a13df6304f80bbcfdf6cad328c2b Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 15 Jul 2024 16:29:41 -0600 Subject: Documentation/gpu: Remove undocumented files from dcn-blockshubbub.h The dchubbub.h and hubp.h do not have any meaningful documentation; for this reason, this commit removes those files from the dcn-blocks documentation. Cc: Alex Deucher Acked-by: Alex Deucher Reported-by: Stephen Rothwell Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- Documentation/gpu/amdgpu/display/dcn-blocks.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Documentation/gpu/amdgpu/display/dcn-blocks.rst b/Documentation/gpu/amdgpu/display/dcn-blocks.rst index 00bc0607e98c..f80df596ef5c 100644 --- a/Documentation/gpu/amdgpu/display/dcn-blocks.rst +++ b/Documentation/gpu/amdgpu/display/dcn-blocks.rst @@ -11,18 +11,12 @@ DCHUBBUB .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dchubbub.h :doc: overview -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dchubbub.h - :internal: - HUBP ---- .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h :doc: overview -.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h - :internal: - DPP --- -- cgit From 4b0eb9ce15912ec303cc515888691b6ec6c32196 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 15 Jul 2024 16:57:47 -0600 Subject: Documentation/amdgpu: Fix duplicate declaration Address the below kernel doc warning: Documentation/gpu/amdgpu/display/display-manager:134: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h:3: WARNING: Duplicate C declaration, also defined at gpu/amdgpu/display/dcn-blocks:101. Declaration is '.. c:struct:: mpcc_blnd_cfg'. Documentation/gpu/amdgpu/display/display-manager:146: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h:3: WARNING: Duplicate C declaration, also defined at gpu/amdgpu/display/dcn-blocks:3. Declaration is '.. c:enum:: mpcc_alpha_blend_mode'. To address the above warnings, this commit uses the 'no-identifiers' option in the dcn-blocks to avoid duplication with the previous use of this function doc in the display-manager file. Finally, replaces the deprecated ':function:' in favor of ':identifiers:'. Cc: Alex Deucher Acked-by: Alex Deucher Reported-by: Stephen Rothwell Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- Documentation/gpu/amdgpu/display/dcn-blocks.rst | 1 + Documentation/gpu/amdgpu/display/display-manager.rst | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/gpu/amdgpu/display/dcn-blocks.rst b/Documentation/gpu/amdgpu/display/dcn-blocks.rst index f80df596ef5c..5e34366f6dbe 100644 --- a/Documentation/gpu/amdgpu/display/dcn-blocks.rst +++ b/Documentation/gpu/amdgpu/display/dcn-blocks.rst @@ -34,6 +34,7 @@ MPC .. 
kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h :internal: + :no-identifiers: mpcc_blnd_cfg mpcc_alpha_blend_mode OPP --- diff --git a/Documentation/gpu/amdgpu/display/display-manager.rst b/Documentation/gpu/amdgpu/display/display-manager.rst index 67a811e6891f..b269ff3f7a54 100644 --- a/Documentation/gpu/amdgpu/display/display-manager.rst +++ b/Documentation/gpu/amdgpu/display/display-manager.rst @@ -132,7 +132,7 @@ The DRM blend mode and its elements are then mapped by AMDGPU display manager (MPC), as follows: .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h - :functions: mpcc_blnd_cfg + :identifiers: mpcc_blnd_cfg Therefore, the blending configuration for a single MPCC instance on the MPC tree is defined by :c:type:`mpcc_blnd_cfg`, where @@ -144,7 +144,7 @@ alpha and plane alpha values. It sets one of the three modes for :c:type:`MPCC_ALPHA_BLND_MODE`, as described below. .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h - :functions: mpcc_alpha_blend_mode + :identifiers: mpcc_alpha_blend_mode DM then maps the elements of `enum mpcc_alpha_blend_mode` to those in the DRM blend formula, as follows: -- cgit From 69eced9eb49aa6a55c9ef5745b0826fe0b74ff0f Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 13:25:28 -0700 Subject: virtio: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/virtio/virtio_dma_buf.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Message-Id: <20240602-md-virtio_dma_buf-v1-1-ce602d47e257@quicinc.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_dma_buf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/virtio/virtio_dma_buf.c b/drivers/virtio/virtio_dma_buf.c index 2521a75009c3..3034a2f605c8 100644 --- a/drivers/virtio/virtio_dma_buf.c +++ b/drivers/virtio/virtio_dma_buf.c @@ -85,5 +85,6 @@ int virtio_dma_buf_get_uuid(struct dma_buf *dma_buf, } EXPORT_SYMBOL(virtio_dma_buf_get_uuid); +MODULE_DESCRIPTION("dma-bufs for virtio exported objects"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS(DMA_BUF); -- cgit From e14f480df7fbcb0c84ce9142c861701495d9b6e2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 12 Jul 2024 09:06:12 -0500 Subject: vdpa/octeon_ep: Fix error code in octep_process_mbox() Return -EINVAL for invalid signatures. Don't return success. Fixes: 8b6c724cdab8 ("virtio: vdpa: vDPA driver for Marvell OCTEON DPU devices") Signed-off-by: Dan Carpenter Message-Id: <623e885b-1a05-479e-ab97-01bcf10bf5b8@stanley.mountain> Signed-off-by: Michael S. 
Tsirkin --- drivers/vdpa/octeon_ep/octep_vdpa_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c index 7fa0491bb201..11bd76ae18cf 100644 --- a/drivers/vdpa/octeon_ep/octep_vdpa_hw.c +++ b/drivers/vdpa/octeon_ep/octep_vdpa_hw.c @@ -140,7 +140,7 @@ static int octep_process_mbox(struct octep_hw *oct_hw, u16 id, u16 qid, void *bu val = octep_read_sig(mbox); if ((val & 0xFFFF) != MBOX_RSP_SIG) { dev_warn(&pdev->dev, "Invalid Signature from mbox : %d response\n", id); - return ret; + return -EINVAL; } val = octep_read_sts(mbox); -- cgit From de128f3f6317b8568311d9f9eb5e0425ebc23a03 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:40 +0200 Subject: virtio_pci: push out single vq find code to vp_find_one_vq_msix() In order to be reused for admin queue setup, push out common code to setup and configure irq for one vq into a separate helper. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-2-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 68 +++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 7d82facafd75..277591a6ab96 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -284,6 +284,44 @@ void vp_del_vqs(struct virtio_device *vdev) vp_dev->vqs = NULL; } +static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, + int queue_idx, + vq_callback_t *callback, + const char *name, bool ctx, + int *allocated_vectors) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtqueue *vq; + u16 msix_vec; + int err; + + if (!callback) + msix_vec = VIRTIO_MSI_NO_VECTOR; + else if (vp_dev->per_vq_vectors) + msix_vec = (*allocated_vectors)++; + else + msix_vec = VP_MSIX_VQ_VECTOR; + vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec); + if (IS_ERR(vq)) + return vq; + + if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR) + return vq; + + /* allocate per-vq irq if available and necessary */ + snprintf(vp_dev->msix_names[msix_vec], sizeof(*vp_dev->msix_names), + "%s-%s", dev_name(&vp_dev->vdev.dev), name); + err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), + vring_interrupt, 0, + vp_dev->msix_names[msix_vec], vq); + if (err) { + vp_del_vq(vq); + return ERR_PTR(err); + } + + return vq; +} + static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], @@ -292,7 +330,6 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue_info *vqi; - u16 msix_vec; int i, err, nvectors, allocated_vectors, queue_idx = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); @@ -325,36 +362,13 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, vqs[i] = NULL; continue; } - - if (!vqi->callback) - msix_vec = VIRTIO_MSI_NO_VECTOR; - else if (vp_dev->per_vq_vectors) - msix_vec = allocated_vectors++; - else - msix_vec = VP_MSIX_VQ_VECTOR; - vqs[i] = vp_setup_vq(vdev, queue_idx++, vqi->callback, - vqi->name, vqi->ctx, msix_vec); + vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, + vqi->name, vqi->ctx, + &allocated_vectors); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; } - - if (!vp_dev->per_vq_vectors || 
msix_vec == VIRTIO_MSI_NO_VECTOR) - continue; - - /* allocate per-vq irq if available and necessary */ - snprintf(vp_dev->msix_names[msix_vec], - sizeof *vp_dev->msix_names, - "%s-%s", - dev_name(&vp_dev->vdev.dev), vqi->name); - err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), - vring_interrupt, 0, - vp_dev->msix_names[msix_vec], - vqs[i]); - if (err) { - vp_del_vq(vqs[i]); - goto error_find; - } } return 0; -- cgit From 572864d45d61c69aa99461e944f43d3b95c5e159 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:41 +0200 Subject: virtio_pci: simplify vp_request_msix_vectors() call a bit Pass desc arg as it is always to vp_request_msix_vectors(). There rely on per_vq_vectors arg and null desc in case it is false. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-3-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 277591a6ab96..cda0d35dd9e1 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -125,6 +125,9 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, GFP_KERNEL)) goto error; + if (!per_vq_vectors) + desc = NULL; + if (desc) { flags |= PCI_IRQ_AFFINITY; desc->pre_vectors++; /* virtio config vector */ @@ -349,8 +352,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, nvectors = 2; } - err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, - per_vq_vectors ? desc : NULL); + err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, desc); if (err) goto error_find; -- cgit From ef775b2627c83f7c521c38fdcf972567c55e4550 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:42 +0200 Subject: virtio_pci: pass vector policy enum to vp_find_vqs_msix() In preparation for another irq allocation fallback, introduce vector policy enum and pass the values to vp_find_vqs_msix() instead of bool arg. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-4-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index cda0d35dd9e1..7fec2258d125 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -287,6 +287,11 @@ void vp_del_vqs(struct virtio_device *vdev) vp_dev->vqs = NULL; } +enum vp_vq_vector_policy { + VP_VQ_VECTOR_POLICY_EACH, + VP_VQ_VECTOR_POLICY_SHARED, +}; + static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, @@ -328,17 +333,20 @@ static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], - bool per_vq_vectors, + enum vp_vq_vector_policy vector_policy, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue_info *vqi; int i, err, nvectors, allocated_vectors, queue_idx = 0; + bool per_vq_vectors; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; + per_vq_vectors = vector_policy != VP_VQ_VECTOR_POLICY_SHARED; + if (per_vq_vectors) { /* Best option: one for change interrupt, one per vq. 
*/ nvectors = 1; @@ -427,11 +435,13 @@ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, int err; /* Try MSI-X with one vector per queue. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, true, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, + VP_VQ_VECTOR_POLICY_EACH, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, false, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, + VP_VQ_VECTOR_POLICY_SHARED, desc); if (!err) return 0; /* Is there an interrupt? If not give up. */ -- cgit From b8b4e1a05d41c282e63a1772f00e376f6fb84410 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:43 +0200 Subject: virtio_pci: pass vector policy enum to vp_find_one_vq_msix() Instead of accessing vp_dev->per_vq_vectors, pass vector policy enum as an argument of vp_find_one_vq_msix() in preparation for another irq allocation policy. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-5-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 7fec2258d125..628f003db79b 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -292,11 +292,11 @@ enum vp_vq_vector_policy { VP_VQ_VECTOR_POLICY_SHARED, }; -static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, - int queue_idx, - vq_callback_t *callback, - const char *name, bool ctx, - int *allocated_vectors) +static struct virtqueue * +vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, + vq_callback_t *callback, const char *name, bool ctx, + int *allocated_vectors, + enum vp_vq_vector_policy vector_policy) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; @@ -305,7 +305,7 @@ static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, if (!callback) msix_vec = VIRTIO_MSI_NO_VECTOR; - else if (vp_dev->per_vq_vectors) + else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH) msix_vec = (*allocated_vectors)++; else msix_vec = VP_MSIX_VQ_VECTOR; @@ -313,7 +313,8 @@ static struct virtqueue *vp_find_one_vq_msix(struct virtio_device *vdev, if (IS_ERR(vq)) return vq; - if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR) + if (vector_policy == VP_VQ_VECTOR_POLICY_SHARED || + msix_vec == VIRTIO_MSI_NO_VECTOR) return vq; /* allocate per-vq irq if available and necessary */ @@ -374,7 +375,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, - &allocated_vectors); + &allocated_vectors, vector_policy); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; -- cgit From 06909a4427d6d57757c0a97a4a4be0aea363caab Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:44 +0200 Subject: virtio_pci: introduce vector allocation fallback for slow path virtqueues If there is not enough vectors to satisfy all virtqueues, currently the fallback is to use one vector for all virtqueues. That may be unnecessary in some cases, when there is enough vectors per data queues. Introduce another fallback policy that tries to allocate vector for all data queues, however for slow path queues (control/admin) it shares config vector. 
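A toy, self-contained illustration of the selection order follows; it is not the driver's code (which, as the diff below shows, simply retries the whole virtqueue setup with the next policy when the stricter one cannot be satisfied), just a compact way to see which budget each policy needs:

#include <stdio.h>

enum vq_vector_policy {            /* mirrors the intent, not the driver's enum values */
	POLICY_EACH,               /* one vector for config + one per virtqueue   */
	POLICY_SHARED_SLOW,        /* slow-path vqs share the config vector       */
	POLICY_SHARED,             /* one for config, one shared by all vqs       */
	POLICY_INTX,               /* no MSI-X at all                             */
};

/*
 * Pick the most granular policy the available vector budget can satisfy,
 * in the same order as the fallback chain described above.
 */
static enum vq_vector_policy pick_policy(int avail, int data_vqs, int slow_vqs)
{
	if (avail >= 1 + data_vqs + slow_vqs)
		return POLICY_EACH;
	if (avail >= 1 + data_vqs)
		return POLICY_SHARED_SLOW;
	if (avail >= 2)
		return POLICY_SHARED;
	return POLICY_INTX;
}

int main(void)
{
	/* 9 vectors, 8 data queues, 1 control queue: not enough for EACH (10),
	 * but enough to give every data queue its own vector.                 */
	printf("%d\n", pick_policy(9, 8, 1));   /* POLICY_SHARED_SLOW (1) */
	return 0;
}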
Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-6-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 53 +++++++++++++++++++++++++++++++++----- drivers/virtio/virtio_pci_common.h | 7 +++-- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 628f003db79b..cb8bbe6c1a9f 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -46,12 +46,26 @@ bool vp_notify(struct virtqueue *vq) return true; } +/* Notify all slow path virtqueues on an interrupt. */ +static void vp_vring_slow_path_interrupt(int irq, + struct virtio_pci_device *vp_dev) +{ + struct virtio_pci_vq_info *info; + unsigned long flags; + + spin_lock_irqsave(&vp_dev->lock, flags); + list_for_each_entry(info, &vp_dev->slow_virtqueues, node) + vring_interrupt(irq, info->vq); + spin_unlock_irqrestore(&vp_dev->lock, flags); +} + /* Handle a configuration change: Tell driver if it wants to know. */ static irqreturn_t vp_config_changed(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; virtio_config_changed(&vp_dev->vdev); + vp_vring_slow_path_interrupt(irq, vp_dev); return IRQ_HANDLED; } @@ -174,6 +188,11 @@ error: return err; } +static bool vp_is_slow_path_vector(u16 msix_vec) +{ + return msix_vec == VP_MSIX_CONFIG_VECTOR; +} + static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, @@ -197,7 +216,10 @@ static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int in info->vq = vq; if (callback) { spin_lock_irqsave(&vp_dev->lock, flags); - list_add(&info->node, &vp_dev->virtqueues); + if (!vp_is_slow_path_vector(msix_vec)) + list_add(&info->node, &vp_dev->virtqueues); + else + list_add(&info->node, &vp_dev->slow_virtqueues); spin_unlock_irqrestore(&vp_dev->lock, flags); } else { INIT_LIST_HEAD(&info->node); @@ -245,7 +267,8 @@ void vp_del_vqs(struct virtio_device *vdev) if (vp_dev->per_vq_vectors) { int v = vp_dev->vqs[vq->index]->msix_vector; - if (v != VIRTIO_MSI_NO_VECTOR) { + if (v != VIRTIO_MSI_NO_VECTOR && + !vp_is_slow_path_vector(v)) { int irq = pci_irq_vector(vp_dev->pci_dev, v); irq_update_affinity_hint(irq, NULL); @@ -289,13 +312,14 @@ void vp_del_vqs(struct virtio_device *vdev) enum vp_vq_vector_policy { VP_VQ_VECTOR_POLICY_EACH, + VP_VQ_VECTOR_POLICY_SHARED_SLOW, VP_VQ_VECTOR_POLICY_SHARED, }; static struct virtqueue * vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, const char *name, bool ctx, - int *allocated_vectors, + bool slow_path, int *allocated_vectors, enum vp_vq_vector_policy vector_policy) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -305,8 +329,13 @@ vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, if (!callback) msix_vec = VIRTIO_MSI_NO_VECTOR; - else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH) + else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH || + (vector_policy == VP_VQ_VECTOR_POLICY_SHARED_SLOW && + !slow_path)) msix_vec = (*allocated_vectors)++; + else if (vector_policy != VP_VQ_VECTOR_POLICY_EACH && + slow_path) + msix_vec = VP_MSIX_CONFIG_VECTOR; else msix_vec = VP_MSIX_VQ_VECTOR; vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec); @@ -314,7 +343,8 @@ vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, return vq; if (vector_policy == VP_VQ_VECTOR_POLICY_SHARED || - msix_vec == VIRTIO_MSI_NO_VECTOR) + msix_vec == 
VIRTIO_MSI_NO_VECTOR || + vp_is_slow_path_vector(msix_vec)) return vq; /* allocate per-vq irq if available and necessary */ @@ -374,7 +404,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, continue; } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, - vqi->name, vqi->ctx, + vqi->name, vqi->ctx, false, &allocated_vectors, vector_policy); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); @@ -440,6 +470,13 @@ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, VP_VQ_VECTOR_POLICY_EACH, desc); if (!err) return 0; + /* Fallback: MSI-X with one shared vector for config and + * slow path queues, one vector per queue for the rest. + */ + err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, + VP_VQ_VECTOR_POLICY_SHARED_SLOW, desc); + if (!err) + return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED, desc); @@ -493,7 +530,8 @@ const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index) struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!vp_dev->per_vq_vectors || - vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR) + vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR || + vp_is_slow_path_vector(vp_dev->vqs[index]->msix_vector)) return NULL; return pci_irq_get_affinity(vp_dev->pci_dev, @@ -601,6 +639,7 @@ static int virtio_pci_probe(struct pci_dev *pci_dev, vp_dev->vdev.dev.release = virtio_pci_release_dev; vp_dev->pci_dev = pci_dev; INIT_LIST_HEAD(&vp_dev->virtqueues); + INIT_LIST_HEAD(&vp_dev->slow_virtqueues); spin_lock_init(&vp_dev->lock); /* enable the device */ diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 3c4bb2d6163a..0cb6fdc8c354 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -35,7 +35,7 @@ struct virtio_pci_vq_info { /* the actual virtqueue */ struct virtqueue *vq; - /* the list node for the virtqueues list */ + /* the list node for the virtqueues or slow_virtqueues list */ struct list_head node; /* MSI-X vector (or none) */ @@ -66,9 +66,12 @@ struct virtio_pci_device { /* Where to read and clear interrupt */ u8 __iomem *isr; - /* a list of queues so we can dispatch IRQs */ + /* Lists of queues and potentially slow path queues + * so we can dispatch IRQs. + */ spinlock_t lock; struct list_head virtqueues; + struct list_head slow_virtqueues; /* Array of all virtqueues reported in the * PCI common config num_queues field -- cgit From 4199107e39122e49bbddafb50932943127b63aee Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:45 +0200 Subject: virtio_pci_modern: treat vp_dev->admin_vq.info.vq pointer as static It is guaranteed by the virtio_pci and PCI layers that none of the VFs is probed before setup_vq() is called for admin queue and after del_vq() is called for admin queue. Therefore treat vp_dev->admin_vq.info.vq as static, don't null it and don't take cmd lock during assign. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-7-jiri@resnulli.us> Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_pci_common.h | 2 +- drivers/virtio/virtio_pci_modern.c | 11 +---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 0cb6fdc8c354..aa43875b6353 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -45,7 +45,7 @@ struct virtio_pci_vq_info { struct virtio_pci_admin_vq { /* Virtqueue info associated with this admin queue. */ struct virtio_pci_vq_info info; - /* serializing admin commands execution and virtqueue deletion */ + /* serializing admin commands execution. */ struct mutex cmd_lock; u64 supported_cmds; /* Name of the admin queue: avq.$vq_index. */ diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 3b5b9499a53a..6d653bea9a87 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -580,11 +580,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, goto err; } - if (is_avq) { - mutex_lock(&vp_dev->admin_vq.cmd_lock); + if (is_avq) vp_dev->admin_vq.info.vq = vq; - mutex_unlock(&vp_dev->admin_vq.cmd_lock); - } return vq; @@ -620,12 +617,6 @@ static void del_vq(struct virtio_pci_vq_info *info) struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_modern_device *mdev = &vp_dev->mdev; - if (vp_is_avq(&vp_dev->vdev, vq->index)) { - mutex_lock(&vp_dev->admin_vq.cmd_lock); - vp_dev->admin_vq.info.vq = NULL; - mutex_unlock(&vp_dev->admin_vq.cmd_lock); - } - if (vp_dev->msix_enabled) vp_modern_queue_vector(mdev, vq->index, VIRTIO_MSI_NO_VECTOR); -- cgit From 7e5d9f556edca687acb04f171af1061042a35927 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:46 +0200 Subject: virtio: push out code to vp_avq_index() To prepare for the follow-up patch to use the code as an op, push out the code that gets admin virtqueue base index and count to a separate helper. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-8-jiri@resnulli.us> Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_pci_modern.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 6d653bea9a87..d704947b1aec 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -28,6 +28,21 @@ static u64 vp_get_features(struct virtio_device *vdev) return vp_modern_get_features(&vp_dev->mdev); } +static int vp_avq_index(struct virtio_device *vdev, u16 *index, u16 *num) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + *num = 0; + if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) + return 0; + + *num = vp_modern_avq_num(&vp_dev->mdev); + if (!(*num)) + return -EINVAL; + *index = vp_modern_avq_index(&vp_dev->mdev); + return 0; +} + static bool vp_is_avq(struct virtio_device *vdev, unsigned int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -729,19 +744,15 @@ static bool vp_get_shm_region(struct virtio_device *vdev, static int vp_modern_create_avq(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_admin_vq *avq; + struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; struct virtqueue *vq; - u16 admin_q_num; - - if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) - return 0; + u16 num; + int err; - admin_q_num = vp_modern_avq_num(&vp_dev->mdev); - if (!admin_q_num) - return -EINVAL; + err = vp_avq_index(vdev, &avq->vq_index, &num); + if (err || !num) + return err; - avq = &vp_dev->admin_vq; - avq->vq_index = vp_modern_avq_index(&vp_dev->mdev); sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_dev->setup_vq(vp_dev, &vp_dev->admin_vq.info, avq->vq_index, NULL, avq->name, NULL, VIRTIO_MSI_NO_VECTOR); -- cgit From 89a1c435aec269d109ed46ca7bbb10fa8edf7ace Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:47 +0200 Subject: virtio_pci: pass vq info as an argument to vp_setup_vq() Instead vp_setup_vq() storing vq info directly to vp_dev->vqs, let the caller provide a pointer to store the info to. This prepares vp_setup_vq() to be able to store admin queue info as well. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-9-jiri@resnulli.us> Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_pci_common.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index cb8bbe6c1a9f..7c5516ae1f8a 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -197,7 +197,8 @@ static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int in void (*callback)(struct virtqueue *vq), const char *name, bool ctx, - u16 msix_vec) + u16 msix_vec, + struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL); @@ -225,7 +226,7 @@ static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int in INIT_LIST_HEAD(&info->node); } - vp_dev->vqs[index] = info; + *p_info = info; return vq; out_info: @@ -320,7 +321,8 @@ static struct virtqueue * vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, const char *name, bool ctx, bool slow_path, int *allocated_vectors, - enum vp_vq_vector_policy vector_policy) + enum vp_vq_vector_policy vector_policy, + struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; @@ -338,7 +340,8 @@ vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, msix_vec = VP_MSIX_CONFIG_VECTOR; else msix_vec = VP_MSIX_VQ_VECTOR; - vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec); + vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec, + p_info); if (IS_ERR(vq)) return vq; @@ -405,7 +408,8 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, false, - &allocated_vectors, vector_policy); + &allocated_vectors, vector_policy, + &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; @@ -445,7 +449,7 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, } vqs[i] = vp_setup_vq(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, - VIRTIO_MSI_NO_VECTOR); + VIRTIO_MSI_NO_VECTOR, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto out_del_vqs; -- cgit From af22bbe1f4a514c80b89a27252beef033168f4e9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:48 +0200 Subject: virtio: create admin queues alongside other virtqueues Admin virtqueue is just another virtqueue nothing that special about it. The current implementation treats it somehow separate though in terms of creation and deletion. Unify the admin virtqueue creation and deletion flows to be aligned with the rest of virtqueues, creating it from vp_find_vqs_*() helpers. Let the admin virtqueue to be deleted by vp_del_vqs() as the rest. Call vp_find_one_vq_msix() with slow_path argument being "true" to make sure that in case of limited interrupt vectors the config vector is used for admin queue. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-10-jiri@resnulli.us> Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio.c | 28 ++--------------- drivers/virtio/virtio_pci_common.c | 43 ++++++++++++++++++++++++-- drivers/virtio/virtio_pci_common.h | 4 +-- drivers/virtio/virtio_pci_modern.c | 63 ++------------------------------------ include/linux/virtio_config.h | 4 --- 5 files changed, 46 insertions(+), 96 deletions(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 396d3cd49a1b..835cfbcb59c8 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -305,15 +305,9 @@ static int virtio_dev_probe(struct device *_d) if (err) goto err; - if (dev->config->create_avq) { - err = dev->config->create_avq(dev); - if (err) - goto err; - } - err = drv->probe(dev); if (err) - goto err_probe; + goto err; /* If probe didn't do it, mark device DRIVER_OK ourselves. */ if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) @@ -326,9 +320,6 @@ static int virtio_dev_probe(struct device *_d) return 0; -err_probe: - if (dev->config->destroy_avq) - dev->config->destroy_avq(dev); err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return err; @@ -344,9 +335,6 @@ static void virtio_dev_remove(struct device *_d) drv->remove(dev); - if (dev->config->destroy_avq) - dev->config->destroy_avq(dev); - /* Driver should have reset device. */ WARN_ON_ONCE(dev->config->get_status(dev)); @@ -524,9 +512,6 @@ int virtio_device_freeze(struct virtio_device *dev) } } - if (dev->config->destroy_avq) - dev->config->destroy_avq(dev); - return 0; } EXPORT_SYMBOL_GPL(virtio_device_freeze); @@ -562,16 +547,10 @@ int virtio_device_restore(struct virtio_device *dev) if (ret) goto err; - if (dev->config->create_avq) { - ret = dev->config->create_avq(dev); - if (ret) - goto err; - } - if (drv->restore) { ret = drv->restore(dev); if (ret) - goto err_restore; + goto err; } /* If restore didn't do it, mark device DRIVER_OK ourselves. 
*/ @@ -582,9 +561,6 @@ int virtio_device_restore(struct virtio_device *dev) return 0; -err_restore: - if (dev->config->destroy_avq) - dev->config->destroy_avq(dev); err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return ret; diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 7c5516ae1f8a..267643bb1cd5 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -262,9 +262,6 @@ void vp_del_vqs(struct virtio_device *vdev) int i; list_for_each_entry_safe(vq, n, &vdev->vqs, list) { - if (vp_dev->is_avq && vp_dev->is_avq(vdev, vq->index)) - continue; - if (vp_dev->per_vq_vectors) { int v = vp_dev->vqs[vq->index]->msix_vector; @@ -371,14 +368,23 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; struct virtqueue_info *vqi; int i, err, nvectors, allocated_vectors, queue_idx = 0; + struct virtqueue *vq; bool per_vq_vectors; + u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; + if (vp_dev->avq_index) { + err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); + if (err) + goto error_find; + } + per_vq_vectors = vector_policy != VP_VQ_VECTOR_POLICY_SHARED; if (per_vq_vectors) { @@ -415,6 +421,18 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, goto error_find; } } + + if (!avq_num) + return 0; + sprintf(avq->name, "avq.%u", avq->vq_index); + vq = vp_find_one_vq_msix(vdev, avq->vq_index, NULL, avq->name, false, + true, &allocated_vectors, vector_policy, + &vp_dev->admin_vq.info); + if (IS_ERR(vq)) { + err = PTR_ERR(vq); + goto error_find; + } + return 0; error_find: @@ -427,12 +445,21 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue_info vqs_info[]) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; int i, err, queue_idx = 0; + struct virtqueue *vq; + u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; + if (vp_dev->avq_index) { + err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); + if (err) + goto out_del_vqs; + } + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vp_dev); if (err) @@ -456,6 +483,16 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, } } + if (!avq_num) + return 0; + sprintf(avq->name, "avq.%u", avq->vq_index); + vq = vp_setup_vq(vdev, queue_idx++, NULL, avq->name, false, + VIRTIO_MSI_NO_VECTOR, &vp_dev->admin_vq.info); + if (IS_ERR(vq)) { + err = PTR_ERR(vq); + goto out_del_vqs; + } + return 0; out_del_vqs: vp_del_vqs(vdev); diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index aa43875b6353..de59bb06ec3c 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -44,7 +44,7 @@ struct virtio_pci_vq_info { struct virtio_pci_admin_vq { /* Virtqueue info associated with this admin queue. */ - struct virtio_pci_vq_info info; + struct virtio_pci_vq_info *info; /* serializing admin commands execution. 
*/ struct mutex cmd_lock; u64 supported_cmds; @@ -105,7 +105,7 @@ struct virtio_pci_device { void (*del_vq)(struct virtio_pci_vq_info *info); u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector); - bool (*is_avq)(struct virtio_device *vdev, unsigned int index); + int (*avq_index)(struct virtio_device *vdev, u16 *index, u16 *num); }; /* Constants for MSI-X */ diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index d704947b1aec..5ceb4b2c18df 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -63,7 +63,7 @@ static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, struct virtqueue *vq; int ret, len; - vq = admin_vq->info.vq; + vq = admin_vq->info->vq; if (!vq) return -EIO; @@ -203,27 +203,12 @@ end: static void vp_modern_avq_activate(struct virtio_device *vdev) { - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_admin_vq *admin_vq = &vp_dev->admin_vq; - if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) return; - __virtqueue_unbreak(admin_vq->info.vq); virtio_pci_admin_cmd_list_init(vdev); } -static void vp_modern_avq_deactivate(struct virtio_device *vdev) -{ - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_admin_vq *admin_vq = &vp_dev->admin_vq; - - if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) - return; - - __virtqueue_break(admin_vq->info.vq); -} - static void vp_transport_features(struct virtio_device *vdev, u64 features) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -418,8 +403,6 @@ static void vp_reset(struct virtio_device *vdev) while (vp_modern_get_status(mdev)) msleep(1); - vp_modern_avq_deactivate(vdev); - /* Flush pending VQ/configuration callbacks. */ vp_synchronize_vectors(vdev); } @@ -595,9 +578,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, goto err; } - if (is_avq) - vp_dev->admin_vq.info.vq = vq; - return vq; err: @@ -741,41 +721,6 @@ static bool vp_get_shm_region(struct virtio_device *vdev, return true; } -static int vp_modern_create_avq(struct virtio_device *vdev) -{ - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; - struct virtqueue *vq; - u16 num; - int err; - - err = vp_avq_index(vdev, &avq->vq_index, &num); - if (err || !num) - return err; - - sprintf(avq->name, "avq.%u", avq->vq_index); - vq = vp_dev->setup_vq(vp_dev, &vp_dev->admin_vq.info, avq->vq_index, NULL, - avq->name, NULL, VIRTIO_MSI_NO_VECTOR); - if (IS_ERR(vq)) { - dev_err(&vdev->dev, "failed to setup admin virtqueue, err=%ld", - PTR_ERR(vq)); - return PTR_ERR(vq); - } - - vp_modern_set_queue_enable(&vp_dev->mdev, avq->info.vq->index, true); - return 0; -} - -static void vp_modern_destroy_avq(struct virtio_device *vdev) -{ - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - - if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) - return; - - vp_dev->del_vq(&vp_dev->admin_vq.info); -} - static const struct virtio_config_ops virtio_pci_config_nodev_ops = { .get = NULL, .set = NULL, @@ -794,8 +739,6 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = { .get_shm_region = vp_get_shm_region, .disable_vq_and_reset = vp_modern_disable_vq_and_reset, .enable_vq_after_reset = vp_modern_enable_vq_after_reset, - .create_avq = vp_modern_create_avq, - .destroy_avq = vp_modern_destroy_avq, }; static const struct virtio_config_ops virtio_pci_config_ops = { @@ -816,8 +759,6 @@ static const struct virtio_config_ops virtio_pci_config_ops = { 
.get_shm_region = vp_get_shm_region, .disable_vq_and_reset = vp_modern_disable_vq_and_reset, .enable_vq_after_reset = vp_modern_enable_vq_after_reset, - .create_avq = vp_modern_create_avq, - .destroy_avq = vp_modern_destroy_avq, }; /* the PCI probing function */ @@ -841,7 +782,7 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) vp_dev->config_vector = vp_config_vector; vp_dev->setup_vq = setup_vq; vp_dev->del_vq = del_vq; - vp_dev->is_avq = vp_is_avq; + vp_dev->avq_index = vp_avq_index; vp_dev->isr = mdev->isr; vp_dev->vdev.id = mdev->id; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index ab4b9a3fef6b..169c7d367fac 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -104,8 +104,6 @@ struct virtqueue_info { * Returns 0 on success or error status * If disable_vq_and_reset is set, then enable_vq_after_reset must also be * set. - * @create_avq: create admin virtqueue resource. - * @destroy_avq: destroy admin virtqueue resource. */ struct virtio_config_ops { void (*get)(struct virtio_device *vdev, unsigned offset, @@ -133,8 +131,6 @@ struct virtio_config_ops { struct virtio_shm_region *region, u8 id); int (*disable_vq_and_reset)(struct virtqueue *vq); int (*enable_vq_after_reset)(struct virtqueue *vq); - int (*create_avq)(struct virtio_device *vdev); - void (*destroy_avq)(struct virtio_device *vdev); }; /* If driver didn't advertise the feature, it will never appear. */ -- cgit From b00c4150c435eed6e80ab302ae3a02a3d41460c3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:49 +0200 Subject: virtio_pci_modern: create admin queue of queried size Don't limit the admin queue size to VIRTIO_AVQ_SGS_MAX and rather rely on the queried queue size. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-11-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 5ceb4b2c18df..a649c9dc436d 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -550,8 +550,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, if (index >= vp_modern_get_num_queues(mdev) && !is_avq) return ERR_PTR(-EINVAL); - num = is_avq ? - VIRTIO_AVQ_SGS_MAX : vp_modern_get_queue_size(mdev, index); + num = vp_modern_get_queue_size(mdev, index); /* Check if queue is either not available or already active. */ if (!num || vp_modern_get_queue_enable(mdev, index)) return ERR_PTR(-ENOENT); -- cgit From 7090f2b5ad33e9b6ba68eb927b02e8a286d21fb4 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:50 +0200 Subject: virtio_pci_modern: pass cmd as an identification token In preparation to asynchronous admin queue processing, pass cmd pointer as a data arg to virtqueue_add_sgs(). Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-12-jiri@resnulli.us> Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_pci_modern.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index a649c9dc436d..0fd344d1eaf9 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -58,7 +58,7 @@ static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, struct scatterlist **sgs, unsigned int out_num, unsigned int in_num, - void *data) + struct virtio_admin_cmd *cmd) { struct virtqueue *vq; int ret, len; @@ -72,7 +72,7 @@ static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, !((1ULL << opcode) & admin_vq->supported_cmds)) return -EOPNOTSUPP; - ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, data, GFP_KERNEL); + ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, GFP_KERNEL); if (ret < 0) return -EIO; @@ -140,7 +140,7 @@ int vp_modern_admin_cmd_exec(struct virtio_device *vdev, mutex_lock(&vp_dev->admin_vq.cmd_lock); ret = virtqueue_exec_admin_cmd(&vp_dev->admin_vq, le16_to_cpu(cmd->opcode), - sgs, out_num, in_num, sgs); + sgs, out_num, in_num, cmd); mutex_unlock(&vp_dev->admin_vq.cmd_lock); if (ret) { -- cgit From 4c3b54af907e709609d3d8beca92d65e2f0cfd83 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:51 +0200 Subject: virtio_pci_modern: use completion instead of busy loop to wait on admin cmd result Currently, the code waits in a busy loop on every admin virtqueue issued command to get a reply. That prevents callers from issuing multiple commands in parallel. To overcome this limitation, introduce a virtqueue event callback for admin virtqueue. For every issued command, use completion mechanism to wait on a reply. In the event callback, trigger the completion is done for every incoming reply. Alongside with that, introduce a spin lock to protect the admin virtqueue operations. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-13-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 13 ++++--- drivers/virtio/virtio_pci_common.h | 3 ++ drivers/virtio/virtio_pci_modern.c | 74 ++++++++++++++++++++++++++++++++------ include/linux/virtio.h | 3 ++ 4 files changed, 77 insertions(+), 16 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 267643bb1cd5..c44d8ba00c02 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -395,6 +395,8 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, if (vqi->name && vqi->callback) ++nvectors; } + if (avq_num && vector_policy == VP_VQ_VECTOR_POLICY_EACH) + ++nvectors; } else { /* Second best: one for change, shared for all vqs. 
*/ nvectors = 2; @@ -425,9 +427,9 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); - vq = vp_find_one_vq_msix(vdev, avq->vq_index, NULL, avq->name, false, - true, &allocated_vectors, vector_policy, - &vp_dev->admin_vq.info); + vq = vp_find_one_vq_msix(vdev, avq->vq_index, vp_modern_avq_done, + avq->name, false, true, &allocated_vectors, + vector_policy, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto error_find; @@ -486,8 +488,9 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); - vq = vp_setup_vq(vdev, queue_idx++, NULL, avq->name, false, - VIRTIO_MSI_NO_VECTOR, &vp_dev->admin_vq.info); + vq = vp_setup_vq(vdev, queue_idx++, vp_modern_avq_done, avq->name, + false, VIRTIO_MSI_NO_VECTOR, + &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto out_del_vqs; diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index de59bb06ec3c..90df381fbbcf 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -47,6 +47,8 @@ struct virtio_pci_admin_vq { struct virtio_pci_vq_info *info; /* serializing admin commands execution. */ struct mutex cmd_lock; + /* Protects virtqueue access. */ + spinlock_t lock; u64 supported_cmds; /* Name of the admin queue: avq.$vq_index. */ char name[10]; @@ -178,6 +180,7 @@ struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev); #define VIRTIO_ADMIN_CMD_BITMAP 0 #endif +void vp_modern_avq_done(struct virtqueue *vq); int vp_modern_admin_cmd_exec(struct virtio_device *vdev, struct virtio_admin_cmd *cmd); diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 0fd344d1eaf9..608df3263df1 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -53,6 +53,23 @@ static bool vp_is_avq(struct virtio_device *vdev, unsigned int index) return index == vp_dev->admin_vq.vq_index; } +void vp_modern_avq_done(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_admin_vq *admin_vq = &vp_dev->admin_vq; + struct virtio_admin_cmd *cmd; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&admin_vq->lock, flags); + do { + virtqueue_disable_cb(vq); + while ((cmd = virtqueue_get_buf(vq, &len))) + complete(&cmd->completion); + } while (!virtqueue_enable_cb(vq)); + spin_unlock_irqrestore(&admin_vq->lock, flags); +} + static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, u16 opcode, struct scatterlist **sgs, @@ -61,7 +78,8 @@ static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, struct virtio_admin_cmd *cmd) { struct virtqueue *vq; - int ret, len; + unsigned long flags; + int ret; vq = admin_vq->info->vq; if (!vq) @@ -72,21 +90,33 @@ static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, !((1ULL << opcode) & admin_vq->supported_cmds)) return -EOPNOTSUPP; - ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, GFP_KERNEL); - if (ret < 0) - return -EIO; + init_completion(&cmd->completion); - if (unlikely(!virtqueue_kick(vq))) +again: + if (virtqueue_is_broken(vq)) return -EIO; - while (!virtqueue_get_buf(vq, &len) && - !virtqueue_is_broken(vq)) - cpu_relax(); + spin_lock_irqsave(&admin_vq->lock, flags); + ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, GFP_KERNEL); + if (ret < 0) { + if (ret == -ENOSPC) { 
+ spin_unlock_irqrestore(&admin_vq->lock, flags); + cpu_relax(); + goto again; + } + goto unlock_err; + } + if (!virtqueue_kick(vq)) + goto unlock_err; + spin_unlock_irqrestore(&admin_vq->lock, flags); - if (virtqueue_is_broken(vq)) - return -EIO; + wait_for_completion(&cmd->completion); - return 0; + return cmd->ret; + +unlock_err: + spin_unlock_irqrestore(&admin_vq->lock, flags); + return -EIO; } int vp_modern_admin_cmd_exec(struct virtio_device *vdev, @@ -209,6 +239,25 @@ static void vp_modern_avq_activate(struct virtio_device *vdev) virtio_pci_admin_cmd_list_init(vdev); } +static void vp_modern_avq_cleanup(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_admin_cmd *cmd; + struct virtqueue *vq; + + if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) + return; + + vq = vp_dev->vqs[vp_dev->admin_vq.vq_index]->vq; + if (!vq) + return; + + while ((cmd = virtqueue_detach_unused_buf(vq))) { + cmd->ret = -EIO; + complete(&cmd->completion); + } +} + static void vp_transport_features(struct virtio_device *vdev, u64 features) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -403,6 +452,8 @@ static void vp_reset(struct virtio_device *vdev) while (vp_modern_get_status(mdev)) msleep(1); + vp_modern_avq_cleanup(vdev); + /* Flush pending VQ/configuration callbacks. */ vp_synchronize_vectors(vdev); } @@ -785,6 +836,7 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) vp_dev->isr = mdev->isr; vp_dev->vdev.id = mdev->id; + spin_lock_init(&vp_dev->admin_vq.lock); mutex_init(&vp_dev->admin_vq.cmd_lock); return 0; } diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 96fea920873b..999ff5934392 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -10,6 +10,7 @@ #include #include #include +#include /** * struct virtqueue - a queue to register buffers for sending or receiving. @@ -109,6 +110,8 @@ struct virtio_admin_cmd { __le64 group_member_id; struct scatterlist *data_sg; struct scatterlist *result_sg; + struct completion completion; + int ret; }; /** -- cgit From 6d834691da474ed1c648753d3d3a3ef8379fa1c1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jul 2024 13:35:52 +0200 Subject: virtio_pci_modern: remove admin queue serialization lock The admin queue operations are protected by newly introduced spin lock. To make it possible to issue parallel commands, remove the admin queue serialization lock. Signed-off-by: Jiri Pirko Message-Id: <20240716113552.80599-14-jiri@resnulli.us> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.h | 2 -- drivers/virtio/virtio_pci_modern.c | 5 ----- 2 files changed, 7 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 90df381fbbcf..1d9c49947f52 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -45,8 +45,6 @@ struct virtio_pci_vq_info { struct virtio_pci_admin_vq { /* Virtqueue info associated with this admin queue. */ struct virtio_pci_vq_info *info; - /* serializing admin commands execution. */ - struct mutex cmd_lock; /* Protects virtqueue access. 
*/ spinlock_t lock; u64 supported_cmds; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 608df3263df1..9193c30d640a 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -167,12 +167,9 @@ int vp_modern_admin_cmd_exec(struct virtio_device *vdev, in_num++; } - mutex_lock(&vp_dev->admin_vq.cmd_lock); ret = virtqueue_exec_admin_cmd(&vp_dev->admin_vq, le16_to_cpu(cmd->opcode), sgs, out_num, in_num, cmd); - mutex_unlock(&vp_dev->admin_vq.cmd_lock); - if (ret) { dev_err(&vdev->dev, "Failed to execute command on admin vq: %d\n.", ret); @@ -837,7 +834,6 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) vp_dev->vdev.id = mdev->id; spin_lock_init(&vp_dev->admin_vq.lock); - mutex_init(&vp_dev->admin_vq.cmd_lock); return 0; } @@ -845,6 +841,5 @@ void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) { struct virtio_pci_modern_device *mdev = &vp_dev->mdev; - mutex_destroy(&vp_dev->admin_vq.cmd_lock); vp_modern_remove(mdev); } -- cgit From a0328b397f3339d8d17a6ec356e94b3c110b010c Mon Sep 17 00:00:00 2001 From: Foryun Ma Date: Tue, 4 Jun 2024 11:21:51 +0800 Subject: cxl/core/pci: Move reading of control register to immediately before usage Relocate the reading of the DVSEC control register to immediately before usage and avoid unnecessary PCI config access from the read if DVSEC capability check, hdm_count check, or device validity check results in failure. Signed-off-by: Foryun Ma Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20240604032151.655-1-foryun.ma@jaguarmicro.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 8567dd11eaac..a663e7566c48 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -338,10 +338,6 @@ int cxl_dvsec_rr_decode(struct device *dev, int d, if (rc) return rc; - rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); - if (rc) - return rc; - if (!(cap & CXL_DVSEC_MEM_CAPABLE)) { dev_dbg(dev, "Not MEM Capable\n"); return -ENXIO; @@ -368,6 +364,10 @@ int cxl_dvsec_rr_decode(struct device *dev, int d, * disabled, and they will remain moot after the HDM Decoder * capability is enabled. */ + rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + if (rc) + return rc; + info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl); if (!info->mem_enabled) return 0; -- cgit From e3615bd198289f319172c428f20857accb46b830 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 16 Jul 2024 12:49:25 -0400 Subject: drm/amd/display: fix corruption with high refresh rates on DCN 3.0 This reverts commit bc87d666c05a13e6d4ae1ddce41fc43d2567b9a2 and the register changes from commit 6d4279cb99ac4f51d10409501d29969f687ac8dc. 
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3412 Cc: mikhail.v.gavrilov@gmail.com Cc: Rodrigo Siqueira Tested-by: Mikhail Gavrilov Reviewed-by: Rodrigo Siqueira Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.10.x --- drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c | 15 +++------------ drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c | 10 ++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c index 336488c0574e..94427875bcdd 100644 --- a/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c +++ b/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c @@ -945,19 +945,10 @@ void optc1_set_drr( OTG_FORCE_LOCK_ON_EVENT, 0, OTG_SET_V_TOTAL_MIN_MASK_EN, 0, OTG_SET_V_TOTAL_MIN_MASK, 0); - - // Setup manual flow control for EOF via TRIG_A - optc->funcs->setup_manual_trigger(optc); - - } else { - REG_UPDATE_4(OTG_V_TOTAL_CONTROL, - OTG_SET_V_TOTAL_MIN_MASK, 0, - OTG_V_TOTAL_MIN_SEL, 0, - OTG_V_TOTAL_MAX_SEL, 0, - OTG_FORCE_LOCK_ON_EVENT, 0); - - optc->funcs->set_vtotal_min_max(optc, 0, 0); } + + // Setup manual flow control for EOF via TRIG_A + optc->funcs->setup_manual_trigger(optc); } void optc1_set_vtotal_min_max(struct timing_generator *optc, int vtotal_min, int vtotal_max) diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c index 43417cff2c9b..b4694985a40a 100644 --- a/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c +++ b/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c @@ -453,6 +453,16 @@ void optc2_setup_manual_trigger(struct timing_generator *optc) { struct optc *optc1 = DCN10TG_FROM_TG(optc); + /* Set the min/max selectors unconditionally so that + * DMCUB fw may change OTG timings when necessary + * TODO: Remove the w/a after fixing the issue in DMCUB firmware + */ + REG_UPDATE_4(OTG_V_TOTAL_CONTROL, + OTG_V_TOTAL_MIN_SEL, 1, + OTG_V_TOTAL_MAX_SEL, 1, + OTG_FORCE_LOCK_ON_EVENT, 0, + OTG_SET_V_TOTAL_MIN_MASK, (1 << 1)); /* TRIGA */ + REG_SET_8(OTG_TRIGA_CNTL, 0, OTG_TRIGA_SOURCE_SELECT, 21, OTG_TRIGA_SOURCE_PIPE_SELECT, optc->inst, -- cgit From f0d17d696dfce77c9abc830e4ac2d677890a2dad Mon Sep 17 00:00:00 2001 From: Tatsunosuke Tobita Date: Tue, 9 Jul 2024 14:57:28 +0900 Subject: HID: wacom: Modify pen IDs The pen ID, 0x80842, was not the correct ID for wacom driver to treat. The ID was corrected to 0x8842. Also, 0x4200 was not the expected ID used on any Wacom device. Therefore, 0x4200 was removed. 
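For orientation, the corrected ID 0x8842 simply falls into the pen branch of the driver's tool-ID lookup, and the second wacom patch further down generalizes the pen/eraser split by testing the low nibble of the tool ID. A minimal sketch of that convention, with the 0x0008 mask taken from that follow-up change (illustrative only, not part of this patch):

#include <stdbool.h>

static bool tool_id_is_eraser(int tool_id)
{
	/* Bit 3 of the tool ID marks the eraser variant of recent styli. */
	return tool_id & 0x0008;
}

With this convention 0x8842 (low nibble 0x2) is reported as a pen, while the eraser IDs listed in the follow-up patch (0x84a, 0x80a, ...) all have bit 3 set.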
Signed-off-by: Tatsunosuke Tobita Signed-off-by: Tatsunosuke Tobita Fixes: bfdc750c4cb2 ("HID: wacom: add three styli to wacom_intuos_get_tool_type") Cc: stable@kernel.org #6.2 Reviewed-by: Ping Cheng Link: https://patch.msgid.link/20240709055729.17158-1-tatsunosuke.wacom@gmail.com Signed-off-by: Benjamin Tissoires --- drivers/hid/wacom_wac.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index a44367aef621..20de97ce0f5e 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -714,13 +714,12 @@ static int wacom_intuos_get_tool_type(int tool_id) case 0x8e2: /* IntuosHT2 pen */ case 0x022: case 0x200: /* Pro Pen 3 */ - case 0x04200: /* Pro Pen 3 */ case 0x10842: /* MobileStudio Pro Pro Pen slim */ case 0x14802: /* Intuos4/5 13HD/24HD Classic Pen */ case 0x16802: /* Cintiq 13HD Pro Pen */ case 0x18802: /* DTH2242 Pen */ case 0x10802: /* Intuos4/5 13HD/24HD General Pen */ - case 0x80842: /* Intuos Pro and Cintiq Pro 3D Pen */ + case 0x8842: /* Intuos Pro and Cintiq Pro 3D Pen */ tool_type = BTN_TOOL_PEN; break; -- cgit From 9c2913b962daf3e5a947babf93f2125765eeca09 Mon Sep 17 00:00:00 2001 From: Tatsunosuke Tobita Date: Tue, 9 Jul 2024 14:57:29 +0900 Subject: HID: wacom: more appropriate tool type categorization Recent eraser and pen tools are able to be distinguished by looking at its least significant nibble. This modification applies it to wacom_intuos_get_tool_type function and reduces its code lines. Signed-off-by: Tatsunosuke Tobita Signed-off-by: Tatsunosuke Tobita Reviewed-by: Ping Cheng Link: https://patch.msgid.link/20240709055729.17158-2-tatsunosuke.wacom@gmail.com Signed-off-by: Benjamin Tissoires --- drivers/hid/wacom_wac.c | 66 ++++++++----------------------------------------- 1 file changed, 10 insertions(+), 56 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 20de97ce0f5e..1f4564982b95 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -692,77 +692,28 @@ static bool wacom_is_art_pen(int tool_id) static int wacom_intuos_get_tool_type(int tool_id) { - int tool_type = BTN_TOOL_PEN; - - if (wacom_is_art_pen(tool_id)) - return tool_type; - switch (tool_id) { case 0x812: /* Inking pen */ case 0x801: /* Intuos3 Inking pen */ case 0x12802: /* Intuos4/5 Inking Pen */ case 0x012: - tool_type = BTN_TOOL_PENCIL; - break; - - case 0x822: /* Pen */ - case 0x842: - case 0x852: - case 0x823: /* Intuos3 Grip Pen */ - case 0x813: /* Intuos3 Classic Pen */ - case 0x802: /* Intuos4/5 13HD/24HD General Pen */ - case 0x8e2: /* IntuosHT2 pen */ - case 0x022: - case 0x200: /* Pro Pen 3 */ - case 0x10842: /* MobileStudio Pro Pro Pen slim */ - case 0x14802: /* Intuos4/5 13HD/24HD Classic Pen */ - case 0x16802: /* Cintiq 13HD Pro Pen */ - case 0x18802: /* DTH2242 Pen */ - case 0x10802: /* Intuos4/5 13HD/24HD General Pen */ - case 0x8842: /* Intuos Pro and Cintiq Pro 3D Pen */ - tool_type = BTN_TOOL_PEN; - break; + return BTN_TOOL_PENCIL; case 0x832: /* Stroke pen */ case 0x032: - tool_type = BTN_TOOL_BRUSH; - break; + return BTN_TOOL_BRUSH; case 0x007: /* Mouse 4D and 2D */ case 0x09c: case 0x094: case 0x017: /* Intuos3 2D Mouse */ case 0x806: /* Intuos4 Mouse */ - tool_type = BTN_TOOL_MOUSE; - break; + return BTN_TOOL_MOUSE; case 0x096: /* Lens cursor */ case 0x097: /* Intuos3 Lens cursor */ case 0x006: /* Intuos4 Lens cursor */ - tool_type = BTN_TOOL_LENS; - break; - - case 0x82a: /* Eraser */ - case 0x84a: - case 0x85a: - case 0x91a: - case 0xd1a: - case 0x0fa: - case 
0x82b: /* Intuos3 Grip Pen Eraser */ - case 0x81b: /* Intuos3 Classic Pen Eraser */ - case 0x91b: /* Intuos3 Airbrush Eraser */ - case 0x80c: /* Intuos4/5 13HD/24HD Marker Pen Eraser */ - case 0x80a: /* Intuos4/5 13HD/24HD General Pen Eraser */ - case 0x90a: /* Intuos4/5 13HD/24HD Airbrush Eraser */ - case 0x1480a: /* Intuos4/5 13HD/24HD Classic Pen Eraser */ - case 0x1090a: /* Intuos4/5 13HD/24HD Airbrush Eraser */ - case 0x1080c: /* Intuos4/5 13HD/24HD Art Pen Eraser */ - case 0x1084a: /* MobileStudio Pro Pro Pen slim Eraser */ - case 0x1680a: /* Cintiq 13HD Pro Pen Eraser */ - case 0x1880a: /* DTH2242 Eraser */ - case 0x1080a: /* Intuos4/5 13HD/24HD General Pen Eraser */ - tool_type = BTN_TOOL_RUBBER; - break; + return BTN_TOOL_LENS; case 0xd12: case 0x912: @@ -770,10 +721,13 @@ static int wacom_intuos_get_tool_type(int tool_id) case 0x913: /* Intuos3 Airbrush */ case 0x902: /* Intuos4/5 13HD/24HD Airbrush */ case 0x10902: /* Intuos4/5 13HD/24HD Airbrush */ - tool_type = BTN_TOOL_AIRBRUSH; - break; + return BTN_TOOL_AIRBRUSH; + + default: + if (tool_id & 0x0008) + return BTN_TOOL_RUBBER; + return BTN_TOOL_PEN; } - return tool_type; } static void wacom_exit_report(struct wacom_wac *wacom) -- cgit From f2038c12e8133bf4c6bd4d1127a23310d55d9e21 Mon Sep 17 00:00:00 2001 From: Venkata Prasad Potturu Date: Thu, 18 Jul 2024 11:50:02 +0530 Subject: ASoC: sof: amd: fix for firmware reload failure in Vangogh platform Setting ACP ACLK as clock source when ACP enters D0 state causing firmware load failure, as per design clock source should be internal clock. Remove acp_clkmux_sel field so that ACP will use internal clock source when ACP enters into D0 state. Fixes: d0dab6b76a9f ("ASoC: SOF: amd: Add sof support for vangogh platform") Signed-off-by: Venkata Prasad Potturu Link: https://patch.msgid.link/20240718062004.581685-1-venkataprasad.potturu@amd.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/pci-vangogh.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/sof/amd/pci-vangogh.c b/sound/soc/sof/amd/pci-vangogh.c index 16eb2994fbab..eba580840100 100644 --- a/sound/soc/sof/amd/pci-vangogh.c +++ b/sound/soc/sof/amd/pci-vangogh.c @@ -34,7 +34,6 @@ static const struct sof_amd_acp_desc vangogh_chip_info = { .dsp_intr_base = ACP5X_DSP_SW_INTR_BASE, .sram_pte_offset = ACP5X_SRAM_PTE_OFFSET, .hw_semaphore_offset = ACP5X_AXI2DAGB_SEM_0, - .acp_clkmux_sel = ACP5X_CLKMUX_SEL, .probe_reg_offset = ACP5X_FUTURE_REG_ACLK_0, }; -- cgit From 5170dae5591036dba7daa519ea3126169300e275 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 17 Jul 2024 10:59:48 +0100 Subject: dt-bindings: trivial-devices: fix Rohm BH2228FV compatible string When Maxime originally added the BH2228FV to the spidev driver, he spelt it incorrectly - the d should have been a b. That spelling was then propagated to the binding when written by Krzysztof. Add a new, correctly spelt compatible and advise against using the incorrectly spelling. 
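For reference, the companion spidev change (next in this series) keeps both spellings in its match table, so the at least one board that shipped with the misspelled string keeps probing while new users are steered to the correct one. A condensed view of that table, taken from the follow-up patch and shown here only to illustrate why both entries coexist:

static const struct of_device_id spidev_dt_ids[] = {
	{ .compatible = "rohm,bh2228fv", .data = &spidev_of_check },	/* correct spelling */
	{ .compatible = "rohm,dh2228fv", .data = &spidev_of_check },	/* kept for existing users */
	/* ... */
};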
Fixes: 025aea27732d ("dt-bindings: trivial-devices: document SPI dev compatibles") Signed-off-by: Conor Dooley Acked-by: Maxime Ripard Link: https://patch.msgid.link/20240717-exuberant-enlighten-f890fabcd247@spud Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/trivial-devices.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index 0a419453d183..5d9de7f68afc 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -318,7 +318,9 @@ properties: - renesas,hs3001 # Renesas ISL29501 time-of-flight sensor - renesas,isl29501 - # Rohm DH2228FV + # Rohm BH2228FV 8 channel DAC + - rohm,bh2228fv + # Rohm DH2228FV - This device does not exist, use rohm,bh2228fv instead. - rohm,dh2228fv # S524AD0XF1 (128K/256K-bit Serial EEPROM for Low Power) - samsung,24ad0xd1 -- cgit From fc28d1c1fe3b3e2fbc50834c8f73dda72f6af9fc Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 17 Jul 2024 10:59:49 +0100 Subject: spi: spidev: add correct compatible for Rohm BH2228FV When Maxime originally added the BH2228FV to the spidev driver, he spelt it incorrectly - the d should have been a b. Add the correctly spelt compatible to the driver. Although the majority of users of this compatible are abusers, there is at least one board that validly uses the incorrect spelt compatible, so keep it in the driver to avoid breaking the few real users it has. Fixes: 8fad805bdc52 ("spi: spidev: Add Rohm DH2228FV DAC compatible string") Signed-off-by: Conor Dooley Acked-by: Maxime Ripard Link: https://patch.msgid.link/20240717-ventricle-strewn-a7678c509e85@spud Signed-off-by: Mark Brown --- drivers/spi/spidev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index 95fb5f1c91c1..05e6d007f9a7 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -734,6 +734,7 @@ static const struct of_device_id spidev_dt_ids[] = { { .compatible = "lwn,bk4", .data = &spidev_of_check }, { .compatible = "menlo,m53cpld", .data = &spidev_of_check }, { .compatible = "micron,spi-authenta", .data = &spidev_of_check }, + { .compatible = "rohm,bh2228fv", .data = &spidev_of_check }, { .compatible = "rohm,dh2228fv", .data = &spidev_of_check }, { .compatible = "semtech,sx1301", .data = &spidev_of_check }, { .compatible = "silabs,em3581", .data = &spidev_of_check }, -- cgit From 408c2f14a5d3d7ac4824b96e52693ab271efb738 Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Thu, 11 Jul 2024 14:12:03 -0700 Subject: drm/xe/exec: Fix minor bug related to xe_sync_entry_cleanup Increment num_syncs after xe_sync_entry_parse() is successful to ensure the xe_sync_entry_cleanup() logic under "err_syncs" label works correctly. 
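The underlying pattern is generic: count an entry only once its init has succeeded, then unwind exactly that many entries on failure. A stripped-down sketch of the shape the fix moves to, with placeholder item_init()/item_fini() helpers standing in for the xe sync calls (not the actual xe API):

struct item { void *priv; };

static int item_init(struct item *it)  { it->priv = NULL; return 0; }	/* placeholder */
static void item_fini(struct item *it) { it->priv = NULL; }		/* placeholder */

static int init_all(struct item *items, unsigned int n)
{
	unsigned int done;
	int err = 0;

	for (done = 0; done < n; done++) {
		err = item_init(&items[done]);
		if (err)
			goto unwind;	/* items[done] itself was never set up */
	}
	return 0;

unwind:
	while (done--)			/* clean up only the successful inits */
		item_fini(&items[done]);
	return err;
}

Incrementing the counter before the init call, as the old code did, makes the unwind loop touch one entry that was never successfully parsed.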
v2: Use the same pattern as that in xe_vm.c (Matt Brost) Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Ashutosh Dixit Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240711211203.3728180-1-ashutosh.dixit@intel.com (cherry picked from commit 43a6faa6d9b5e9139758200a79fe9c8f4aaa0c8d) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_exec.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index 2d72cdec3a0b..f36980aa26e6 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -118,7 +118,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) u64 addresses[XE_HW_ENGINE_MAX_INSTANCE]; struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn}; struct drm_exec *exec = &vm_exec.exec; - u32 i, num_syncs = 0, num_ufence = 0; + u32 i, num_syncs, num_ufence = 0; struct xe_sched_job *job; struct xe_vm *vm; bool write_locked, skip_retry = false; @@ -156,15 +156,15 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) vm = q->vm; - for (i = 0; i < args->num_syncs; i++) { - err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++], - &syncs_user[i], SYNC_PARSE_FLAG_EXEC | + for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) { + err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs], + &syncs_user[num_syncs], SYNC_PARSE_FLAG_EXEC | (xe_vm_in_lr_mode(vm) ? SYNC_PARSE_FLAG_LR_MODE : 0)); if (err) goto err_syncs; - if (xe_sync_is_ufence(&syncs[i])) + if (xe_sync_is_ufence(&syncs[num_syncs])) num_ufence++; } @@ -325,8 +325,8 @@ err_unlock_list: if (err == -EAGAIN && !skip_retry) goto retry; err_syncs: - for (i = 0; i < num_syncs; i++) - xe_sync_entry_cleanup(&syncs[i]); + while (num_syncs--) + xe_sync_entry_cleanup(&syncs[num_syncs]); kfree(syncs); err_exec_queue: xe_exec_queue_put(q); -- cgit From bf07ca963d4fd11c88a9d4b058f2bd62e8d46a98 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 11 Jul 2024 21:23:19 +0200 Subject: drm/xe/pf: Limit fair VF LMEM provisioning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Due to the current design of the BO and VRAM manager, any object with XE_BO_FLAG_PINNED flag, which the PF driver uses during VF LMEM provisionining, is created with the TTM_PL_FLAG_CONTIGUOUS flag, which may cause VRAM fragmentation that prevents subsequent allocations of larger objects, like fair VF LMEM provisioning. To avoid such failures, round down fair VF LMEM provisioning size to next power of two size, to compensate what xe_ttm_vram_mgr is doing to achieve contiguous allocations. 
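A worked example of the resulting arithmetic (numbers picked purely for illustration): with 30 GiB of VRAM available and 4 VFs, the plain division gives 7.5 GiB per VF; rounding down to a power of two yields 4 GiB, which lines up with the power-of-two blocks the buddy allocator hands out and is far less likely to fail once VRAM is fragmented than four 7.5 GiB contiguous requests. The sequence the patched helper ends up with is simply:

	fair = div_u64(available, num_vfs);	/* e.g. 30 GiB / 4  -> 7.5 GiB          */
	fair = rounddown_pow_of_two(fair);	/* -> 4 GiB, compensates drm_buddy/ttm   */
	fair = ALIGN_DOWN(fair, alignment);	/* keep the existing alignment step      */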
Fixes: ac6598aed1b3 ("drm/xe/pf: Add support to configure SR-IOV VFs") Signed-off-by: Michal Wajdeczko Reviewed-by: Piotr Piórkowski Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20240711192320.1198-2-michal.wajdeczko@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 4c3fe5eae46b92e2fd961b19f7779608352e5368) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index db6c213da847..4699b7836001 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -1543,6 +1543,7 @@ static u64 pf_estimate_fair_lmem(struct xe_gt *gt, unsigned int num_vfs) u64 fair; fair = div_u64(available, num_vfs); + fair = rounddown_pow_of_two(fair); /* XXX: ttm_vram_mgr & drm_buddy limitation */ fair = ALIGN_DOWN(fair, alignment); #ifdef MAX_FAIR_LMEM fair = min_t(u64, MAX_FAIR_LMEM, fair); -- cgit From c9474b726b932b5d555effd9ed0ae19f4da2367c Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 15 Jul 2024 23:39:01 -0700 Subject: drm/xe: Wedge the entire device Wedge the entire device, not just GT which may have triggered the wedge. To implement this, cleanup the layering so xe_device_declare_wedged() calls into the lower layers (GT) to ensure entire device is wedged. While we are here, also signal any pending GT TLB invalidations upon wedging device. Lastly, short circuit reset wait if device is wedged. v2: - Short circuit reset wait if device is wedged (Local testing) Fixes: 8ed9aaae39f3 ("drm/xe: Force wedged state and block GT reset upon any GPU hang") Cc: Rodrigo Vivi Signed-off-by: Matthew Brost Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20240716063902.1390130-1-matthew.brost@intel.com (cherry picked from commit 7dbe8af13c189f5937e87e9fb924d5bbc49e6f71) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.c | 6 ++++++ drivers/gpu/drm/xe/xe_gt.c | 15 +++++++++++++++ drivers/gpu/drm/xe/xe_gt.h | 1 + drivers/gpu/drm/xe/xe_guc.c | 16 ++++++++++++++++ drivers/gpu/drm/xe/xe_guc.h | 1 + drivers/gpu/drm/xe/xe_guc_submit.c | 38 +++++++++++++++++++++++++------------- drivers/gpu/drm/xe/xe_guc_submit.h | 1 + drivers/gpu/drm/xe/xe_uc.c | 14 ++++++++++++++ drivers/gpu/drm/xe/xe_uc.h | 1 + 9 files changed, 80 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 03492fbcb8fb..41548eff953d 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -870,6 +870,9 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address) */ void xe_device_declare_wedged(struct xe_device *xe) { + struct xe_gt *gt; + u8 id; + if (xe->wedged.mode == 0) { drm_dbg(&xe->drm, "Wedged mode is forcibly disabled\n"); return; @@ -883,4 +886,7 @@ void xe_device_declare_wedged(struct xe_device *xe) "Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n", dev_name(xe->drm.dev)); } + + for_each_gt(gt, xe, id) + xe_gt_declare_wedged(gt); } diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 0ba2e2d0289b..31b2e64c70c6 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -904,3 +904,18 @@ struct xe_hw_engine *xe_gt_any_hw_engine(struct xe_gt *gt) return NULL; } + +/** + * xe_gt_declare_wedged() - Declare GT wedged + * @gt: the GT object + * + * Wedge the GT which stops all 
submission, saves desired debug state, and + * cleans up anything which could timeout. + */ +void xe_gt_declare_wedged(struct xe_gt *gt) +{ + xe_gt_assert(gt, gt_to_xe(gt)->wedged.mode); + + xe_uc_declare_wedged(>->uc); + xe_gt_tlb_invalidation_reset(gt); +} diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h index 1123fdfc4ebc..8b1a5027dcf2 100644 --- a/drivers/gpu/drm/xe/xe_gt.h +++ b/drivers/gpu/drm/xe/xe_gt.h @@ -37,6 +37,7 @@ struct xe_gt *xe_gt_alloc(struct xe_tile *tile); int xe_gt_init_hwconfig(struct xe_gt *gt); int xe_gt_init_early(struct xe_gt *gt); int xe_gt_init(struct xe_gt *gt); +void xe_gt_declare_wedged(struct xe_gt *gt); int xe_gt_record_default_lrcs(struct xe_gt *gt); /** diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c index eb655cee19f7..de0fe9e65746 100644 --- a/drivers/gpu/drm/xe/xe_guc.c +++ b/drivers/gpu/drm/xe/xe_guc.c @@ -1178,3 +1178,19 @@ void xe_guc_print_info(struct xe_guc *guc, struct drm_printer *p) xe_guc_ct_print(&guc->ct, p, false); xe_guc_submit_print(guc, p); } + +/** + * xe_guc_declare_wedged() - Declare GuC wedged + * @guc: the GuC object + * + * Wedge the GuC which stops all submission, saves desired debug state, and + * cleans up anything which could timeout. + */ +void xe_guc_declare_wedged(struct xe_guc *guc) +{ + xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode); + + xe_guc_reset_prepare(guc); + xe_guc_ct_stop(&guc->ct); + xe_guc_submit_wedge(guc); +} diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h index af59c9545753..e0bbf98f849d 100644 --- a/drivers/gpu/drm/xe/xe_guc.h +++ b/drivers/gpu/drm/xe/xe_guc.h @@ -37,6 +37,7 @@ void xe_guc_reset_wait(struct xe_guc *guc); void xe_guc_stop_prepare(struct xe_guc *guc); void xe_guc_stop(struct xe_guc *guc); int xe_guc_start(struct xe_guc *guc); +void xe_guc_declare_wedged(struct xe_guc *guc); static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class) { diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 373447758a60..8d7e7f4bbff7 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -861,29 +861,27 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q) xe_sched_tdr_queue_imm(&q->guc->sched); } -static bool guc_submit_hint_wedged(struct xe_guc *guc) +/** + * xe_guc_submit_wedge() - Wedge GuC submission + * @guc: the GuC object + * + * Save exec queue's registered with GuC state by taking a ref to each queue. + * Register a DRMM handler to drop refs upon driver unload. + */ +void xe_guc_submit_wedge(struct xe_guc *guc) { struct xe_device *xe = guc_to_xe(guc); struct xe_exec_queue *q; unsigned long index; int err; - if (xe->wedged.mode != 2) - return false; - - if (xe_device_wedged(xe)) - return true; - - xe_device_declare_wedged(xe); - - xe_guc_submit_reset_prepare(guc); - xe_guc_ct_stop(&guc->ct); + xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode); err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm, guc_submit_wedged_fini, guc); if (err) { drm_err(&xe->drm, "Failed to register xe_guc_submit clean-up on wedged.mode=2. 
Although device is wedged.\n"); - return true; /* Device is wedged anyway */ + return; } mutex_lock(&guc->submission_state.lock); @@ -891,6 +889,19 @@ static bool guc_submit_hint_wedged(struct xe_guc *guc) if (xe_exec_queue_get_unless_zero(q)) set_exec_queue_wedged(q); mutex_unlock(&guc->submission_state.lock); +} + +static bool guc_submit_hint_wedged(struct xe_guc *guc) +{ + struct xe_device *xe = guc_to_xe(guc); + + if (xe->wedged.mode != 2) + return false; + + if (xe_device_wedged(xe)) + return true; + + xe_device_declare_wedged(xe); return true; } @@ -1677,7 +1688,8 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc) void xe_guc_submit_reset_wait(struct xe_guc *guc) { - wait_event(guc->ct.wq, !guc_read_stopped(guc)); + wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) || + !guc_read_stopped(guc)); } void xe_guc_submit_stop(struct xe_guc *guc) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h index 4ad5f4c1b084..bdf8c9f3d24a 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.h +++ b/drivers/gpu/drm/xe/xe_guc_submit.h @@ -18,6 +18,7 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc); void xe_guc_submit_reset_wait(struct xe_guc *guc); void xe_guc_submit_stop(struct xe_guc *guc); int xe_guc_submit_start(struct xe_guc *guc); +void xe_guc_submit_wedge(struct xe_guc *guc); int xe_guc_sched_done_handler(struct xe_guc *guc, u32 *msg, u32 len); int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len); diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c index 0f240534fb72..0d073a9987c2 100644 --- a/drivers/gpu/drm/xe/xe_uc.c +++ b/drivers/gpu/drm/xe/xe_uc.c @@ -300,3 +300,17 @@ void xe_uc_remove(struct xe_uc *uc) { xe_gsc_remove(&uc->gsc); } + +/** + * xe_uc_declare_wedged() - Declare UC wedged + * @uc: the UC object + * + * Wedge the UC which stops all submission, saves desired debug state, and + * cleans up anything which could timeout. + */ +void xe_uc_declare_wedged(struct xe_uc *uc) +{ + xe_gt_assert(uc_to_gt(uc), uc_to_xe(uc)->wedged.mode); + + xe_guc_declare_wedged(&uc->guc); +} diff --git a/drivers/gpu/drm/xe/xe_uc.h b/drivers/gpu/drm/xe/xe_uc.h index 11856f24e6f9..506517c11333 100644 --- a/drivers/gpu/drm/xe/xe_uc.h +++ b/drivers/gpu/drm/xe/xe_uc.h @@ -21,5 +21,6 @@ int xe_uc_start(struct xe_uc *uc); int xe_uc_suspend(struct xe_uc *uc); int xe_uc_sanitize_reset(struct xe_uc *uc); void xe_uc_remove(struct xe_uc *uc); +void xe_uc_declare_wedged(struct xe_uc *uc); #endif -- cgit From 90936a0a4c54f0a1cdf4538f9128821ad70c36ab Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 15 Jul 2024 23:39:02 -0700 Subject: drm/xe: Don't suspend device upon wedge When wedging a device we shouldn't be suspending device as state for debug will be lost. Also this appears to not work as the below stack trace pops upon trying to resume a wedged device: [ 304.245044] INFO: task cat:12115 blocked for more than 151 seconds. [ 304.251333] Tainted: G W 6.10.0-rc7-xe+ #3518 [ 304.257617] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 304.265459] task:cat state:D stack:13384 pid:12115 tgid:12115 ppid:3986 flags:0x00000006 [ 304.265465] Call Trace: [ 304.265467] [ 304.265469] __schedule+0x3c4/0xdf0 [ 304.265478] schedule+0x3c/0x140 [ 304.265481] rpm_resume+0x1cc/0x740 [ 304.265484] ? __pfx_autoremove_wake_function+0x10/0x10 [ 304.265489] __pm_runtime_resume+0x49/0x80 [ 304.265494] guc_info+0x6b/0xb0 [xe] [ 304.265538] ? __pfx___drm_printfn_seq_file+0x10/0x10 [ 304.265541] ? 
__pfx___drm_puts_seq_file+0x10/0x10 [ 304.265545] seq_read_iter+0x111/0x4c0 [ 304.265551] seq_read+0xfc/0x140 [ 304.265556] full_proxy_read+0x58/0x80 [ 304.265560] vfs_read+0xa7/0x360 [ 304.265563] ? find_held_lock+0x2b/0x80 [ 304.265568] ksys_read+0x64/0xe0 [ 304.265571] do_syscall_64+0x68/0x140 [ 304.265575] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 304.265578] RIP: 0033:0x7f4254d14992 [ 304.265580] RSP: 002b:00007ffc558666f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [ 304.265583] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f4254d14992 [ 304.265584] RDX: 0000000000020000 RSI: 00007f4254ebb000 RDI: 0000000000000003 [ 304.265586] RBP: 00007f4254ebb000 R08: 00007f4254eba010 R09: 00007f4254eba010 [ 304.265587] R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000022000 [ 304.265588] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [ 304.265593] [ 304.265594] Showing all locks held in the system: [ 304.265598] 1 lock held by khungtaskd/57: [ 304.265599] #0: ffffffff8273b860 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x36/0x1c0 [ 304.265607] 3 locks held by kworker/6:1/90: [ 304.265610] 1 lock held by in:imklog/547: [ 304.265611] #0: ffff88810498cd88 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x76/0xc0 [ 304.265620] 1 lock held by dmesg/1310: v2: Drop local 'err' variable (Jonathan) Fixes: 8ed9aaae39f3 ("drm/xe: Force wedged state and block GT reset upon any GPU hang") Cc: Rodrigo Vivi Signed-off-by: Matthew Brost Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20240716063902.1390130-2-matthew.brost@intel.com (cherry picked from commit 452bca0edbd0764ca0284239d5438b3edd305ab3) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 41548eff953d..76109415eba6 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -854,6 +854,13 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address) return address & GENMASK_ULL(xe->info.va_bits - 1, 0); } +static void xe_device_wedged_fini(struct drm_device *drm, void *arg) +{ + struct xe_device *xe = arg; + + xe_pm_runtime_put(xe); +} + /** * xe_device_declare_wedged - Declare device wedged * @xe: xe device instance @@ -878,6 +885,13 @@ void xe_device_declare_wedged(struct xe_device *xe) return; } + if (drmm_add_action_or_reset(&xe->drm, xe_device_wedged_fini, xe)) { + drm_err(&xe->drm, "Failed to register xe_device_wedged_fini clean-up. Although device is wedged.\n"); + return; + } + + xe_pm_runtime_get_noresume(xe); + if (!atomic_xchg(&xe->wedged.flag, 1)) { xe->needs_flr_on_fini = true; drm_err(&xe->drm, -- cgit From a83b22754e351f13fb46596c85f667dc33da71ec Mon Sep 17 00:00:00 2001 From: Bastien Curutchet Date: Thu, 18 Jul 2024 13:55:34 +0200 Subject: clk: davinci: da8xx-cfgchip: Initialize clk_init_data before use The flag attribute of the struct clk_init_data isn't initialized before the devm_clk_hw_register() call. This can lead to unexpected behavior during registration. Initialize the entire clk_init_data to zero at declaration. 
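The C-level detail behind the fix: an automatic (on-stack) struct with no initializer has indeterminate contents, so any member the function never assigns — here .flags — is stack garbage by the time the struct reaches devm_clk_hw_register(). The empty initializer zero-initializes every member at declaration, which is the whole patch:

	struct clk_init_data init = {};		/* all members, including .flags, start at 0 */

as opposed to the previous

	struct clk_init_data init;		/* .flags left indeterminate */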
Cc: stable@vger.kernel.org Fixes: 58e1e2d2cd89 ("clk: davinci: cfgchip: Add TI DA8XX USB PHY clocks") Signed-off-by: Bastien Curutchet Reviewed-by: David Lechner Link: https://lore.kernel.org/r/20240718115534.41513-1-bastien.curutchet@bootlin.com Signed-off-by: Stephen Boyd --- drivers/clk/davinci/da8xx-cfgchip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clk/davinci/da8xx-cfgchip.c b/drivers/clk/davinci/da8xx-cfgchip.c index ad2d0df43dc6..ec60ecb517f1 100644 --- a/drivers/clk/davinci/da8xx-cfgchip.c +++ b/drivers/clk/davinci/da8xx-cfgchip.c @@ -508,7 +508,7 @@ da8xx_cfgchip_register_usb0_clk48(struct device *dev, const char * const parent_names[] = { "usb_refclkin", "pll0_auxclk" }; struct clk *fck_clk; struct da8xx_usb0_clk48 *usb0; - struct clk_init_data init; + struct clk_init_data init = {}; int ret; fck_clk = devm_clk_get(dev, "fck"); @@ -583,7 +583,7 @@ da8xx_cfgchip_register_usb1_clk48(struct device *dev, { const char * const parent_names[] = { "usb0_clk48", "usb_refclkin" }; struct da8xx_usb1_clk48 *usb1; - struct clk_init_data init; + struct clk_init_data init = {}; int ret; usb1 = devm_kzalloc(dev, sizeof(*usb1), GFP_KERNEL); -- cgit From 5a6a25ea5bcd5bdf80fb13acd65a03fc6b8794b1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 17 Jul 2024 21:25:53 -0500 Subject: clk: sophgo: clk-sg2042-pll: Fix uninitialized variable in debug output If sg2042_get_pll_ctl_setting() fails then "value" isn't initialized and it is printed in the debug output. Initialize it to zero. Fixes: 48cf7e01386e ("clk: sophgo: Add SG2042 clock driver") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/baf0a490-d5ba-4528-90ba-80399684692d@stanley.mountain Reviewed-by: Chen Wang Signed-off-by: Stephen Boyd --- drivers/clk/sophgo/clk-sg2042-pll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/sophgo/clk-sg2042-pll.c b/drivers/clk/sophgo/clk-sg2042-pll.c index 9695e64fc23b..ff9deeef509b 100644 --- a/drivers/clk/sophgo/clk-sg2042-pll.c +++ b/drivers/clk/sophgo/clk-sg2042-pll.c @@ -387,7 +387,7 @@ static int sg2042_clk_pll_set_rate(struct clk_hw *hw, struct sg2042_pll_clock *pll = to_sg2042_pll_clk(hw); struct sg2042_pll_ctrl pctrl_table; unsigned long flags; - u32 value; + u32 value = 0; int ret; spin_lock_irqsave(pll->lock, flags); -- cgit From 74dba240881820b46b9b1c62ef4de3bfff47fbd4 Mon Sep 17 00:00:00 2001 From: wangdicheng Date: Fri, 19 Jul 2024 10:09:06 +0800 Subject: ALSA: usb-audio: Fix microphone sound on HD webcam. I own an external usb Webcam, HD webcam, which had low mic volume and inconsistent sound quality. Video works as expected. (snip) [ 95.473820][ 1] [ T73] usb 5-2.2: new high-speed USB device number 7 using xhci_hcd [ 95.773974][ 1] [ T73] usb 5-2.2: New USB device found, idVendor=1bcf, idProduct=2281, bcdDevice= 0.05 [ 95.783445][ 1] [ T73] usb 5-2.2: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [ 95.791872][ 1] [ T73] usb 5-2.2: Product: HD webcam [ 95.797001][ 1] [ T73] usb 5-2.2: Manufacturer: Sunplus IT Co [ 95.802996][ 1] [ T73] usb 5-2.2: SerialNumber: 20200513 [ 96.092610][ 2] [ T3680] usb 5-2.2: Warning! Unlikely big volume range (=4096), cval->res is probably wrong. [ 96.102436][ 2] [ T3680] usb 5-2.2: [5] FU [Mic Capture Volume] ch = 1, val = 0/4096/1 Set up quirk cval->res to 16 for 256 levels, Set GET_SAMPLE_RATE quirk flag to stop trying to get the sample rate. Confirmed that happened anyway later due to the backoff mechanism, After 3 failures. 
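For clarity, the arithmetic behind the chosen resolution, using only the numbers reported above:

	4096 (reported raw range) / 16 (forced cval->res) = 256 volume levels

so the oversized range collapses to the 256 usable steps mentioned above instead of 4096 nominal one-unit steps.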
All audio stream on device interfaces share the same values, apart from wMaxPacketSize and tSamFreq : bLength 9 bDescriptorType 4 bInterfaceNumber 3 bAlternateSetting 4 bNumEndpoints 1 bInterfaceClass 1 Audio Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 3 bAlternateSetting 4 bNumEndpoints 1 bInterfaceClass 1 Audio bInterfaceSubClass 2 Streaming bInterfaceProtocol 0 iInterface 0 AudioStreaming Interface Descriptor: bLength 7 bDescriptorType 36 bDescriptorSubtype 1 (AS_GENERAL) bTerminalLink 3 bDelay 1 frames wFormatTag 0x0001 PCM AudioStreaming Interface Descriptor: bLength 11 bDescriptorType 36 bDescriptorSubtype 2 (FORMAT_TYPE) bFormatType 1 (FORMAT_TYPE_I) bNrChannels 1 bSubframeSize 2 bBitResolution 16 bSamFreqType 1 Discrete tSamFreq[ 0] 48000 Endpoint Descriptor: bLength 9 bDescriptorType 5 bEndpointAddress 0x86 EP 6 IN bmAttributes 5 Transfer Type Isochronous Synch Type Asynchronous Usage Type Data wMaxPacketSize 0x0064 1x 100 bytes bInterval 4 bRefresh 0 bSynchAddress 0 AudioStreaming Endpoint Descriptor: bLength 7 bDescriptorType 37 bDescriptorSubtype 1 (EP_GENERAL) bmAttributes 0x01 Sampling Frequency bLockDelayUnits 0 Undefined wLockDelay 0x0000 (snip) Testing patch provides consistent good sound recording quality and volume range. (snip) [ 95.473820][ 1] [ T73] usb 5-2.2: new high-speed USB device number 7 using xhci_hcd [ 95.773974][ 1] [ T73] usb 5-2.2: New USB device found, idVendor=1bcf, idProduct=2281, bcdDevice= 0.05 [ 95.783445][ 1] [ T73] usb 5-2.2: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [ 95.791872][ 1] [ T73] usb 5-2.2: Product: HD webcam [ 95.797001][ 1] [ T73] usb 5-2.2: Manufacturer: Sunplus IT Co [ 95.802996][ 1] [ T73] usb 5-2.2: SerialNumber: 20200513 [ 96.110630][ 3] [ T3680] usbcore: registered new interface driver snd-usb-audio [ 96.114329][ 7] [ T3677] usb 5-2.2: Found UVC 1.00 device HD webcam (1bcf:2281) [ 96.167555][ 7] [ T3677] usbcore: registered new interface driver uvcvideo Signed-off-by: wangdicheng Cc: Link: https://patch.msgid.link/20240719020906.8078-1-wangdich9700@163.com Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 7 +++++++ sound/usb/quirks.c | 2 ++ 2 files changed, 9 insertions(+) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index c00009b545c0..f7ce8e8c3c3e 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -1211,6 +1211,13 @@ static void volume_control_quirks(struct usb_mixer_elem_info *cval, cval->res = 16; } break; + case USB_ID(0x1bcf, 0x2281): /* HD Webcam */ + if (!strcmp(kctl->id.name, "Mic Capture Volume")) { + usb_audio_info(chip, + "set resolution quirk: cval->res = 16\n"); + cval->res = 16; + } + break; } } diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 58156fbca02c..19c1fc9504c3 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2225,6 +2225,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x534d, 0x2109, /* MacroSilicon MS2109 */ QUIRK_FLAG_ALIGN_TRANSFER), + DEVICE_FLG(0x1bcf, 0x2281, /* HD Webcam */ + QUIRK_FLAG_GET_SAMPLE_RATE), /* Vendor matches */ VENDOR_FLG(0x045e, /* MS Lifecam */ -- cgit From 3ae08e47742eeebf2190900d31ddac53fdd13a5b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 19 Jul 2024 16:44:10 +0200 Subject: gpio: virtuser: avoid non-constant format string Using a string variable as an sprintf format is potentially dangerous, and gcc can warn about this: drivers/gpio/gpio-virtuser.c: In function 'gpio_virtuser_dbgfs_init_line_attrs': 
drivers/gpio/gpio-virtuser.c:808:9: error: format not a string literal and no format arguments [-Werror=format-security] 808 | sprintf(data->consumer, id); | ^~~~~~~ Change it to a simpler strscpy() instead to just copy it and check the destination buffer size. Fixes: 91581c4b3f29 ("gpio: virtuser: new virtual testing driver for the GPIO API") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240719144422.2082394-1-arnd@kernel.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-virtuser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-virtuser.c b/drivers/gpio/gpio-virtuser.c index 0e0d55da4f01..ccc47ea0b3e1 100644 --- a/drivers/gpio/gpio-virtuser.c +++ b/drivers/gpio/gpio-virtuser.c @@ -805,7 +805,7 @@ static int gpio_virtuser_dbgfs_init_line_attrs(struct device *dev, return -ENOMEM; data->ad.desc = desc; - sprintf(data->consumer, id); + strscpy(data->consumer, id); atomic_set(&data->irq, 0); atomic_set(&data->irq_count, 0); -- cgit From a2d6d8aee4a4f694920bfaf6f6be3690e0f8d302 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 19 Jul 2024 11:56:34 +0200 Subject: ALSA: hda: tas2781: mark const variables as __maybe_unused An earlier patch changed the DECLARE_TLV_DB_SCALE declaration, but now there are additional static const variables that cause the same build warnings: In file included from sound/pci/hda/tas2781_hda_i2c.c:23: include/sound/tas2781-tlv.h:23:28: error: 'tas2563_dvc_table' defined but not used [-Werror=unused-const-variable=] 23 | static const unsigned char tas2563_dvc_table[][4] = { | ^~~~~~~~~~~~~~~~~ In file included from include/sound/tlv.h:10, from sound/pci/hda/tas2781_hda_i2c.c:22: include/sound/tas2781-tlv.h:20:35: error: 'tas2563_dvc_tlv' defined but not used [-Werror=unused-const-variable=] 20 | static const DECLARE_TLV_DB_SCALE(tas2563_dvc_tlv, -12150, 50, 1); | ^~~~~~~~~~~~~~~ Mark them all as unused as well. Signed-off-by: Arnd Bergmann Link: https://patch.msgid.link/20240719095640.3741247-1-arnd@kernel.org Signed-off-by: Takashi Iwai --- include/sound/tas2781-tlv.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/sound/tas2781-tlv.h b/include/sound/tas2781-tlv.h index 99c41bfc7827..00fd4d449ff3 100644 --- a/include/sound/tas2781-tlv.h +++ b/include/sound/tas2781-tlv.h @@ -16,11 +16,11 @@ #define __TAS2781_TLV_H__ static const __maybe_unused DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 100, 0); -static const DECLARE_TLV_DB_SCALE(amp_vol_tlv, 1100, 50, 0); -static const DECLARE_TLV_DB_SCALE(tas2563_dvc_tlv, -12150, 50, 1); +static const __maybe_unused DECLARE_TLV_DB_SCALE(amp_vol_tlv, 1100, 50, 0); +static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2563_dvc_tlv, -12150, 50, 1); /* pow(10, db/20) * pow(2,30) */ -static const unsigned char tas2563_dvc_table[][4] = { +static const __maybe_unused unsigned char tas2563_dvc_table[][4] = { { 0X00, 0X00, 0X00, 0X00 }, /* -121.5db */ { 0X00, 0X00, 0X03, 0XBC }, /* -121.0db */ { 0X00, 0X00, 0X03, 0XF5 }, /* -120.5db */ -- cgit From 7010d9464f7ca3ee2d75095ea2e642a9009a41ff Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 22 Jul 2024 10:06:04 +0200 Subject: ALSA: usb-audio: Move HD Webcam quirk to the right place The quirk_flags_table[] is sorted in the USB ID order, while the last fix was put at a wrong position. Adjust the entry at the right position. 
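Condensed from the hunk below, the resulting neighborhood of the table, kept in ascending USB_ID(vendor, product) order:

	DEVICE_FLG(0x19f7, 0x0035, /* RODE NT-USB+ */
		   QUIRK_FLAG_GET_SAMPLE_RATE),
	DEVICE_FLG(0x1bcf, 0x2281, /* HD Webcam */
		   QUIRK_FLAG_GET_SAMPLE_RATE),
	DEVICE_FLG(0x1bcf, 0x2283, /* NexiGo N930AF FHD Webcam */
		   QUIRK_FLAG_GET_SAMPLE_RATE),

0x1bcf:0x2281 sorts after 0x19f7:0x0035 and before 0x1bcf:0x2283, which is where the entry is moved.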
Fixes: 74dba2408818 ("ALSA: usb-audio: Fix microphone sound on HD webcam.") Cc: Link: https://patch.msgid.link/20240722080605.23481-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 19c1fc9504c3..f070ecf75ce8 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2167,6 +2167,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x19f7, 0x0035, /* RODE NT-USB+ */ QUIRK_FLAG_GET_SAMPLE_RATE), + DEVICE_FLG(0x1bcf, 0x2281, /* HD Webcam */ + QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x1bcf, 0x2283, /* NexiGo N930AF FHD Webcam */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x2040, 0x7200, /* Hauppauge HVR-950Q */ @@ -2225,8 +2227,6 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x534d, 0x2109, /* MacroSilicon MS2109 */ QUIRK_FLAG_ALIGN_TRANSFER), - DEVICE_FLG(0x1bcf, 0x2281, /* HD Webcam */ - QUIRK_FLAG_GET_SAMPLE_RATE), /* Vendor matches */ VENDOR_FLG(0x045e, /* MS Lifecam */ -- cgit From 542440fd7b30983cae23e32bd22f69a076ec7ef4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 19 Jul 2024 12:40:24 +0200 Subject: regmap: maple: work around gcc-14.1 false-positive warning With gcc-14.1, there is a false-postive -Wuninitialized warning in regcache_maple_drop: drivers/base/regmap/regcache-maple.c: In function 'regcache_maple_drop': drivers/base/regmap/regcache-maple.c:113:23: error: 'lower_index' is used uninitialized [-Werror=uninitialized] 113 | unsigned long lower_index, lower_last; | ^~~~~~~~~~~ drivers/base/regmap/regcache-maple.c:113:36: error: 'lower_last' is used uninitialized [-Werror=uninitialized] 113 | unsigned long lower_index, lower_last; | ^~~~~~~~~~ I've created a reduced test case to see if this needs to be reported as a gcc, but it appears that the gcc-14.x branch already has a change that turns this into a more sensible -Wmaybe-uninitialized warning, so I ended up not reporting it so far. The reduced test case also produces a warning for gcc-13 and gcc-12 but I don't see that with the version in the kernel. 
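In isolation, the workaround is the usual one for this class of false positive: initialize at the declaration and leave a comment so the initializer is not removed by a later cleanup (the real change is in the hunk below):

	/* initialized to work around false-positive -Wuninitialized warning */
	unsigned long lower_index = 0, lower_last = 0;

Both values are still written by the lookup before any read that matters; the zero-init only exists to quiet gcc-14.1.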
Link: https://godbolt.org/z/oKbohKqd3 Link: https://lore.kernel.org/all/CAMuHMdWj=FLmkazPbYKPevDrcym2_HDb_U7Mb9YE9ovrP0jJfA@mail.gmail.com/ Signed-off-by: Arnd Bergmann Link: https://patch.msgid.link/20240719104030.1382465-1-arnd@kernel.org Signed-off-by: Mark Brown --- drivers/base/regmap/regcache-maple.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/regmap/regcache-maple.c b/drivers/base/regmap/regcache-maple.c index f0df2da6d522..2dea9d259c49 100644 --- a/drivers/base/regmap/regcache-maple.c +++ b/drivers/base/regmap/regcache-maple.c @@ -110,7 +110,8 @@ static int regcache_maple_drop(struct regmap *map, unsigned int min, struct maple_tree *mt = map->cache; MA_STATE(mas, mt, min, max); unsigned long *entry, *lower, *upper; - unsigned long lower_index, lower_last; + /* initialized to work around false-positive -Wuninitialized warning */ + unsigned long lower_index = 0, lower_last = 0; unsigned long upper_index, upper_last; int ret = 0; -- cgit From 9931f7d5d251882a147cc5811060097df43e79f5 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 22 Jul 2024 10:30:02 +0200 Subject: ASoC: Intel: use soc_intel_is_byt_cr() only when IOSF_MBI is reachable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the Intel kbuild bot reports a link failure when IOSF_MBI is built-in but the Merrifield driver is configured as a module. The soc-intel-quirks.h is included for Merrifield platforms, but IOSF_MBI is not selected for that platform. ld.lld: error: undefined symbol: iosf_mbi_read >>> referenced by atom.c >>> sound/soc/sof/intel/atom.o:(atom_machine_select) in archive vmlinux.a This patch forces the use of the fallback static inline when IOSF_MBI is not reachable. Fixes: 536cfd2f375d ("ASoC: Intel: use common helpers to detect CPUs") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407160704.zpdhJ8da-lkp@intel.com/ Suggested-by: Takashi Iwai Signed-off-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Reviewed-by: Bard Liao Link: https://patch.msgid.link/20240722083002.10800-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/common/soc-intel-quirks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/intel/common/soc-intel-quirks.h b/sound/soc/intel/common/soc-intel-quirks.h index de4e550c5b34..42bd51456b94 100644 --- a/sound/soc/intel/common/soc-intel-quirks.h +++ b/sound/soc/intel/common/soc-intel-quirks.h @@ -11,7 +11,7 @@ #include -#if IS_ENABLED(CONFIG_X86) +#if IS_REACHABLE(CONFIG_IOSF_MBI) #include #include -- cgit From 83340b855d222f257354afd272dc8d315fecc3ee Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 19 Jul 2024 09:48:03 +0200 Subject: ASoC: tegra: select CONFIG_SND_SIMPLE_CARD_UTILS This I2S client driver now uses functions exported from a helper module but fails to link when the helper is disabled: ERROR: modpost: "simple_util_parse_convert" [sound/soc/tegra/snd-soc-tegra210-i2s.ko] undefined! ERROR: modpost: "simple_util_get_sample_fmt" [sound/soc/tegra/snd-soc-tegra210-i2s.ko] undefined! Add a Kconfig select line to ensure it's always turned on here. 
Fixes: 2502f8dd8c30 ("ASoC: tegra: I2S client convert formats handling") Signed-off-by: Arnd Bergmann Reviewed-by: Sameer Pujar Link: https://patch.msgid.link/20240719074831.3253995-1-arnd@kernel.org Signed-off-by: Mark Brown --- sound/soc/tegra/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/tegra/Kconfig b/sound/soc/tegra/Kconfig index 74effc57a7a0..2463c22e9cf6 100644 --- a/sound/soc/tegra/Kconfig +++ b/sound/soc/tegra/Kconfig @@ -78,6 +78,7 @@ config SND_SOC_TEGRA210_DMIC config SND_SOC_TEGRA210_I2S tristate "Tegra210 I2S module" + select SND_SIMPLE_CARD_UTILS help Config to enable the Inter-IC Sound (I2S) Controller which implements full-duplex and bidirectional and single direction -- cgit From 92c78222168e9035a9bfb8841c2e56ce23e51f73 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 19 Jul 2024 18:53:48 -0500 Subject: ASoC: TAS2781: Fix tasdev_load_calibrated_data() This function has a reversed if statement so it's either a no-op or it leads to a NULL dereference. Fixes: b195acf5266d ("ASoC: tas2781: Fix wrong loading calibrated data sequence") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/18a29b68-cc85-4139-b7c7-2514e8409a42@stanley.mountain Signed-off-by: Mark Brown --- sound/soc/codecs/tas2781-fmwlib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/tas2781-fmwlib.c b/sound/soc/codecs/tas2781-fmwlib.c index 63626b982d04..8f9a3ae7153e 100644 --- a/sound/soc/codecs/tas2781-fmwlib.c +++ b/sound/soc/codecs/tas2781-fmwlib.c @@ -2162,7 +2162,7 @@ static void tasdev_load_calibrated_data(struct tasdevice_priv *priv, int i) return; cal = cal_fmw->calibrations; - if (cal) + if (!cal) return; load_calib_data(priv, &cal->dev_data); -- cgit From 21451dfd853e7d8e6e3fbd7ef1fbdb2f2ead12f5 Mon Sep 17 00:00:00 2001 From: wangdicheng Date: Mon, 22 Jul 2024 16:48:22 +0800 Subject: ALSA: usb-audio: Add a quirk for Sonix HD USB Camera Sonix HD USB Camera does not support reading the sample rate which leads to many lines of "cannot get freq at ep 0x84". This patch adds the USB ID to quirks.c and avoids those error messages. (snip) [1.789698] usb 3-3: new high-speed USB device number 2 using xhci_hcd [1.984121] usb 3-3: New USB device found, idVendor=0c45, idProduct=6340, bcdDevice= 0.00 [1.984124] usb 3-3: New USB device strings: Mfr=2, Product=1, SerialNumber=0 [1.984127] usb 3-3: Product: USB 2.0 Camera [1.984128] usb 3-3: Manufacturer: Sonix Technology Co., Ltd. 
[5.440957] usb 3-3: 3:1: cannot get freq at ep 0x84 [12.130679] usb 3-3: 3:1: cannot get freq at ep 0x84 [12.175065] usb 3-3: 3:1: cannot get freq at ep 0x84 Signed-off-by: wangdicheng Cc: Link: https://patch.msgid.link/20240722084822.31620-1-wangdich9700@163.com Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index f070ecf75ce8..ea063a14cdd8 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2125,6 +2125,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x0b0e, 0x0349, /* Jabra 550a */ QUIRK_FLAG_CTL_MSG_DELAY_1M), + DEVICE_FLG(0x0c45, 0x6340, /* Sonix HD USB Camera */ + QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x0ecb, 0x205c, /* JBL Quantum610 Wireless */ QUIRK_FLAG_FIXED_RATE), DEVICE_FLG(0x0ecb, 0x2069, /* JBL Quantum810 Wireless */ -- cgit From eabd9db64ea8ba64d2a0b1d70da38e1a95dcd08b Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Thu, 13 Jun 2024 16:54:33 +0800 Subject: ACPI: RISCV: Add NUMA support based on SRAT and SLIT Add acpi_numa.c file to enable parse NUMA information from ACPI SRAT and SLIT tables. SRAT table provide CPUs(Hart) and memory nodes to proximity domain mapping, while SLIT table provide the distance metrics between proximity domains. Signed-off-by: Haibo Xu Reviewed-by: Sunil V L Reviewed-by: Hanjun Guo Link: https://lore.kernel.org/r/65dbad1fda08a32922c44886e4581e49b4a2fecc.1718268003.git.haibo1.xu@intel.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/acpi.h | 15 ++++- arch/riscv/kernel/Makefile | 1 + arch/riscv/kernel/acpi.c | 5 -- arch/riscv/kernel/acpi_numa.c | 131 ++++++++++++++++++++++++++++++++++++++++++ arch/riscv/kernel/setup.c | 4 +- arch/riscv/kernel/smpboot.c | 2 - include/linux/acpi.h | 6 ++ 7 files changed, 154 insertions(+), 10 deletions(-) create mode 100644 arch/riscv/kernel/acpi_numa.c diff --git a/arch/riscv/include/asm/acpi.h b/arch/riscv/include/asm/acpi.h index 7dad0cf9d701..e0a1f84404f3 100644 --- a/arch/riscv/include/asm/acpi.h +++ b/arch/riscv/include/asm/acpi.h @@ -61,11 +61,14 @@ static inline void arch_fix_phys_package_id(int num, u32 slot) { } void acpi_init_rintc_map(void); struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu); -u32 get_acpi_id_for_cpu(int cpu); +static inline u32 get_acpi_id_for_cpu(int cpu) +{ + return acpi_cpu_get_madt_rintc(cpu)->uid; +} + int acpi_get_riscv_isa(struct acpi_table_header *table, unsigned int cpu, const char **isa); -static inline int acpi_numa_get_nid(unsigned int cpu) { return NUMA_NO_NODE; } void acpi_get_cbo_block_size(struct acpi_table_header *table, u32 *cbom_size, u32 *cboz_size, u32 *cbop_size); #else @@ -87,4 +90,12 @@ static inline void acpi_get_cbo_block_size(struct acpi_table_header *table, #endif /* CONFIG_ACPI */ +#ifdef CONFIG_ACPI_NUMA +int acpi_numa_get_nid(unsigned int cpu); +void acpi_map_cpus_to_nodes(void); +#else +static inline int acpi_numa_get_nid(unsigned int cpu) { return NUMA_NO_NODE; } +static inline void acpi_map_cpus_to_nodes(void) { } +#endif /* CONFIG_ACPI_NUMA */ + #endif /*_ASM_ACPI_H*/ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 5b243d46f4b1..1e2afec141b5 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -110,3 +110,4 @@ obj-$(CONFIG_COMPAT) += compat_vdso/ obj-$(CONFIG_64BIT) += pi/ obj-$(CONFIG_ACPI) += acpi.o +obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c index 
e619edc8b0cc..040bdbfea2b4 100644 --- a/arch/riscv/kernel/acpi.c +++ b/arch/riscv/kernel/acpi.c @@ -191,11 +191,6 @@ struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu) return &cpu_madt_rintc[cpu]; } -u32 get_acpi_id_for_cpu(int cpu) -{ - return acpi_cpu_get_madt_rintc(cpu)->uid; -} - /* * __acpi_map_table() will be called before paging_init(), so early_ioremap() * or early_memremap() should be called here to for ACPI table mapping. diff --git a/arch/riscv/kernel/acpi_numa.c b/arch/riscv/kernel/acpi_numa.c new file mode 100644 index 000000000000..0231482d6946 --- /dev/null +++ b/arch/riscv/kernel/acpi_numa.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACPI 6.6 based NUMA setup for RISCV + * Lots of code was borrowed from arch/arm64/kernel/acpi_numa.c + * + * Copyright 2004 Andi Kleen, SuSE Labs. + * Copyright (C) 2013-2016, Linaro Ltd. + * Author: Hanjun Guo + * Copyright (C) 2024 Intel Corporation. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. + */ + +#define pr_fmt(fmt) "ACPI: NUMA: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int acpi_early_node_map[NR_CPUS] __initdata = { NUMA_NO_NODE }; + +int __init acpi_numa_get_nid(unsigned int cpu) +{ + return acpi_early_node_map[cpu]; +} + +static inline int get_cpu_for_acpi_id(u32 uid) +{ + int cpu; + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + if (uid == get_acpi_id_for_cpu(cpu)) + return cpu; + + return -EINVAL; +} + +static int __init acpi_parse_rintc_pxm(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_rintc_affinity *pa; + int cpu, pxm, node; + + if (srat_disabled()) + return -EINVAL; + + pa = (struct acpi_srat_rintc_affinity *)header; + if (!pa) + return -EINVAL; + + if (!(pa->flags & ACPI_SRAT_RINTC_ENABLED)) + return 0; + + pxm = pa->proximity_domain; + node = pxm_to_node(pxm); + + /* + * If we can't map the UID to a logical cpu this + * means that the UID is not part of possible cpus + * so we do not need a NUMA mapping for it, skip + * the SRAT entry and keep parsing. + */ + cpu = get_cpu_for_acpi_id(pa->acpi_processor_uid); + if (cpu < 0) + return 0; + + acpi_early_node_map[cpu] = node; + pr_info("SRAT: PXM %d -> HARTID 0x%lx -> Node %d\n", pxm, + cpuid_to_hartid_map(cpu), node); + + return 0; +} + +void __init acpi_map_cpus_to_nodes(void) +{ + int i; + + /* + * In ACPI, SMP and CPU NUMA information is provided in separate + * static tables, namely the MADT and the SRAT. + * + * Thus, it is simpler to first create the cpu logical map through + * an MADT walk and then map the logical cpus to their node ids + * as separate steps. 
+ */ + acpi_table_parse_entries(ACPI_SIG_SRAT, sizeof(struct acpi_table_srat), + ACPI_SRAT_TYPE_RINTC_AFFINITY, acpi_parse_rintc_pxm, 0); + + for (i = 0; i < nr_cpu_ids; i++) + early_map_cpu_to_node(i, acpi_numa_get_nid(i)); +} + +/* Callback for Proximity Domain -> logical node ID mapping */ +void __init acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa) +{ + int pxm, node; + + if (srat_disabled()) + return; + + if (pa->header.length < sizeof(struct acpi_srat_rintc_affinity)) { + pr_err("SRAT: Invalid SRAT header length: %d\n", pa->header.length); + bad_srat(); + return; + } + + if (!(pa->flags & ACPI_SRAT_RINTC_ENABLED)) + return; + + pxm = pa->proximity_domain; + node = acpi_map_pxm_to_node(pxm); + + if (node == NUMA_NO_NODE) { + pr_err("SRAT: Too many proximity domains %d\n", pxm); + bad_srat(); + return; + } + + node_set(node, numa_nodes_parsed); +} diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 4f73c0ae44b2..a2cde65b69e9 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -281,8 +281,10 @@ void __init setup_arch(char **cmdline_p) setup_smp(); #endif - if (!acpi_disabled) + if (!acpi_disabled) { acpi_init_rintc_map(); + acpi_map_cpus_to_nodes(); + } riscv_init_cbo_blocksizes(); riscv_fill_hwcap(); diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 1319b29ce3b5..d49c0bf0e2cd 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -96,7 +96,6 @@ static int __init acpi_parse_rintc(union acpi_subtable_headers *header, const un if (hart == cpuid_to_hartid_map(0)) { BUG_ON(found_boot_cpu); found_boot_cpu = true; - early_map_cpu_to_node(0, acpi_numa_get_nid(cpu_count)); return 0; } @@ -106,7 +105,6 @@ static int __init acpi_parse_rintc(union acpi_subtable_headers *header, const un } cpuid_to_hartid_map(cpu_count) = hart; - early_map_cpu_to_node(cpu_count, acpi_numa_get_nid(cpu_count)); cpu_count++; return 0; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 28c3fb2bef0d..5a2b620ccd58 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -264,6 +264,12 @@ static inline void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { } #endif +#ifdef CONFIG_RISCV +void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa); +#else +static inline void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa) { } +#endif + #ifndef PHYS_CPUID_INVALID typedef u32 phys_cpuid_t; #define PHYS_CPUID_INVALID (phys_cpuid_t)(-1) -- cgit From 39494aec8a0454f9e8a68d2422d3bd4e5c5ee0c7 Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Thu, 13 Jun 2024 16:54:34 +0800 Subject: ACPI: NUMA: Add handler for SRAT RINTC affinity structure Add RINTC affinity structure handler during parsing SRAT table. 
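Condensed from this patch (hunks below) and the RISC-V callback added just above, the resulting parse flow is, for orientation only:

	/*
	 * acpi_numa_init()
	 *   -> acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)
	 *   -> acpi_table_parse_entries_array(..., srat_proc[], ...)
	 *        -> acpi_parse_rintc_affinity()            per RINTC subtable
	 *             -> acpi_numa_rintc_affinity_init()   RISC-V callback
	 *                  -> acpi_map_pxm_to_node(pxm)    PXM -> NUMA node
	 *                  -> node_set(node, numa_nodes_parsed)
	 */

Entries without the ACPI_SRAT_RINTC_ENABLED flag are skipped by the callback, so they never contribute a node.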
Signed-off-by: Haibo Xu Reviewed-by: Sunil V L Reviewed-by: Hanjun Guo Link: https://lore.kernel.org/r/e076514d78d92f104a5f2d8c82b8921f6aa26fdd.1718268003.git.haibo1.xu@intel.com Signed-off-by: Palmer Dabbelt --- drivers/acpi/numa/srat.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index e3f26e71637a..44f91f2c6c5d 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -167,6 +167,19 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) } } break; + + case ACPI_SRAT_TYPE_RINTC_AFFINITY: + { + struct acpi_srat_rintc_affinity *p = + (struct acpi_srat_rintc_affinity *)header; + pr_debug("SRAT Processor (acpi id[0x%04x]) in proximity domain %d %s\n", + p->acpi_processor_uid, + p->proximity_domain, + (p->flags & ACPI_SRAT_RINTC_ENABLED) ? + "enabled" : "disabled"); + } + break; + default: pr_warn("Found unsupported SRAT entry (type = 0x%x)\n", header->type); @@ -450,6 +463,21 @@ acpi_parse_gi_affinity(union acpi_subtable_headers *header, } #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ +static int __init +acpi_parse_rintc_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_rintc_affinity *rintc_affinity; + + rintc_affinity = (struct acpi_srat_rintc_affinity *)header; + acpi_table_print_srat_entry(&header->common); + + /* let architecture-dependent part to do it */ + acpi_numa_rintc_affinity_init(rintc_affinity); + + return 0; +} + static int __init acpi_parse_srat(struct acpi_table_header *table) { struct acpi_table_srat *srat = (struct acpi_table_srat *)table; @@ -485,7 +513,7 @@ int __init acpi_numa_init(void) /* SRAT: System Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { - struct acpi_subtable_proc srat_proc[4]; + struct acpi_subtable_proc srat_proc[5]; memset(srat_proc, 0, sizeof(srat_proc)); srat_proc[0].id = ACPI_SRAT_TYPE_CPU_AFFINITY; @@ -496,6 +524,8 @@ int __init acpi_numa_init(void) srat_proc[2].handler = acpi_parse_gicc_affinity; srat_proc[3].id = ACPI_SRAT_TYPE_GENERIC_AFFINITY; srat_proc[3].handler = acpi_parse_gi_affinity; + srat_proc[4].id = ACPI_SRAT_TYPE_RINTC_AFFINITY; + srat_proc[4].handler = acpi_parse_rintc_affinity; acpi_table_parse_entries_array(ACPI_SIG_SRAT, sizeof(struct acpi_table_srat), -- cgit From adc3e82d253786a5b63c821108709de23317a90d Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Thu, 13 Jun 2024 16:54:35 +0800 Subject: ACPI: NUMA: change the ACPI_NUMA to a hidden option x86/arm64/loongarch would select ACPI_NUMA by default and riscv would do the same thing, so change it to a hidden option and the select statements except for the X86_64_ACPI_NUMA can also go away. 
Suggested-by: Arnd Bergmann Suggested-by: Sunil V L Signed-off-by: Haibo Xu Reviewed-by: Sunil V L Acked-by: Huacai Chen Acked-by: Will Deacon Reviewed-by: Hanjun Guo Link: https://lore.kernel.org/r/f1f96377b8ecd6e3183f28abf5c9ac21cb9855ea.1718268003.git.haibo1.xu@intel.com Signed-off-by: Palmer Dabbelt --- arch/arm64/Kconfig | 1 - arch/loongarch/Kconfig | 1 - drivers/acpi/numa/Kconfig | 5 +---- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5d91259ee7b5..5079ad4e21a5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1484,7 +1484,6 @@ config HOTPLUG_CPU config NUMA bool "NUMA Memory Allocation and Scheduler Support" select GENERIC_ARCH_NUMA - select ACPI_NUMA if ACPI select OF_NUMA select HAVE_SETUP_PER_CPU_AREA select NEED_PER_CPU_EMBED_FIRST_CHUNK diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index e38139c576ee..8d9e06e4ad84 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -470,7 +470,6 @@ config NR_CPUS config NUMA bool "NUMA Support" select SMP - select ACPI_NUMA if ACPI help Say Y to compile the kernel with NUMA (Non-Uniform Memory Access) support. This option improves performance on systems with more diff --git a/drivers/acpi/numa/Kconfig b/drivers/acpi/numa/Kconfig index 849c2bd820b9..f33194d1e43f 100644 --- a/drivers/acpi/numa/Kconfig +++ b/drivers/acpi/numa/Kconfig @@ -1,9 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 config ACPI_NUMA - bool "NUMA support" - depends on NUMA - depends on (X86 || ARM64 || LOONGARCH) - default y if ARM64 + def_bool NUMA && !X86 config ACPI_HMAT bool "ACPI Heterogeneous Memory Attribute Table Support" -- cgit From 5f76d4211ee44dcd9b2dc8a9065002bf8ecee81e Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Thu, 13 Jun 2024 16:54:36 +0800 Subject: ACPI: NUMA: replace pr_info with pr_debug in arch_acpi_numa_init There are lots of ACPI enabled systems that aren't NUMA and If the firmware didn't provide the SRAT/SLIT, then there will be a message "Failed to initialise from firmware" from arch_acpi_numa_init() which adding noise to the boot on all of those kind of systems. Replace the pr_info with pr_debug in arch_acpi_numa_init() to avoid it. Suggested-by: Sunil V L Signed-off-by: Haibo Xu Reviewed-by: Sunil V L Reviewed-by: Hanjun Guo Link: https://lore.kernel.org/r/109354315a02cd22145d2effa4a8c571b69d3e56.1718268003.git.haibo1.xu@intel.com Signed-off-by: Palmer Dabbelt --- drivers/base/arch_numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 5b59d133b6af..555aee3ee8e7 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -445,7 +445,7 @@ static int __init arch_acpi_numa_init(void) ret = acpi_numa_init(); if (ret) { - pr_info("Failed to initialise from firmware\n"); + pr_debug("Failed to initialise from firmware\n"); return ret; } -- cgit From 9a4ab167cfb1dea1df0c0c948205a62c7eb3b85b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 22 Jul 2024 15:59:28 +0200 Subject: ALSA: ump: Don't update FB name for static blocks When a device tries to update the FB name string even if its Endpoint is declared as static, we should skip it, just already done for the FB info update reply. 
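Reflowed for readability, the guard added below inside ump_handle_fb_name_msg() amounts to (the debug message identifying the block is omitted here):

	if (ump->parsed &&
	    (ump->info.flags & SNDRV_UMP_EP_INFO_STATIC_BLOCKS))
		return 0;	/* endpoint declared static: keep the cached FB name */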
Fixes: 37e0e14128e0 ("ALSA: ump: Support UMP Endpoint and Function Block parsing") Cc: Link: https://patch.msgid.link/20240722135929.8612-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/ump.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/core/ump.c b/sound/core/ump.c index 3f61220c23b4..b325fcfa77d0 100644 --- a/sound/core/ump.c +++ b/sound/core/ump.c @@ -806,6 +806,13 @@ static int ump_handle_fb_name_msg(struct snd_ump_endpoint *ump, if (!fb) return -ENODEV; + if (ump->parsed && + (ump->info.flags & SNDRV_UMP_EP_INFO_STATIC_BLOCKS)) { + ump_dbg(ump, "Skipping static FB name update (blk#%d)\n", + fb->info.block_id); + return 0; + } + ret = ump_append_string(ump, fb->info.name, sizeof(fb->info.name), buf->raw, 3); /* notify the FB name update to sequencer, too */ -- cgit From ac29d8ae05b770ed3f52d7a60908ab9b126f69d7 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 22 Jul 2024 16:06:06 +0200 Subject: ALSA: ump: Force 1 Group for MIDI1 FBs When a Function Block declares it being a legacy MIDI1 device, it has to be only with a single UMP Group. Correct the attribute when a device declares it wrongly. Fixes: 37e0e14128e0 ("ALSA: ump: Support UMP Endpoint and Function Block parsing") Cc: Link: https://patch.msgid.link/20240722140610.10845-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/ump.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/core/ump.c b/sound/core/ump.c index b325fcfa77d0..0f0d7e895c5a 100644 --- a/sound/core/ump.c +++ b/sound/core/ump.c @@ -733,6 +733,12 @@ static void fill_fb_info(struct snd_ump_endpoint *ump, info->block_id, info->direction, info->active, info->first_group, info->num_groups, info->midi_ci_version, info->sysex8_streams, info->flags); + + if ((info->flags & SNDRV_UMP_BLOCK_IS_MIDI1) && info->num_groups != 1) { + info->num_groups = 1; + ump_dbg(ump, "FB %d: corrected groups to 1 for MIDI1\n", + info->block_id); + } } /* check whether the FB info gets updated by the current message */ -- cgit From bacc15e010fc5a235fb2020b06a29a9961b5db82 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 19 Jul 2024 11:51:07 +0200 Subject: hid: bpf: add BPF_JIT dependency The module does not do anything when the JIT is disabled, but instead causes a warning: In file included from include/linux/bpf_verifier.h:7, from drivers/hid/bpf/hid_bpf_struct_ops.c:10: drivers/hid/bpf/hid_bpf_struct_ops.c: In function 'hid_bpf_struct_ops_init': include/linux/bpf.h:1853:50: error: statement with no effect [-Werror=unused-value] 1853 | #define register_bpf_struct_ops(st_ops, type) ({ (void *)(st_ops); 0; }) | ^~~~~~~~~~~~~~~~ drivers/hid/bpf/hid_bpf_struct_ops.c:305:16: note: in expansion of macro 'register_bpf_struct_ops' 305 | return register_bpf_struct_ops(&bpf_hid_bpf_ops, hid_bpf_ops); | ^~~~~~~~~~~~~~~~~~~~~~~ Add a Kconfig dependency to only allow building the HID-BPF support when a JIT is enabled. 
Fixes: ebc0d8093e8c ("HID: bpf: implement HID-BPF through bpf_struct_ops") Signed-off-by: Arnd Bergmann Link: https://patch.msgid.link/96a00b6f-eb81-4c67-8c4b-6b1f3f045034@app.fastmail.com Signed-off-by: Benjamin Tissoires --- drivers/hid/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/bpf/Kconfig b/drivers/hid/bpf/Kconfig index 83214bae6768..d65482e02a6c 100644 --- a/drivers/hid/bpf/Kconfig +++ b/drivers/hid/bpf/Kconfig @@ -3,7 +3,7 @@ menu "HID-BPF support" config HID_BPF bool "HID-BPF support" - depends on BPF + depends on BPF_JIT depends on BPF_SYSCALL depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS help -- cgit From b6f7d984ebf826069d3dc6fa187b4d1cfb90f965 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Mon, 15 Jul 2024 14:15:37 +0200 Subject: dt-bindings: display: panel: samsung,atna33xc20: Document ATNA45AF01 The Samsung ATNA45AF01 panel is an AMOLED eDP panel that has backlight control over the DP AUX channel. While it works almost correctly with the generic "edp-panel" compatible, the backlight needs special handling to work correctly. It is similar to the existing ATNA33XC20 panel, just with a larger resolution and size. Add a new "samsung,atna45af01" compatible to describe this panel in the DT. Use the existing "samsung,atna33xc20" as fallback compatible since existing drivers should work as-is, given that resolution and size are discoverable through the eDP link. Signed-off-by: Stephan Gerhold Acked-by: Conor Dooley Reviewed-by: Neil Armstrong Reviewed-by: Douglas Anderson Signed-off-by: Douglas Anderson Link: https://patchwork.freedesktop.org/patch/msgid/20240715-x1e80100-crd-backlight-v2-1-31b7f2f658a3@linaro.org --- .../devicetree/bindings/display/panel/samsung,atna33xc20.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml b/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml index 765ca155c83a..5192c93fbd67 100644 --- a/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml +++ b/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml @@ -14,7 +14,13 @@ allOf: properties: compatible: - const: samsung,atna33xc20 + oneOf: + # Samsung 13.3" FHD (1920x1080 pixels) eDP AMOLED panel + - const: samsung,atna33xc20 + # Samsung 14.5" WQXGA+ (2880x1800 pixels) eDP AMOLED panel + - items: + - const: samsung,atna45af01 + - const: samsung,atna33xc20 enable-gpios: true port: true -- cgit From facd40aa5c4699f94014012e4e58414c082f2c01 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 16 Jul 2024 16:19:19 +0200 Subject: timers/migration: Do not rely always on group->parent When reading group->parent without holding the group lock it is racy against CPUs coming online the first time and thereby creating another level of the hierarchy. This is not a problem when this value is read once to decide whether to abort a propagation or not. The worst outcome is an unnecessary/early CPU wake up. But it is racy when reading it several times during a single 'action' (like activation, deactivation, checking for remote timer expiry,...) and relying on the consitency of this value without holding the lock. This happens at the moment e.g. in tmigr_inactive_up() which is also calling tmigr_udpate_events(). Code relys on group->parent not to change during this 'action'. Update parent struct member description to explain the above only once. 
Remove parent pointer checks when they are not mandatory (like update of data->childmask). Remove a warning, which would be nice but the trigger of this warning is not reliable and add expand the data structure member description instead. Expand a comment, why it is safe to rely on parent pointer here (inside hierarchy update). Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model") Reported-by: Borislav Petkov Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-1-757baa7803fe@linutronix.de --- kernel/time/timer_migration.c | 33 +++++++++++++++------------------ kernel/time/timer_migration.h | 12 +++++++++++- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 84413114db5c..d91efe1dc3bf 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -507,7 +507,14 @@ static void walk_groups(up_f up, void *data, struct tmigr_cpu *tmc) * (get_next_timer_interrupt()) * @firstexp: Contains the first event expiry information when last * active CPU of hierarchy is on the way to idle to make - * sure CPU will be back in time. + * sure CPU will be back in time. It is updated in top + * level group only. Be aware, there could occur a new top + * level of the hierarchy between the 'top level call' in + * tmigr_update_events() and the check for the parent group + * in walk_groups(). Then @firstexp might contain a value + * != KTIME_MAX even if it was not the final top + * level. This is not a problem, as the worst outcome is a + * CPU which might wake up a little early. * @evt: Pointer to tmigr_event which needs to be queued (of idle * child group) * @childmask: childmask of child group @@ -649,7 +656,7 @@ static bool tmigr_active_up(struct tmigr_group *group, } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); - if ((walk_done == false) && group->parent) + if (walk_done == false) data->childmask = group->childmask; /* @@ -1317,20 +1324,9 @@ static bool tmigr_inactive_up(struct tmigr_group *group, /* Event Handling */ tmigr_update_events(group, child, data); - if (group->parent && (walk_done == false)) + if (walk_done == false) data->childmask = group->childmask; - /* - * data->firstexp was set by tmigr_update_events() and contains the - * expiry of the first global event which needs to be handled. It - * differs from KTIME_MAX if: - * - group is the top level group and - * - group is idle (which means CPU was the last active CPU in the - * hierarchy) and - * - there is a pending event in the hierarchy - */ - WARN_ON_ONCE(data->firstexp != KTIME_MAX && group->parent); - trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); return walk_done; @@ -1552,10 +1548,11 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, data.childmask = child->childmask; /* - * There is only one new level per time. When connecting the - * child and the parent and set the child active when the parent - * is inactive, the parent needs to be the uppermost - * level. Otherwise there went something wrong! + * There is only one new level per time (which is protected by + * tmigr_mutex). When connecting the child and the parent and + * set the child active when the parent is inactive, the parent + * needs to be the uppermost level. Otherwise there went + * something wrong! 
*/ WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); } diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 6c37d94a37d9..494f68cc13f4 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -22,7 +22,17 @@ struct tmigr_event { * struct tmigr_group - timer migration hierarchy group * @lock: Lock protecting the event information and group hierarchy * information during setup - * @parent: Pointer to the parent group + * @parent: Pointer to the parent group. Pointer is updated when a + * new hierarchy level is added because of a CPU coming + * online the first time. Once it is set, the pointer will + * not be removed or updated. When accessing parent pointer + * lock less to decide whether to abort a propagation or + * not, it is not a problem. The worst outcome is an + * unnecessary/early CPU wake up. But do not access parent + * pointer several times in the same 'action' (like + * activation, deactivation, check for remote expiry,...) + * without holding the lock as it is not ensured that value + * will not change. * @groupevt: Next event of the group which is only used when the * group is !active. The group event is then queued into * the parent timer queue. -- cgit From 10a0e6f3d3db7dcfe36e578923e5f038f1d2b72a Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 17 Jul 2024 11:49:40 +0200 Subject: timers/migration: Move hierarchy setup into cpuhotplug prepare callback When a CPU comes online the first time, it is possible that a new top level group will be created. In general all propagation is done from the bottom to top. This minimizes complexity and prevents possible races. But when a new top level group is created, the formely top level group needs to be connected to the new level. This is the only time, when the direction to propagate changes is changed: the changes are propagated from top (new top level group) to bottom (formerly top level group). This introduces two races (see (A) and (B)) as reported by Frederic: (A) This race happens, when marking the formely top level group as active, but the last active CPU of the formerly top level group goes idle. Then it's likely that formerly group is no longer active, but marked nevertheless as active in new top level group: [GRP0:0] migrator = 0 active = 0 nextevt = KTIME_MAX / \ 0 1 .. 7 active idle 0) Hierarchy has for now only 8 CPUs and CPU 0 is the only active CPU. [GRP1:0] migrator = TMIGR_NONE active = NONE nextevt = KTIME_MAX \ [GRP0:0] [GRP0:1] migrator = 0 migrator = TMIGR_NONE active = 0 active = NONE nextevt = KTIME_MAX nextevt = KTIME_MAX / \ 0 1 .. 7 8 active idle !online 1) CPU 8 is booting and creates a new group in first level GRP0:1 and therefore also a new top group GRP1:0. For now the setup code proceeded only until the connected between GRP0:1 to the new top group. The connection between CPU8 and GRP0:1 is not yet established and CPU 8 is still !online. [GRP1:0] migrator = TMIGR_NONE active = NONE nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = 0 migrator = TMIGR_NONE active = 0 active = NONE nextevt = KTIME_MAX nextevt = KTIME_MAX / \ 0 1 .. 7 8 active idle !online 2) Setup code now connects GRP0:0 to GRP1:0 and observes while in tmigr_connect_child_parent() that GRP0:0 is not TMIGR_NONE. So it prepares to call tmigr_active_up() on it. It hasn't done it yet. 
[GRP1:0] migrator = TMIGR_NONE active = NONE nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = TMIGR_NONE active = NONE active = NONE nextevt = KTIME_MAX nextevt = KTIME_MAX / \ 0 1 .. 7 8 idle idle !online 3) CPU 0 goes idle. Since GRP0:0->parent has been updated by CPU 8 with GRP0:0->lock held, CPU 0 observes GRP1:0 after calling tmigr_update_events() and it propagates the change to the top (no change there and no wakeup programmed since there is no timer). [GRP1:0] migrator = GRP0:0 active = GRP0:0 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = TMIGR_NONE active = NONE active = NONE nextevt = KTIME_MAX nextevt = KTIME_MAX / \ 0 1 .. 7 8 idle idle !online 4) Now the setup code finally calls tmigr_active_up() to and sets GRP0:0 active in GRP1:0 [GRP1:0] migrator = GRP0:0 active = GRP0:0, GRP0:1 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = 8 active = NONE active = 8 nextevt = KTIME_MAX nextevt = KTIME_MAX / \ | 0 1 .. 7 8 idle idle active 5) Now CPU 8 is connected with GRP0:1 and CPU 8 calls tmigr_active_up() out of tmigr_cpu_online(). [GRP1:0] migrator = GRP0:0 active = GRP0:0 nextevt = T8 / \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = TMIGR_NONE active = NONE active = NONE nextevt = KTIME_MAX nextevt = T8 / \ | 0 1 .. 7 8 idle idle idle 5) CPU 8 goes idle with a timer T8 and relies on GRP0:0 as the migrator. But it's not really active, so T8 gets ignored. --> The update which is done in third step is not noticed by setup code. So a wrong migrator is set to top level group and a timer could get ignored. (B) Reading group->parent and group->childmask when an hierarchy update is ongoing and reaches the formerly top level group is racy as those values could be inconsistent. (The notation of migrator and active now slightly changes in contrast to the above example, as now the childmasks are used.) [GRP1:0] migrator = TMIGR_NONE active = 0x00 nextevt = KTIME_MAX \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = TMIGR_NONE active = 0x00 active = 0x00 nextevt = KTIME_MAX nextevt = KTIME_MAX childmask= 0 childmask= 1 parent = NULL parent = GRP1:0 / \ 0 1 .. 7 8 idle idle !online childmask=1 1) Hierarchy has 8 CPUs. CPU 8 is at the moment in the process of onlining but did not yet connect GRP0:0 to GRP1:0. [GRP1:0] migrator = TMIGR_NONE active = 0x00 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = TMIGR_NONE migrator = TMIGR_NONE active = 0x00 active = 0x00 nextevt = KTIME_MAX nextevt = KTIME_MAX childmask= 0 childmask= 1 parent = GRP1:0 parent = GRP1:0 / \ 0 1 .. 7 8 idle idle !online childmask=1 2) Setup code (running on CPU 8) now connects GRP0:0 to GRP1:0, updates parent pointer of GRP0:0 and ... [GRP1:0] migrator = TMIGR_NONE active = 0x00 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = 0x01 migrator = TMIGR_NONE active = 0x01 active = 0x00 nextevt = KTIME_MAX nextevt = KTIME_MAX childmask= 0 childmask= 1 parent = GRP1:0 parent = GRP1:0 / \ 0 1 .. 7 8 active idle !online childmask=1 tmigr_walk.childmask = 0 3) ... CPU 0 comes active in the same time. As migrator in GRP0:0 was TMIGR_NONE, childmask of GRP0:0 is stored in update propagation data structure tmigr_walk (as update of childmask is not yet visible/updated). And now ... [GRP1:0] migrator = TMIGR_NONE active = 0x00 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = 0x01 migrator = TMIGR_NONE active = 0x01 active = 0x00 nextevt = KTIME_MAX nextevt = KTIME_MAX childmask= 2 childmask= 1 parent = GRP1:0 parent = GRP1:0 / \ 0 1 .. 
7 8 active idle !online childmask=1 tmigr_walk.childmask = 0 4) ... childmask of GRP0:0 is updated by CPU 8 (still part of setup code). [GRP1:0] migrator = 0x00 active = 0x00 nextevt = KTIME_MAX / \ [GRP0:0] [GRP0:1] migrator = 0x01 migrator = TMIGR_NONE active = 0x01 active = 0x00 nextevt = KTIME_MAX nextevt = KTIME_MAX childmask= 2 childmask= 1 parent = GRP1:0 parent = GRP1:0 / \ 0 1 .. 7 8 active idle !online childmask=1 tmigr_walk.childmask = 0 5) CPU 0 sees the connection to GRP1:0 and now propagates active state to GRP1:0 but with childmask = 0 as stored in propagation data structure. --> Now GRP1:0 always has a migrator as 0x00 != TMIGR_NONE and for all CPUs it looks like GRP1:0 is always active. To prevent those races, the setup of the hierarchy is moved into the cpuhotplug prepare callback. The prepare callback is not executed by the CPU which will come online, it is executed by the CPU which prepares onlining of the other CPU. This CPU is active while it is connecting the formerly top level to the new one. This prevents from (A) to happen and it also prevents from any further walk above the formerly top level until that active CPU becomes inactive, releasing the new ->parent and ->childmask updates to be visible by any subsequent walk up above the formerly top level hierarchy. This prevents from (B) to happen. The direction for the updates is now forced to look like "from bottom to top". However if the active CPU prevents from tmigr_cpu_(in)active() to walk up with the update not-or-half visible, nothing prevents walking up to the new top with a 0 childmask in tmigr_handle_remote_up() or tmigr_requires_handle_remote_up() if the active CPU doing the prepare is not the migrator. But then it looks fine because: * tmigr_check_migrator() should just return false * The migrator is active and should eventually observe the new childmask at some point in a future tick. Split setup functionality of online callback into the cpuhotplug prepare callback and setup hotplug state. Change init call into early_initcall() to make sure an already active CPU prepares everything for newly upcoming CPUs. Reorder the code, that all prepare related functions are close to each other and online and offline callbacks are also close together. 
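For orientation, a generic sketch of the cpuhotplug split this relies on; the names and dynamic states below are placeholders, while the patch itself registers the fixed CPUHP_TMIGR_PREPARE and CPUHP_AP_TMIGR_ONLINE states:

	#include <linux/cpuhotplug.h>
	#include <linux/init.h>

	/* runs on an already-online control CPU, before @cpu executes anything */
	static int example_prepare(unsigned int cpu)
	{
		/* safe place to allocate per-CPU data and extend the hierarchy */
		return 0;
	}

	/* runs on the incoming @cpu itself */
	static int example_online(unsigned int cpu)
	{
		/* only activate state that the prepare step already built */
		return 0;
	}

	static int __init example_init(void)
	{
		int ret;

		ret = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "example:prepare",
					example_prepare, NULL);
		if (ret < 0)
			return ret;

		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
					example_online, NULL);
		return ret < 0 ? ret : 0;
	}
	early_initcall(example_init);

Because the prepare callback is always executed by a CPU that is already active in the hierarchy, the new ->parent and ->childmask values only become visible to other walkers once that CPU goes inactive, which is exactly the ordering the changelog argues for.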
Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model") Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240717094940.18687-1-anna-maria@linutronix.de --- include/linux/cpuhotplug.h | 1 + kernel/time/timer_migration.c | 206 ++++++++++++++++++++++++------------------ 2 files changed, 120 insertions(+), 87 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 7a5785f405b6..df59666a2a66 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -122,6 +122,7 @@ enum cpuhp_state { CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, CPUHP_TIMERS_PREPARE, + CPUHP_TMIGR_PREPARE, CPUHP_MIPS_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index d91efe1dc3bf..867f0ecf2e74 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1438,6 +1438,66 @@ u64 tmigr_quick_check(u64 nextevt) return KTIME_MAX; } +/* + * tmigr_trigger_active() - trigger a CPU to become active again + * + * This function is executed on a CPU which is part of cpu_online_mask, when the + * last active CPU in the hierarchy is offlining. With this, it is ensured that + * the other CPU is active and takes over the migrator duty. + */ +static long tmigr_trigger_active(void *unused) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + + WARN_ON_ONCE(!tmc->online || tmc->idle); + + return 0; +} + +static int tmigr_cpu_offline(unsigned int cpu) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + int migrator; + u64 firstexp; + + raw_spin_lock_irq(&tmc->lock); + tmc->online = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* + * CPU has to handle the local events on his own, when on the way to + * offline; Therefore nextevt value is set to KTIME_MAX + */ + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); + trace_tmigr_cpu_offline(tmc); + raw_spin_unlock_irq(&tmc->lock); + + if (firstexp != KTIME_MAX) { + migrator = cpumask_any_but(cpu_online_mask, cpu); + work_on_cpu(migrator, tmigr_trigger_active, NULL); + } + + return 0; +} + +static int tmigr_cpu_online(unsigned int cpu) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + + /* Check whether CPU data was successfully initialized */ + if (WARN_ON_ONCE(!tmc->tmgroup)) + return -EINVAL; + + raw_spin_lock_irq(&tmc->lock); + trace_tmigr_cpu_online(tmc); + tmc->idle = timer_base_is_idle(); + if (!tmc->idle) + __tmigr_cpu_activate(tmc); + tmc->online = true; + raw_spin_unlock_irq(&tmc->lock); + return 0; +} + static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, int node) { @@ -1510,9 +1570,10 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, } static void tmigr_connect_child_parent(struct tmigr_group *child, - struct tmigr_group *parent) + struct tmigr_group *parent, + bool activate) { - union tmigr_state childstate; + struct tmigr_walk data; raw_spin_lock_irq(&child->lock); raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); @@ -1525,6 +1586,9 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, trace_tmigr_connect_child_parent(child); + if (!activate) + return; + /* * To prevent inconsistent states, active children need to be active in * the new parent as well. Inactive children are already marked inactive @@ -1540,22 +1604,24 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, * child to the new parent. 
So tmigr_connect_child_parent() is * executed with the formerly top level group (child) and the newly * created group (parent). + * + * * It is ensured that the child is active, as this setup path is + * executed in hotplug prepare callback. This is exectued by an + * already connected and !idle CPU. Even if all other CPUs go idle, + * the CPU executing the setup will be responsible up to current top + * level group. And the next time it goes inactive, it will release + * the new childmask and parent to subsequent walkers through this + * @child. Therefore propagate active state unconditionally. */ - childstate.state = atomic_read(&child->migr_state); - if (childstate.migrator != TMIGR_NONE) { - struct tmigr_walk data; - - data.childmask = child->childmask; + data.childmask = child->childmask; - /* - * There is only one new level per time (which is protected by - * tmigr_mutex). When connecting the child and the parent and - * set the child active when the parent is inactive, the parent - * needs to be the uppermost level. Otherwise there went - * something wrong! - */ - WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); - } + /* + * There is only one new level per time (which is protected by + * tmigr_mutex). When connecting the child and the parent and set the + * child active when the parent is inactive, the parent needs to be the + * uppermost level. Otherwise there went something wrong! + */ + WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); } static int tmigr_setup_groups(unsigned int cpu, unsigned int node) @@ -1608,7 +1674,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) * Update tmc -> group / child -> group connection */ if (i == 0) { - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); raw_spin_lock_irq(&group->lock); @@ -1623,7 +1689,8 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) continue; } else { child = stack[i - 1]; - tmigr_connect_child_parent(child, group); + /* Will be activated at online time */ + tmigr_connect_child_parent(child, group, false); } /* check if uppermost level was newly created */ @@ -1634,12 +1701,21 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) lvllist = &tmigr_level_list[top]; if (group->num_children == 1 && list_is_singular(lvllist)) { + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. + */ + WARN_ON_ONCE(cpu == raw_smp_processor_id()); + lvllist = &tmigr_level_list[top - 1]; list_for_each_entry(child, lvllist, list) { if (child->parent) continue; - tmigr_connect_child_parent(child, group); + tmigr_connect_child_parent(child, group, true); } } } @@ -1661,80 +1737,31 @@ static int tmigr_add_cpu(unsigned int cpu) return ret; } -static int tmigr_cpu_online(unsigned int cpu) +static int tmigr_cpu_prepare(unsigned int cpu) { - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - int ret; - - /* First online attempt? 
Initialize CPU data */ - if (!tmc->tmgroup) { - raw_spin_lock_init(&tmc->lock); - - ret = tmigr_add_cpu(cpu); - if (ret < 0) - return ret; - - if (tmc->childmask == 0) - return -EINVAL; - - timerqueue_init(&tmc->cpuevt.nextevt); - tmc->cpuevt.nextevt.expires = KTIME_MAX; - tmc->cpuevt.ignore = true; - tmc->cpuevt.cpu = cpu; - - tmc->remote = false; - WRITE_ONCE(tmc->wakeup, KTIME_MAX); - } - raw_spin_lock_irq(&tmc->lock); - trace_tmigr_cpu_online(tmc); - tmc->idle = timer_base_is_idle(); - if (!tmc->idle) - __tmigr_cpu_activate(tmc); - tmc->online = true; - raw_spin_unlock_irq(&tmc->lock); - return 0; -} - -/* - * tmigr_trigger_active() - trigger a CPU to become active again - * - * This function is executed on a CPU which is part of cpu_online_mask, when the - * last active CPU in the hierarchy is offlining. With this, it is ensured that - * the other CPU is active and takes over the migrator duty. - */ -static long tmigr_trigger_active(void *unused) -{ - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); + int ret = 0; - WARN_ON_ONCE(!tmc->online || tmc->idle); - - return 0; -} - -static int tmigr_cpu_offline(unsigned int cpu) -{ - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - int migrator; - u64 firstexp; + /* Not first online attempt? */ + if (tmc->tmgroup) + return ret; - raw_spin_lock_irq(&tmc->lock); - tmc->online = false; + raw_spin_lock_init(&tmc->lock); + timerqueue_init(&tmc->cpuevt.nextevt); + tmc->cpuevt.nextevt.expires = KTIME_MAX; + tmc->cpuevt.ignore = true; + tmc->cpuevt.cpu = cpu; + tmc->remote = false; WRITE_ONCE(tmc->wakeup, KTIME_MAX); - /* - * CPU has to handle the local events on his own, when on the way to - * offline; Therefore nextevt value is set to KTIME_MAX - */ - firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); - trace_tmigr_cpu_offline(tmc); - raw_spin_unlock_irq(&tmc->lock); + ret = tmigr_add_cpu(cpu); + if (ret < 0) + return ret; - if (firstexp != KTIME_MAX) { - migrator = cpumask_any_but(cpu_online_mask, cpu); - work_on_cpu(migrator, tmigr_trigger_active, NULL); - } + if (tmc->childmask == 0) + return -EINVAL; - return 0; + return ret; } static int __init tmigr_init(void) @@ -1793,6 +1820,11 @@ static int __init tmigr_init(void) tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, tmigr_crossnode_level); + ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare", + tmigr_cpu_prepare, NULL); + if (ret) + goto err; + ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", tmigr_cpu_online, tmigr_cpu_offline); if (ret) @@ -1804,4 +1836,4 @@ err: pr_err("Timer migration setup failed\n"); return ret; } -late_initcall(tmigr_init); +early_initcall(tmigr_init); -- cgit From 92506741521fd09dfaa9d6ef3c3620a9dd6bbafd Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 16 Jul 2024 16:19:21 +0200 Subject: timers/migration: Improve tracing Trace points of inactive and active propagation are located at the end of the related functions. The interesting information of those trace points is the updated group state. When trace points are not located directly at the place where group state changed, order of trace points in traces could be confusing. Move inactive and active propagation trace points directly after update of group state values. 
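[Editor's note: a minimal illustration of the intended ordering, simplified and not the exact kernel code; compute_newstate() is a hypothetical helper. The point is that the trace call sits directly after the atomic update that publishes the new group state, so the traced state always matches what was just written, even if later bookkeeping in the same function changes other fields.]

    do {
            newstate = compute_newstate(curstate);      /* hypothetical helper */
    } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state,
                                 newstate.state));

    /* Trace right here, before any further updates of the group */
    trace_tmigr_group_set_cpu_active(group, newstate, childmask);
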
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-3-757baa7803fe@linutronix.de --- kernel/time/timer_migration.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 867f0ecf2e74..4fbd9304e896 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -656,6 +656,8 @@ static bool tmigr_active_up(struct tmigr_group *group, } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); + trace_tmigr_group_set_cpu_active(group, newstate, childmask); + if (walk_done == false) data->childmask = group->childmask; @@ -673,8 +675,6 @@ static bool tmigr_active_up(struct tmigr_group *group, */ group->groupevt.ignore = true; - trace_tmigr_group_set_cpu_active(group, newstate, childmask); - return walk_done; } @@ -1306,9 +1306,10 @@ static bool tmigr_inactive_up(struct tmigr_group *group, WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); - if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, - newstate.state)) + if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) { + trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); break; + } /* * The memory barrier is paired with the cmpxchg() in @@ -1327,8 +1328,6 @@ static bool tmigr_inactive_up(struct tmigr_group *group, if (walk_done == false) data->childmask = group->childmask; - trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); - return walk_done; } -- cgit From 3ba111032bc1d8a0f04e6d2a5d8fb4ddc96eeae7 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 16 Jul 2024 16:19:22 +0200 Subject: timers/migration: Use a single struct for hierarchy walk data Two different structs are defined for propagating data from one to another level when walking the hierarchy. Several struct members exist in both structs which makes generalization harder. Merge those two structs into a single one and use it directly in walk_groups() and the corresponding function pointers instead of introducing pointer casting all over the place. 
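[Editor's note: a short sketch of the resulting call convention, taken in simplified form from the hunks below. The walk callbacks now take the merged struct tmigr_walk directly, so the former 'void *ptr' parameter and the cast at the top of every callback disappear.]

    /* One typed callback signature instead of 'void *' plus per-callback casts */
    typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *,
                         struct tmigr_walk *);

    static bool tmigr_new_timer_up(struct tmigr_group *group,
                                   struct tmigr_group *child,
                                   struct tmigr_walk *data)
    {
            /* previously: struct tmigr_walk *data = ptr; */
            return tmigr_update_events(group, child, data);
    }
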
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-4-757baa7803fe@linutronix.de --- kernel/time/timer_migration.c | 126 ++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 71 deletions(-) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 4fbd9304e896..9f0c284c440f 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -475,69 +475,31 @@ static bool tmigr_check_lonely(struct tmigr_group *group) return bitmap_weight(&active, BIT_CNT) <= 1; } -typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, void *); - -static void __walk_groups(up_f up, void *data, - struct tmigr_cpu *tmc) -{ - struct tmigr_group *child = NULL, *group = tmc->tmgroup; - - do { - WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); - - if (up(group, child, data)) - break; - - child = group; - group = group->parent; - } while (group); -} - -static void walk_groups(up_f up, void *data, struct tmigr_cpu *tmc) -{ - lockdep_assert_held(&tmc->lock); - - __walk_groups(up, data, tmc); -} - /** * struct tmigr_walk - data required for walking the hierarchy * @nextexp: Next CPU event expiry information which is handed into * the timer migration code by the timer code * (get_next_timer_interrupt()) - * @firstexp: Contains the first event expiry information when last - * active CPU of hierarchy is on the way to idle to make - * sure CPU will be back in time. It is updated in top - * level group only. Be aware, there could occur a new top - * level of the hierarchy between the 'top level call' in - * tmigr_update_events() and the check for the parent group - * in walk_groups(). Then @firstexp might contain a value - * != KTIME_MAX even if it was not the final top - * level. This is not a problem, as the worst outcome is a - * CPU which might wake up a little early. + * @firstexp: Contains the first event expiry information when + * hierarchy is completely idle. When CPU itself was the + * last going idle, information makes sure, that CPU will + * be back in time. When using this value in the remote + * expiry case, firstexp is stored in the per CPU tmigr_cpu + * struct of CPU which expires remote timers. It is updated + * in top level group only. Be aware, there could occur a + * new top level of the hierarchy between the 'top level + * call' in tmigr_update_events() and the check for the + * parent group in walk_groups(). Then @firstexp might + * contain a value != KTIME_MAX even if it was not the + * final top level. This is not a problem, as the worst + * outcome is a CPU which might wake up a little early. 
* @evt: Pointer to tmigr_event which needs to be queued (of idle * child group) * @childmask: childmask of child group * @remote: Is set, when the new timer path is executed in * tmigr_handle_remote_cpu() - */ -struct tmigr_walk { - u64 nextexp; - u64 firstexp; - struct tmigr_event *evt; - u8 childmask; - bool remote; -}; - -/** - * struct tmigr_remote_data - data required for remote expiry hierarchy walk * @basej: timer base in jiffies * @now: timer base monotonic - * @firstexp: returns expiry of the first timer in the idle timer - * migration hierarchy to make sure the timer is handled in - * time; it is stored in the per CPU tmigr_cpu struct of - * CPU which expires remote timers - * @childmask: childmask of child group * @check: is set if there is the need to handle remote timers; * required in tmigr_requires_handle_remote() only * @tmc_active: this flag indicates, whether the CPU which triggers @@ -546,15 +508,43 @@ struct tmigr_walk { * idle, only the first event of the top level has to be * considered. */ -struct tmigr_remote_data { - unsigned long basej; - u64 now; - u64 firstexp; - u8 childmask; - bool check; - bool tmc_active; +struct tmigr_walk { + u64 nextexp; + u64 firstexp; + struct tmigr_event *evt; + u8 childmask; + bool remote; + unsigned long basej; + u64 now; + bool check; + bool tmc_active; }; +typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); + +static void __walk_groups(up_f up, struct tmigr_walk *data, + struct tmigr_cpu *tmc) +{ + struct tmigr_group *child = NULL, *group = tmc->tmgroup; + + do { + WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); + + if (up(group, child, data)) + break; + + child = group; + group = group->parent; + } while (group); +} + +static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) +{ + lockdep_assert_held(&tmc->lock); + + __walk_groups(up, data, tmc); +} + /* * Returns the next event of the timerqueue @group->events * @@ -625,10 +615,9 @@ static u64 tmigr_next_groupevt_expires(struct tmigr_group *group) static bool tmigr_active_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { union tmigr_state curstate, newstate; - struct tmigr_walk *data = ptr; bool walk_done; u8 childmask; @@ -867,10 +856,8 @@ unlock: static bool tmigr_new_timer_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_walk *data = ptr; - return tmigr_update_events(group, child, data); } @@ -1002,9 +989,8 @@ unlock: static bool tmigr_handle_remote_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_remote_data *data = ptr; struct tmigr_event *evt; unsigned long jif; u8 childmask; @@ -1062,7 +1048,7 @@ again: void tmigr_handle_remote(void) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - struct tmigr_remote_data data; + struct tmigr_walk data; if (tmigr_is_not_available(tmc)) return; @@ -1104,9 +1090,8 @@ void tmigr_handle_remote(void) static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_remote_data *data = ptr; u8 childmask; childmask = data->childmask; @@ -1164,7 +1149,7 @@ out: bool tmigr_requires_handle_remote(void) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - struct tmigr_remote_data data; + struct tmigr_walk data; unsigned long jif; bool ret = false; @@ -1252,10 +1237,9 @@ u64 tmigr_cpu_new_timer(u64 nextexp) static bool 
tmigr_inactive_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { union tmigr_state curstate, newstate, childstate; - struct tmigr_walk *data = ptr; bool walk_done; u8 childmask; -- cgit From d47be589844224a3ef13b55ff6f15211ab20f1d1 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 16 Jul 2024 16:19:23 +0200 Subject: timers/migration: Read childmask and parent pointer in a single place Reading the childmask and parent pointer is required when propagating changes through the hierarchy. At the moment this reads are spread all over the place which makes it harder to follow. Move those reads to a single place to keep code clean. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-5-757baa7803fe@linutronix.de --- kernel/time/timer_migration.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 9f0c284c440f..f5652b0aa90e 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -535,6 +535,7 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, child = group; group = group->parent; + data->childmask = child->childmask; } while (group); } @@ -647,9 +648,6 @@ static bool tmigr_active_up(struct tmigr_group *group, trace_tmigr_group_set_cpu_active(group, newstate, childmask); - if (walk_done == false) - data->childmask = group->childmask; - /* * The group is active (again). The group event might be still queued * into the parent group's timerqueue but can now be handled by the @@ -1027,12 +1025,10 @@ again: } /* - * Update of childmask for the next level and keep track of the expiry - * of the first event that needs to be handled (group->next_expiry was - * updated by tmigr_next_expired_groupevt(), next was set by - * tmigr_handle_remote_cpu()). + * Keep track of the expiry of the first event that needs to be handled + * (group->next_expiry was updated by tmigr_next_expired_groupevt(), + * next was set by tmigr_handle_remote_cpu()). */ - data->childmask = group->childmask; data->firstexp = group->next_expiry; raw_spin_unlock_irq(&group->lock); @@ -1110,7 +1106,7 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, * group before reading the next_expiry value. */ if (group->parent && !data->tmc_active) - goto out; + return false; /* * The lock is required on 32bit architectures to read the variable @@ -1135,9 +1131,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, raw_spin_unlock(&group->lock); } -out: - /* Update of childmask for the next level */ - data->childmask = group->childmask; return false; } @@ -1309,9 +1302,6 @@ static bool tmigr_inactive_up(struct tmigr_group *group, /* Event Handling */ tmigr_update_events(group, child, data); - if (walk_done == false) - data->childmask = group->childmask; - return walk_done; } -- cgit From 835a9a67f54f01033417a254e53a1391f99db708 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 16 Jul 2024 16:19:24 +0200 Subject: timers/migration: Rename childmask by groupmask to make naming more obvious childmask in the group reflects the mask that is required to 'reference' this group in the parent. When reading childmask, this might be confusing, as this suggests, that this is the mask of the child of the group. Clarify this by renaming childmask in the tmigr_group and tmc_group by groupmask. 
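[Editor's note: an illustrative reading of the renamed field, not verbatim kernel code. A group's groupmask is the bit by which its parent refers to it, assigned when the group (or a tmigr_cpu) is connected to that parent, and it is the bit tested in the parent's active/migrator state.]

    child->parent    = parent;
    child->groupmask = BIT(parent->num_children++);

    /* Later, the parent's state is queried with exactly that bit: */
    if (curstate.active & child->groupmask)
            /* this child is marked active in the parent */;
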
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-6-757baa7803fe@linutronix.de --- include/trace/events/timer_migration.h | 16 ++++++++-------- kernel/time/timer_migration.c | 24 ++++++++++++------------ kernel/time/timer_migration.h | 15 +++++++-------- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h index 79f19e76a80b..47db5eaf2f9a 100644 --- a/include/trace/events/timer_migration.h +++ b/include/trace/events/timer_migration.h @@ -43,7 +43,7 @@ TRACE_EVENT(tmigr_connect_child_parent, __field( unsigned int, lvl ) __field( unsigned int, numa_node ) __field( unsigned int, num_children ) - __field( u32, childmask ) + __field( u32, groupmask ) ), TP_fast_assign( @@ -52,11 +52,11 @@ TRACE_EVENT(tmigr_connect_child_parent, __entry->lvl = child->parent->level; __entry->numa_node = child->parent->numa_node; __entry->num_children = child->parent->num_children; - __entry->childmask = child->childmask; + __entry->groupmask = child->groupmask; ), - TP_printk("group=%p childmask=%0x parent=%p lvl=%d numa=%d num_children=%d", - __entry->child, __entry->childmask, __entry->parent, + TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d", + __entry->child, __entry->groupmask, __entry->parent, __entry->lvl, __entry->numa_node, __entry->num_children) ); @@ -72,7 +72,7 @@ TRACE_EVENT(tmigr_connect_cpu_parent, __field( unsigned int, lvl ) __field( unsigned int, numa_node ) __field( unsigned int, num_children ) - __field( u32, childmask ) + __field( u32, groupmask ) ), TP_fast_assign( @@ -81,11 +81,11 @@ TRACE_EVENT(tmigr_connect_cpu_parent, __entry->lvl = tmc->tmgroup->level; __entry->numa_node = tmc->tmgroup->numa_node; __entry->num_children = tmc->tmgroup->num_children; - __entry->childmask = tmc->childmask; + __entry->groupmask = tmc->groupmask; ), - TP_printk("cpu=%d childmask=%0x parent=%p lvl=%d numa=%d num_children=%d", - __entry->cpu, __entry->childmask, __entry->parent, + TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d", + __entry->cpu, __entry->groupmask, __entry->parent, __entry->lvl, __entry->numa_node, __entry->num_children) ); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index f5652b0aa90e..ca76120ee7c6 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -495,7 +495,7 @@ static bool tmigr_check_lonely(struct tmigr_group *group) * outcome is a CPU which might wake up a little early. 
* @evt: Pointer to tmigr_event which needs to be queued (of idle * child group) - * @childmask: childmask of child group + * @childmask: groupmask of child group * @remote: Is set, when the new timer path is executed in * tmigr_handle_remote_cpu() * @basej: timer base in jiffies @@ -535,7 +535,7 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, child = group; group = group->parent; - data->childmask = child->childmask; + data->childmask = child->groupmask; } while (group); } @@ -669,7 +669,7 @@ static void __tmigr_cpu_activate(struct tmigr_cpu *tmc) { struct tmigr_walk data; - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; trace_tmigr_cpu_active(tmc); @@ -1049,7 +1049,7 @@ void tmigr_handle_remote(void) if (tmigr_is_not_available(tmc)) return; - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; /* @@ -1057,7 +1057,7 @@ void tmigr_handle_remote(void) * in tmigr_handle_remote_up() anyway. Keep this check to speed up the * return when nothing has to be done. */ - if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) { + if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) { /* * If this CPU was an idle migrator, make sure to clear its wakeup * value so it won't chase timers that have already expired elsewhere. @@ -1150,7 +1150,7 @@ bool tmigr_requires_handle_remote(void) return ret; data.now = get_jiffies_update(&jif); - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; data.tmc_active = !tmc->idle; data.check = false; @@ -1310,7 +1310,7 @@ static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp) struct tmigr_walk data = { .nextexp = nextexp, .firstexp = KTIME_MAX, .evt = &tmc->cpuevt, - .childmask = tmc->childmask }; + .childmask = tmc->groupmask }; /* * If nextexp is KTIME_MAX, the CPU event will be ignored because the @@ -1388,7 +1388,7 @@ u64 tmigr_quick_check(u64 nextevt) if (WARN_ON_ONCE(tmc->idle)) return nextevt; - if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->childmask)) + if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask)) return KTIME_MAX; do { @@ -1552,7 +1552,7 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); child->parent = parent; - child->childmask = BIT(parent->num_children++); + child->groupmask = BIT(parent->num_children++); raw_spin_unlock(&parent->lock); raw_spin_unlock_irq(&child->lock); @@ -1586,7 +1586,7 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, * the new childmask and parent to subsequent walkers through this * @child. Therefore propagate active state unconditionally. 
*/ - data.childmask = child->childmask; + data.childmask = child->groupmask; /* * There is only one new level per time (which is protected by @@ -1652,7 +1652,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) raw_spin_lock_irq(&group->lock); tmc->tmgroup = group; - tmc->childmask = BIT(group->num_children++); + tmc->groupmask = BIT(group->num_children++); raw_spin_unlock_irq(&group->lock); @@ -1731,7 +1731,7 @@ static int tmigr_cpu_prepare(unsigned int cpu) if (ret < 0) return ret; - if (tmc->childmask == 0) + if (tmc->groupmask == 0) return -EINVAL; return ret; diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 494f68cc13f4..154accc7a543 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -51,9 +51,8 @@ struct tmigr_event { * @num_children: Counter of group children to make sure the group is only * filled with TMIGR_CHILDREN_PER_GROUP; Required for setup * only - * @childmask: childmask of the group in the parent group; is set - * during setup and will never change; can be read - * lockless + * @groupmask: mask of the group in the parent group; is set during + * setup and will never change; can be read lockless * @list: List head that is added to the per level * tmigr_level_list; is required during setup when a * new group needs to be connected to the existing @@ -69,7 +68,7 @@ struct tmigr_group { unsigned int level; int numa_node; unsigned int num_children; - u8 childmask; + u8 groupmask; struct list_head list; }; @@ -89,7 +88,7 @@ struct tmigr_group { * hierarchy * @remote: Is set when timers of the CPU are expired remotely * @tmgroup: Pointer to the parent group - * @childmask: childmask of tmigr_cpu in the parent group + * @groupmask: mask of tmigr_cpu in the parent group * @wakeup: Stores the first timer when the timer migration * hierarchy is completely idle and remote expiry was done; * is returned to timer code in the idle path and is only @@ -102,7 +101,7 @@ struct tmigr_cpu { bool idle; bool remote; struct tmigr_group *tmgroup; - u8 childmask; + u8 groupmask; u64 wakeup; struct tmigr_event cpuevt; }; @@ -118,8 +117,8 @@ union tmigr_state { u32 state; /** * struct - split state of tmigr_group - * @active: Contains each childmask bit of the active children - * @migrator: Contains childmask of the child which is migrator + * @active: Contains each mask bit of the active children + * @migrator: Contains mask of the child which is migrator * @seq: Sequence counter needs to be increased when an update * to the tmigr_state is done. It prevents a race when * updates in the child groups are propagated in changed -- cgit From 2367e28e231af05243b92325de9a38956ad0b565 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 16 Jul 2024 16:19:25 +0200 Subject: timers/migration: Spare write when nothing changed The wakeup value is written unconditionally in tmigr_cpu_new_timer(). When there was no new next timer expiry that needs to be propagated, then the value that was read before is written. This is not required. Move the write to the place where wakeup value is changed changed. 
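[Editor's note: simplified sketch of the resulting flow in tmigr_cpu_new_timer(), mirroring the hunk below. The wakeup store now only happens on the path where a new value was actually computed.]

    if (nextexp != tmc->cpuevt.nextevt.expires || tmc->cpuevt.ignore) {
            ret = tmigr_new_timer(tmc, nextexp);
            /* Make sure idle-path timer reevaluation will not miss an event */
            WRITE_ONCE(tmc->wakeup, ret);
    }
    /* otherwise tmc->wakeup keeps the previously stored value */
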
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-7-757baa7803fe@linutronix.de --- kernel/time/timer_migration.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index ca76120ee7c6..9c15ae8d8a25 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1215,14 +1215,13 @@ u64 tmigr_cpu_new_timer(u64 nextexp) if (nextexp != tmc->cpuevt.nextevt.expires || tmc->cpuevt.ignore) { ret = tmigr_new_timer(tmc, nextexp); + /* + * Make sure the reevaluation of timers in idle path + * will not miss an event. + */ + WRITE_ONCE(tmc->wakeup, ret); } } - /* - * Make sure the reevaluation of timers in idle path will not miss an - * event. - */ - WRITE_ONCE(tmc->wakeup, ret); - trace_tmigr_cpu_new_timer_idle(tmc, nextexp); raw_spin_unlock(&tmc->lock); return ret; -- cgit From f004bf9de057004f7ccea4239317aec2fbd8240b Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 16 Jul 2024 16:19:26 +0200 Subject: timers/migration: Fix grammar in comment Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-8-757baa7803fe@linutronix.de --- kernel/time/timer_migration.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 9c15ae8d8a25..8d57f7686bb0 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1368,7 +1368,7 @@ u64 tmigr_cpu_deactivate(u64 nextexp) * the only one in the level 0 group; and if it is the * only one in level 0 group, but there are more than a * single group active on the way to top level) - * * nextevt - when CPU is offline and has to handle timer on his own + * * nextevt - when CPU is offline and has to handle timer on its own * or when on the way to top in every group only a single * child is active but @nextevt is before the lowest * next_expiry encountered while walking up to top level. -- cgit From 3d8d459c8ba282a67df3b518c5f80bcb217a5b38 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 17 Jul 2024 09:54:38 +0100 Subject: RISC-V: hwprobe: sort EXT_KEY()s in hwprobe_isa_ext0() alphabetically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the entries appear to be in a random order (although according to Palmer he has tried to sort them by key value) which makes it harder to find entries in a growing list, and more likely to have conflicts as all patches are adding to the end of the list. Sort them alphabetically instead. Signed-off-by: Conor Dooley Reviewed-by: Clément Léger Link: https://lore.kernel.org/r/20240717-dedicate-squeamish-7e4ab54df58f@spud Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sys_hwprobe.c | 43 ++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 685594769535..8a1c9ce170e8 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -93,44 +93,45 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, * regardless of the kernel's configuration, as no other checks, besides * presence in the hart_isa bitmap, are made. 
*/ + EXT_KEY(ZACAS); + EXT_KEY(ZAWRS); EXT_KEY(ZBA); EXT_KEY(ZBB); - EXT_KEY(ZBS); - EXT_KEY(ZICBOZ); EXT_KEY(ZBC); - EXT_KEY(ZBKB); EXT_KEY(ZBKC); EXT_KEY(ZBKX); + EXT_KEY(ZBS); + EXT_KEY(ZCA); + EXT_KEY(ZCB); + EXT_KEY(ZCMOP); + EXT_KEY(ZICBOZ); + EXT_KEY(ZICOND); + EXT_KEY(ZIHINTNTL); + EXT_KEY(ZIHINTPAUSE); + EXT_KEY(ZIMOP); EXT_KEY(ZKND); EXT_KEY(ZKNE); EXT_KEY(ZKNH); EXT_KEY(ZKSED); EXT_KEY(ZKSH); EXT_KEY(ZKT); - EXT_KEY(ZIHINTNTL); EXT_KEY(ZTSO); - EXT_KEY(ZACAS); - EXT_KEY(ZICOND); - EXT_KEY(ZIHINTPAUSE); - EXT_KEY(ZIMOP); - EXT_KEY(ZCA); - EXT_KEY(ZCB); - EXT_KEY(ZCMOP); - EXT_KEY(ZAWRS); /* * All the following extensions must depend on the kernel * support of V. */ if (has_vector()) { - EXT_KEY(ZVE32X); - EXT_KEY(ZVE32F); - EXT_KEY(ZVE64X); - EXT_KEY(ZVE64F); - EXT_KEY(ZVE64D); EXT_KEY(ZVBB); EXT_KEY(ZVBC); + EXT_KEY(ZVE32F); + EXT_KEY(ZVE32X); + EXT_KEY(ZVE64D); + EXT_KEY(ZVE64F); + EXT_KEY(ZVE64X); + EXT_KEY(ZVFH); + EXT_KEY(ZVFHMIN); EXT_KEY(ZVKB); EXT_KEY(ZVKG); EXT_KEY(ZVKNED); @@ -139,16 +140,14 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, EXT_KEY(ZVKSED); EXT_KEY(ZVKSH); EXT_KEY(ZVKT); - EXT_KEY(ZVFH); - EXT_KEY(ZVFHMIN); } if (has_fpu()) { - EXT_KEY(ZFH); - EXT_KEY(ZFHMIN); - EXT_KEY(ZFA); EXT_KEY(ZCD); EXT_KEY(ZCF); + EXT_KEY(ZFA); + EXT_KEY(ZFH); + EXT_KEY(ZFHMIN); } #undef EXT_KEY } -- cgit From 82b461680651ac452fa773f271000ddbc4e4db14 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 17 Jul 2024 14:29:24 +0100 Subject: RISC-V: run savedefconfig for defconfig It's been a while since this was run, and there's a few things that have changed. Firstly, almost all of the Renesas stuff vanishes because the config for the RZ/Five is gated behind NONPORTABLE. Several options (like CONFIG_PM) are removed as they are the default values. To retain DEFVFREQ_THERMAL and BLK_DEV_THROTTLING, add PM_DEVFREQ and BLK_CGROUP respectively. 
Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20240717-shrubs-concise-51600886babf@spud Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 3f1f055866af..0d678325444f 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -7,6 +7,7 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_CGROUPS=y CONFIG_MEMCG=y +CONFIG_BLK_CGROUP=y CONFIG_CGROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y @@ -35,9 +36,6 @@ CONFIG_ARCH_THEAD=y CONFIG_ARCH_VIRT=y CONFIG_ARCH_CANAAN=y CONFIG_SMP=y -CONFIG_HOTPLUG_CPU=y -CONFIG_PM=y -CONFIG_CPU_IDLE=y CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_STAT=y CONFIG_CPU_FREQ_GOV_POWERSAVE=m @@ -52,13 +50,11 @@ CONFIG_ACPI=y CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_SPARSEMEM_MANUAL=y CONFIG_BLK_DEV_THROTTLING=y +CONFIG_SPARSEMEM_MANUAL=y CONFIG_NET=y CONFIG_PACKET=y -CONFIG_UNIX=y CONFIG_XFRM_USER=m -CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_PNP=y @@ -102,9 +98,9 @@ CONFIG_NET_SCHED=y CONFIG_NET_CLS_CGROUP=m CONFIG_NETLINK_DIAG=y CONFIG_CGROUP_NET_PRIO=y +CONFIG_CAN=m CONFIG_NET_9P=y CONFIG_NET_9P_VIRTIO=y -CONFIG_CAN=m CONFIG_PCI=y CONFIG_PCIEPORTBUS=y CONFIG_PCI_HOST_GENERIC=y @@ -153,8 +149,8 @@ CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_DW=y CONFIG_SERIAL_OF_PLATFORM=y -CONFIG_SERIAL_SH_SCI=y CONFIG_SERIAL_EARLYCON_RISCV_SBI=y +CONFIG_SERIAL_SH_SCI=y CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM=y CONFIG_HW_RANDOM_VIRTIO=y @@ -179,7 +175,6 @@ CONFIG_DEVFREQ_THERMAL=y CONFIG_RZG2L_THERMAL=y CONFIG_WATCHDOG=y CONFIG_SUNXI_WATCHDOG=y -CONFIG_RENESAS_RZG2LWDT=y CONFIG_MFD_AXP20X_I2C=y CONFIG_REGULATOR=y CONFIG_REGULATOR_FIXED_VOLTAGE=y @@ -193,11 +188,9 @@ CONFIG_DRM_NOUVEAU=m CONFIG_DRM_SUN4I=m CONFIG_DRM_VIRTIO_GPU=m CONFIG_FB=y -CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_SOUND=y CONFIG_SND=y CONFIG_SND_SOC=y -CONFIG_SND_SOC_RZ=m CONFIG_SND_DESIGNWARE_I2S=m CONFIG_SND_SOC_STARFIVE=m CONFIG_SND_SOC_JH7110_PWMDAC=m @@ -239,34 +232,31 @@ CONFIG_USB_CONFIGFS_F_FS=y CONFIG_MMC=y CONFIG_MMC_SDHCI=y CONFIG_MMC_SDHCI_PLTFM=y -CONFIG_MMC_SDHCI_CADENCE=y CONFIG_MMC_SDHCI_OF_DWCMSHC=y +CONFIG_MMC_SDHCI_CADENCE=y CONFIG_MMC_SPI=y +CONFIG_MMC_SDHI=y CONFIG_MMC_DW=y CONFIG_MMC_DW_STARFIVE=y -CONFIG_MMC_SDHI=y CONFIG_MMC_SUNXI=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_SUN6I=y CONFIG_DMADEVICES=y CONFIG_DMA_SUN6I=m CONFIG_DW_AXI_DMAC=y -CONFIG_RZ_DMAC=y CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_BALLOON=y CONFIG_VIRTIO_INPUT=y CONFIG_VIRTIO_MMIO=y -CONFIG_RENESAS_OSTM=y CONFIG_CLK_SOPHGO_CV1800=y CONFIG_SUN8I_DE2_CCU=m +CONFIG_RENESAS_OSTM=y CONFIG_SUN50I_IOMMU=y CONFIG_RPMSG_CHAR=y CONFIG_RPMSG_CTRL=y CONFIG_RPMSG_VIRTIO=y -CONFIG_ARCH_R9A07G043=y +CONFIG_PM_DEVFREQ=y CONFIG_IIO=y -CONFIG_RZG2L_ADC=m -CONFIG_RESET_RZG2L_USBPHY_CTRL=y CONFIG_PHY_SUN4I_USB=m CONFIG_PHY_RCAR_GEN3_USB2=y CONFIG_PHY_STARFIVE_JH7110_DPHY_RX=m -- cgit From 0e91ac701c69c44840e8d36566604cac1b638880 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 19 Jul 2024 08:10:27 -0700 Subject: clk: T-Head: Disable on 32-bit Targets This fails to build on 32-bit targets because of a missing __udivdi3. IIRC the right way to fix that is to avoid the division, but I just want a tree that builds and the only real T-Head platforms are 64-bit right now. 
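[Editor's note: a hedged illustration of why this only breaks 32-bit builds; the variable names are made up. A plain 64-by-64 division in C is lowered to a libgcc call on rv32, and the kernel does not provide libgcc's __udivdi3. The usual in-tree alternative would be the math64 helpers, but gating the driver to 64BIT side-steps the problem entirely.]

    #include <linux/math64.h>

    u64 rate, parent_rate, div;

    rate = parent_rate / div;           /* rv32: emits a call to __udivdi3 */
    rate = div64_u64(parent_rate, div); /* portable kernel way to divide u64 */
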
Signed-off-by: Palmer Dabbelt Link: https://lore.kernel.org/r/20240719151027.16152-1-palmer@rivosinc.com Acked-by: Drew Fustini Signed-off-by: Stephen Boyd --- drivers/clk/thead/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clk/thead/Kconfig b/drivers/clk/thead/Kconfig index 1710d50bf9d4..95e0d9eb965e 100644 --- a/drivers/clk/thead/Kconfig +++ b/drivers/clk/thead/Kconfig @@ -3,6 +3,7 @@ config CLK_THEAD_TH1520_AP bool "T-HEAD TH1520 AP clock support" depends on ARCH_THEAD || COMPILE_TEST + depends on 64BIT default ARCH_THEAD select REGMAP_MMIO help -- cgit From 23c996fc2bc1978a02c64eddb90b4ab5d309c8df Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Fri, 19 Jul 2024 09:15:18 -0700 Subject: riscv: Extend cpufeature.c to detect vendor extensions Instead of grouping all vendor extensions into the same riscv_isa_ext that standard instructions use, create a struct "riscv_isa_vendor_ext_data_list" that allows each vendor to maintain their vendor extensions independently of the standard extensions. xandespmu is currently the only vendor extension so that is the only extension that is affected by this change. An additional benefit of this is that the extensions of each vendor can be conditionally enabled. A config RISCV_ISA_VENDOR_EXT_ANDES has been added to allow for that. Signed-off-by: Charlie Jenkins Reviewed-by: Conor Dooley Reviewed-by: Andy Chiu Tested-by: Yu Chien Peter Lin Reviewed-by: Yu Chien Peter Lin Link: https://lore.kernel.org/r/20240719-support_vendor_extensions-v3-1-0af7587bbec0@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 + arch/riscv/Kconfig.vendor | 19 +++ arch/riscv/errata/andes/errata.c | 3 + arch/riscv/errata/sifive/errata.c | 3 + arch/riscv/errata/thead/errata.c | 3 + arch/riscv/include/asm/cpufeature.h | 25 ++++ arch/riscv/include/asm/hwcap.h | 25 ++-- arch/riscv/include/asm/vendor_extensions.h | 49 ++++++++ arch/riscv/include/asm/vendor_extensions/andes.h | 19 +++ arch/riscv/kernel/Makefile | 2 + arch/riscv/kernel/cpufeature.c | 143 ++++++++++++++++------- arch/riscv/kernel/vendor_extensions.c | 56 +++++++++ arch/riscv/kernel/vendor_extensions/Makefile | 3 + arch/riscv/kernel/vendor_extensions/andes.c | 18 +++ drivers/perf/riscv_pmu_sbi.c | 10 +- 15 files changed, 324 insertions(+), 56 deletions(-) create mode 100644 arch/riscv/Kconfig.vendor create mode 100644 arch/riscv/include/asm/vendor_extensions.h create mode 100644 arch/riscv/include/asm/vendor_extensions/andes.h create mode 100644 arch/riscv/kernel/vendor_extensions.c create mode 100644 arch/riscv/kernel/vendor_extensions/Makefile create mode 100644 arch/riscv/kernel/vendor_extensions/andes.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a96a8dadf662..2b12d97eca63 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -808,6 +808,8 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS endchoice +source "arch/riscv/Kconfig.vendor" + endmenu # "Platform type" menu "Kernel features" diff --git a/arch/riscv/Kconfig.vendor b/arch/riscv/Kconfig.vendor new file mode 100644 index 000000000000..6f1cdd32ed29 --- /dev/null +++ b/arch/riscv/Kconfig.vendor @@ -0,0 +1,19 @@ +menu "Vendor extensions" + +config RISCV_ISA_VENDOR_EXT + bool + +menu "Andes" +config RISCV_ISA_VENDOR_EXT_ANDES + bool "Andes vendor extension support" + select RISCV_ISA_VENDOR_EXT + default y + help + Say N here if you want to disable all Andes vendor extension + support. This will cause any Andes vendor extensions that are + requested by hardware probing to be ignored. 
+ + If you don't know what to do here, say Y. +endmenu + +endmenu diff --git a/arch/riscv/errata/andes/errata.c b/arch/riscv/errata/andes/errata.c index f2708a9494a1..fc1a34faa5f3 100644 --- a/arch/riscv/errata/andes/errata.c +++ b/arch/riscv/errata/andes/errata.c @@ -17,6 +17,7 @@ #include #include #include +#include #define ANDES_AX45MP_MARCHID 0x8000000000008a45UL #define ANDES_AX45MP_MIMPID 0x500UL @@ -65,6 +66,8 @@ void __init_or_module andes_errata_patch_func(struct alt_entry *begin, struct al unsigned long archid, unsigned long impid, unsigned int stage) { + BUILD_BUG_ON(ERRATA_ANDES_NUMBER >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE); + if (stage == RISCV_ALTERNATIVES_BOOT) errata_probe_iocp(stage, archid, impid); diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c index 716cfedad3a2..cea3b96ade11 100644 --- a/arch/riscv/errata/sifive/errata.c +++ b/arch/riscv/errata/sifive/errata.c @@ -12,6 +12,7 @@ #include #include #include +#include struct errata_info_t { char name[32]; @@ -96,6 +97,8 @@ void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, u32 cpu_apply_errata = 0; u32 tmp; + BUILD_BUG_ON(ERRATA_SIFIVE_NUMBER >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE); + if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) return; diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index bf6a0a6318ee..f5120e07c318 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -18,6 +18,7 @@ #include #include #include +#include #define CSR_TH_SXSTATUS 0x5c0 #define SXSTATUS_MAEE _AC(0x200000, UL) @@ -166,6 +167,8 @@ void thead_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, u32 tmp; void *oldptr, *altptr; + BUILD_BUG_ON(ERRATA_THEAD_NUMBER >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE); + for (alt = begin; alt < end; alt++) { if (alt->vendor_id != THEAD_VENDOR_ID) continue; diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 000796c2d0b1..bfe7c0b881e9 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -33,6 +33,31 @@ extern struct riscv_isainfo hart_isa[NR_CPUS]; void riscv_user_isa_enable(void); +#define _RISCV_ISA_EXT_DATA(_name, _id, _subset_exts, _subset_exts_size, _validate) { \ + .name = #_name, \ + .property = #_name, \ + .id = _id, \ + .subset_ext_ids = _subset_exts, \ + .subset_ext_size = _subset_exts_size, \ + .validate = _validate \ +} + +#define __RISCV_ISA_EXT_DATA(_name, _id) _RISCV_ISA_EXT_DATA(_name, _id, NULL, 0, NULL) + +#define __RISCV_ISA_EXT_DATA_VALIDATE(_name, _id, _validate) \ + _RISCV_ISA_EXT_DATA(_name, _id, NULL, 0, _validate) + +/* Used to declare pure "lasso" extension (Zk for instance) */ +#define __RISCV_ISA_EXT_BUNDLE(_name, _bundled_exts) \ + _RISCV_ISA_EXT_DATA(_name, RISCV_ISA_EXT_INVALID, _bundled_exts, \ + ARRAY_SIZE(_bundled_exts), NULL) + +/* Used to declare extensions that are a superset of other extensions (Zvbb for instance) */ +#define __RISCV_ISA_EXT_SUPERSET(_name, _id, _sub_exts) \ + _RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), NULL) +#define __RISCV_ISA_EXT_SUPERSET_VALIDATE(_name, _id, _sub_exts, _validate) \ + _RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), _validate) + #if defined(CONFIG_RISCV_MISALIGNED) bool check_unaligned_access_emulated_all_cpus(void); void unaligned_emulation_finish(void); diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index b18b202ca141..5a0bd27fd11a 100644 --- 
a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -80,19 +80,18 @@ #define RISCV_ISA_EXT_ZFA 71 #define RISCV_ISA_EXT_ZTSO 72 #define RISCV_ISA_EXT_ZACAS 73 -#define RISCV_ISA_EXT_XANDESPMU 74 -#define RISCV_ISA_EXT_ZVE32X 75 -#define RISCV_ISA_EXT_ZVE32F 76 -#define RISCV_ISA_EXT_ZVE64X 77 -#define RISCV_ISA_EXT_ZVE64F 78 -#define RISCV_ISA_EXT_ZVE64D 79 -#define RISCV_ISA_EXT_ZIMOP 80 -#define RISCV_ISA_EXT_ZCA 81 -#define RISCV_ISA_EXT_ZCB 82 -#define RISCV_ISA_EXT_ZCD 83 -#define RISCV_ISA_EXT_ZCF 84 -#define RISCV_ISA_EXT_ZCMOP 85 -#define RISCV_ISA_EXT_ZAWRS 86 +#define RISCV_ISA_EXT_ZVE32X 74 +#define RISCV_ISA_EXT_ZVE32F 75 +#define RISCV_ISA_EXT_ZVE64X 76 +#define RISCV_ISA_EXT_ZVE64F 77 +#define RISCV_ISA_EXT_ZVE64D 78 +#define RISCV_ISA_EXT_ZIMOP 79 +#define RISCV_ISA_EXT_ZCA 80 +#define RISCV_ISA_EXT_ZCB 81 +#define RISCV_ISA_EXT_ZCD 82 +#define RISCV_ISA_EXT_ZCF 83 +#define RISCV_ISA_EXT_ZCMOP 84 +#define RISCV_ISA_EXT_ZAWRS 85 #define RISCV_ISA_EXT_XLINUXENVCFG 127 diff --git a/arch/riscv/include/asm/vendor_extensions.h b/arch/riscv/include/asm/vendor_extensions.h new file mode 100644 index 000000000000..5fca550fc1f6 --- /dev/null +++ b/arch/riscv/include/asm/vendor_extensions.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2024 Rivos, Inc + */ + +#ifndef _ASM_VENDOR_EXTENSIONS_H +#define _ASM_VENDOR_EXTENSIONS_H + +#include + +#include +#include + +/* + * The extension keys of each vendor must be strictly less than this value. + */ +#define RISCV_ISA_VENDOR_EXT_MAX 32 + +struct riscv_isavendorinfo { + DECLARE_BITMAP(isa, RISCV_ISA_VENDOR_EXT_MAX); +}; + +struct riscv_isa_vendor_ext_data_list { + bool is_initialized; + const size_t ext_data_count; + const struct riscv_isa_ext_data *ext_data; + struct riscv_isavendorinfo per_hart_isa_bitmap[NR_CPUS]; + struct riscv_isavendorinfo all_harts_isa_bitmap; +}; + +extern struct riscv_isa_vendor_ext_data_list *riscv_isa_vendor_ext_list[]; + +extern const size_t riscv_isa_vendor_ext_list_size; + +/* + * The alternatives need some way of distinguishing between vendor extensions + * and errata. Incrementing all of the vendor extension keys so they are at + * least 0x8000 accomplishes that. + */ +#define RISCV_VENDOR_EXT_ALTERNATIVES_BASE 0x8000 + +#define VENDOR_EXT_ALL_CPUS -1 + +bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsigned int bit); +#define riscv_isa_vendor_extension_available(vendor, ext) \ + __riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, \ + RISCV_ISA_VENDOR_EXT_##ext) + +#endif /* _ASM_VENDOR_EXTENSIONS_H */ diff --git a/arch/riscv/include/asm/vendor_extensions/andes.h b/arch/riscv/include/asm/vendor_extensions/andes.h new file mode 100644 index 000000000000..7bb2fc43438f --- /dev/null +++ b/arch/riscv/include/asm/vendor_extensions/andes.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_ANDES_H +#define _ASM_RISCV_VENDOR_EXTENSIONS_ANDES_H + +#include + +#include + +#define RISCV_ISA_VENDOR_EXT_XANDESPMU 0 + +/* + * Extension keys should be strictly less than max. + * It is safe to increment this when necessary. 
+ */ +#define RISCV_ISA_VENDOR_EXT_MAX_ANDES 32 + +extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_andes; + +#endif diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 5b243d46f4b1..b0aea202273d 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -58,6 +58,8 @@ obj-y += riscv_ksyms.o obj-y += stacktrace.o obj-y += cacheinfo.o obj-y += patch.o +obj-y += vendor_extensions.o +obj-y += vendor_extensions/ obj-y += probes/ obj-y += tests/ obj-$(CONFIG_MMU) += vdso.o vdso/ diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 0366dc3baf33..8f20607adb40 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -24,6 +24,7 @@ #include #include #include +#include #define NUM_ALPHA_EXTS ('z' - 'a' + 1) @@ -100,31 +101,6 @@ static int riscv_ext_zicboz_validate(const struct riscv_isa_ext_data *data, return 0; } -#define _RISCV_ISA_EXT_DATA(_name, _id, _subset_exts, _subset_exts_size, _validate) { \ - .name = #_name, \ - .property = #_name, \ - .id = _id, \ - .subset_ext_ids = _subset_exts, \ - .subset_ext_size = _subset_exts_size, \ - .validate = _validate \ -} - -#define __RISCV_ISA_EXT_DATA(_name, _id) _RISCV_ISA_EXT_DATA(_name, _id, NULL, 0, NULL) - -#define __RISCV_ISA_EXT_DATA_VALIDATE(_name, _id, _validate) \ - _RISCV_ISA_EXT_DATA(_name, _id, NULL, 0, _validate) - -/* Used to declare pure "lasso" extension (Zk for instance) */ -#define __RISCV_ISA_EXT_BUNDLE(_name, _bundled_exts) \ - _RISCV_ISA_EXT_DATA(_name, RISCV_ISA_EXT_INVALID, _bundled_exts, \ - ARRAY_SIZE(_bundled_exts), NULL) - -/* Used to declare extensions that are a superset of other extensions (Zvbb for instance) */ -#define __RISCV_ISA_EXT_SUPERSET(_name, _id, _sub_exts) \ - _RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), NULL) -#define __RISCV_ISA_EXT_SUPERSET_VALIDATE(_name, _id, _sub_exts, _validate) \ - _RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), _validate) - static int riscv_ext_zca_depends(const struct riscv_isa_ext_data *data, const unsigned long *isa_bitmap) { @@ -405,7 +381,6 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL), __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT), __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT), - __RISCV_ISA_EXT_DATA(xandespmu, RISCV_ISA_EXT_XANDESPMU), }; const size_t riscv_isa_ext_count = ARRAY_SIZE(riscv_isa_ext); @@ -512,6 +487,21 @@ static void __init riscv_parse_isa_string(const char *isa, unsigned long *bitmap bool ext_err = false; switch (*ext) { + case 'x': + case 'X': + if (acpi_disabled) + pr_warn_once("Vendor extensions are ignored in riscv,isa. Use riscv,isa-extensions instead."); + /* + * To skip an extension, we find its end. + * As multi-letter extensions must be split from other multi-letter + * extensions with an "_", the end of a multi-letter extension will + * either be the null character or the "_" at the start of the next + * multi-letter extension. + */ + for (; *isa && *isa != '_'; ++isa) + ; + ext_err = true; + break; case 's': /* * Workaround for invalid single-letter 's' & 'u' (QEMU). 
@@ -527,8 +517,6 @@ static void __init riscv_parse_isa_string(const char *isa, unsigned long *bitmap } fallthrough; case 'S': - case 'x': - case 'X': case 'z': case 'Z': /* @@ -728,6 +716,61 @@ static void __init riscv_fill_hwcap_from_isa_string(unsigned long *isa2hwcap) acpi_put_table((struct acpi_table_header *)rhct); } +static void __init riscv_fill_cpu_vendor_ext(struct device_node *cpu_node, int cpu) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT)) + return; + + for (int i = 0; i < riscv_isa_vendor_ext_list_size; i++) { + struct riscv_isa_vendor_ext_data_list *ext_list = riscv_isa_vendor_ext_list[i]; + + for (int j = 0; j < ext_list->ext_data_count; j++) { + const struct riscv_isa_ext_data ext = ext_list->ext_data[j]; + struct riscv_isavendorinfo *isavendorinfo = &ext_list->per_hart_isa_bitmap[cpu]; + + if (of_property_match_string(cpu_node, "riscv,isa-extensions", + ext.property) < 0) + continue; + + /* + * Assume that subset extensions are all members of the + * same vendor. + */ + if (ext.subset_ext_size) + for (int k = 0; k < ext.subset_ext_size; k++) + set_bit(ext.subset_ext_ids[k], isavendorinfo->isa); + + set_bit(ext.id, isavendorinfo->isa); + } + } +} + +/* + * Populate all_harts_isa_bitmap for each vendor with all of the extensions that + * are shared across CPUs for that vendor. + */ +static void __init riscv_fill_vendor_ext_list(int cpu) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT)) + return; + + for (int i = 0; i < riscv_isa_vendor_ext_list_size; i++) { + struct riscv_isa_vendor_ext_data_list *ext_list = riscv_isa_vendor_ext_list[i]; + + if (!ext_list->is_initialized) { + bitmap_copy(ext_list->all_harts_isa_bitmap.isa, + ext_list->per_hart_isa_bitmap[cpu].isa, + RISCV_ISA_VENDOR_EXT_MAX); + ext_list->is_initialized = true; + } else { + bitmap_and(ext_list->all_harts_isa_bitmap.isa, + ext_list->all_harts_isa_bitmap.isa, + ext_list->per_hart_isa_bitmap[cpu].isa, + RISCV_ISA_VENDOR_EXT_MAX); + } + } +} + static int __init riscv_fill_hwcap_from_ext_list(unsigned long *isa2hwcap) { unsigned int cpu; @@ -760,6 +803,7 @@ static int __init riscv_fill_hwcap_from_ext_list(unsigned long *isa2hwcap) } riscv_resolve_isa(source_isa, isainfo->isa, &this_hwcap, isa2hwcap); + riscv_fill_cpu_vendor_ext(cpu_node, cpu); of_node_put(cpu_node); @@ -776,6 +820,8 @@ static int __init riscv_fill_hwcap_from_ext_list(unsigned long *isa2hwcap) bitmap_copy(riscv_isa, isainfo->isa, RISCV_ISA_EXT_MAX); else bitmap_and(riscv_isa, riscv_isa, isainfo->isa, RISCV_ISA_EXT_MAX); + + riscv_fill_vendor_ext_list(cpu); } if (bitmap_empty(riscv_isa, RISCV_ISA_EXT_MAX)) @@ -918,28 +964,45 @@ void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin, { struct alt_entry *alt; void *oldptr, *altptr; - u16 id, value; + u16 id, value, vendor; if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) return; for (alt = begin; alt < end; alt++) { - if (alt->vendor_id != 0) - continue; - id = PATCH_ID_CPUFEATURE_ID(alt->patch_id); + vendor = PATCH_ID_CPUFEATURE_ID(alt->vendor_id); - if (id >= RISCV_ISA_EXT_MAX) { - WARN(1, "This extension id:%d is not in ISA extension list", id); - continue; - } + /* + * Any alternative with a patch_id that is less than + * RISCV_ISA_EXT_MAX is interpreted as a standard extension. + * + * Any alternative with patch_id that is greater than or equal + * to RISCV_VENDOR_EXT_ALTERNATIVES_BASE is interpreted as a + * vendor extension. + */ + if (id < RISCV_ISA_EXT_MAX) { + /* + * This patch should be treated as errata so skip + * processing here. 
+ */ + if (alt->vendor_id != 0) + continue; - if (!__riscv_isa_extension_available(NULL, id)) - continue; + if (!__riscv_isa_extension_available(NULL, id)) + continue; - value = PATCH_ID_CPUFEATURE_VALUE(alt->patch_id); - if (!riscv_cpufeature_patch_check(id, value)) + value = PATCH_ID_CPUFEATURE_VALUE(alt->patch_id); + if (!riscv_cpufeature_patch_check(id, value)) + continue; + } else if (id >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE) { + if (!__riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, + id - RISCV_VENDOR_EXT_ALTERNATIVES_BASE)) + continue; + } else { + WARN(1, "This extension id:%d is not in ISA extension list", id); continue; + } oldptr = ALT_OLD_PTR(alt); altptr = ALT_ALT_PTR(alt); diff --git a/arch/riscv/kernel/vendor_extensions.c b/arch/riscv/kernel/vendor_extensions.c new file mode 100644 index 000000000000..b6c1e7b5d34b --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Rivos, Inc + */ + +#include +#include +#include + +#include +#include + +struct riscv_isa_vendor_ext_data_list *riscv_isa_vendor_ext_list[] = { +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES + &riscv_isa_vendor_ext_list_andes, +#endif +}; + +const size_t riscv_isa_vendor_ext_list_size = ARRAY_SIZE(riscv_isa_vendor_ext_list); + +/** + * __riscv_isa_vendor_extension_available() - Check whether given vendor + * extension is available or not. + * + * @cpu: check if extension is available on this cpu + * @vendor: vendor that the extension is a member of + * @bit: bit position of the desired extension + * Return: true or false + * + * NOTE: When cpu is -1, will check if extension is available on all cpus + */ +bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsigned int bit) +{ + struct riscv_isavendorinfo *bmap; + struct riscv_isavendorinfo *cpu_bmap; + + switch (vendor) { + #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES + case ANDES_VENDOR_ID: + bmap = &riscv_isa_vendor_ext_list_andes.all_harts_isa_bitmap; + cpu_bmap = &riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap[cpu]; + break; + #endif + default: + return false; + } + + if (cpu != -1) + bmap = &cpu_bmap[cpu]; + + if (bit >= RISCV_ISA_VENDOR_EXT_MAX) + return false; + + return test_bit(bit, bmap->isa) ? 
true : false; +} +EXPORT_SYMBOL_GPL(__riscv_isa_vendor_extension_available); diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile new file mode 100644 index 000000000000..6a61aed944f1 --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES) += andes.o diff --git a/arch/riscv/kernel/vendor_extensions/andes.c b/arch/riscv/kernel/vendor_extensions/andes.c new file mode 100644 index 000000000000..ec688c88456a --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/andes.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +#include +#include + +/* All Andes vendor extensions supported in Linux */ +const struct riscv_isa_ext_data riscv_isa_vendor_ext_andes[] = { + __RISCV_ISA_EXT_DATA(xandespmu, RISCV_ISA_VENDOR_EXT_XANDESPMU), +}; + +struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_andes = { + .ext_data_count = ARRAY_SIZE(riscv_isa_vendor_ext_andes), + .ext_data = riscv_isa_vendor_ext_andes, +}; diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index a2e4005e1fd0..02719e0c6368 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #define ALT_SBI_PMU_OVERFLOW(__ovl) \ asm volatile(ALTERNATIVE_2( \ @@ -32,7 +34,8 @@ asm volatile(ALTERNATIVE_2( \ THEAD_VENDOR_ID, ERRATA_THEAD_PMU, \ CONFIG_ERRATA_THEAD_PMU, \ "csrr %0, " __stringify(ANDES_CSR_SCOUNTEROF), \ - 0, RISCV_ISA_EXT_XANDESPMU, \ + ANDES_VENDOR_ID, \ + RISCV_ISA_VENDOR_EXT_XANDESPMU + RISCV_VENDOR_EXT_ALTERNATIVES_BASE, \ CONFIG_ANDES_CUSTOM_PMU) \ : "=r" (__ovl) : \ : "memory") @@ -41,7 +44,8 @@ asm volatile(ALTERNATIVE_2( \ asm volatile(ALTERNATIVE( \ "csrc " __stringify(CSR_IP) ", %0\n\t", \ "csrc " __stringify(ANDES_CSR_SLIP) ", %0\n\t", \ - 0, RISCV_ISA_EXT_XANDESPMU, \ + ANDES_VENDOR_ID, \ + RISCV_ISA_VENDOR_EXT_XANDESPMU + RISCV_VENDOR_EXT_ALTERNATIVES_BASE, \ CONFIG_ANDES_CUSTOM_PMU) \ : : "r"(__irq_mask) \ : "memory") @@ -1060,7 +1064,7 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde riscv_cached_mimpid(0) == 0) { riscv_pmu_irq_num = THEAD_C9XX_RV_IRQ_PMU; riscv_pmu_use_irq = true; - } else if (riscv_isa_extension_available(NULL, XANDESPMU) && + } else if (riscv_isa_vendor_extension_available(ANDES_VENDOR_ID, XANDESPMU) && IS_ENABLED(CONFIG_ANDES_CUSTOM_PMU)) { riscv_pmu_irq_num = ANDES_SLI_CAUSE_BASE + ANDES_RV_IRQ_PMOVI; riscv_pmu_use_irq = true; -- cgit From 9448d9accdd8f1483df6fe6692e3c50cfe507d7a Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Fri, 19 Jul 2024 09:15:19 -0700 Subject: riscv: Add vendor extensions to /proc/cpuinfo All of the supported vendor extensions that have been listed in riscv_isa_vendor_ext_list can be exported through /proc/cpuinfo. 
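[Editor's note: a hedged illustration only; the base ISA string below is hypothetical. On a hart whose devicetree advertises xandespmu, the vendor extension is appended to the existing isa lines with a leading underscore, roughly like:]

    isa             : rv64imafdc_zicsr_zifencei_xandespmu
    hart isa        : rv64imafdc_zicsr_zifencei_xandespmu
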
Signed-off-by: Charlie Jenkins Reviewed-by: Evan Green Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240719-support_vendor_extensions-v3-2-0af7587bbec0@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index c1f3655238fd..f6b13e9f5e6c 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -16,6 +16,7 @@ #include #include #include +#include bool arch_match_cpu_phys_id(int cpu, u64 phys_id) { @@ -235,7 +236,33 @@ arch_initcall(riscv_cpuinfo_init); #ifdef CONFIG_PROC_FS -static void print_isa(struct seq_file *f, const unsigned long *isa_bitmap) +#define ALL_CPUS -1 + +static void print_vendor_isa(struct seq_file *f, int cpu) +{ + struct riscv_isavendorinfo *vendor_bitmap; + struct riscv_isa_vendor_ext_data_list *ext_list; + const struct riscv_isa_ext_data *ext_data; + + for (int i = 0; i < riscv_isa_vendor_ext_list_size; i++) { + ext_list = riscv_isa_vendor_ext_list[i]; + ext_data = riscv_isa_vendor_ext_list[i]->ext_data; + + if (cpu == ALL_CPUS) + vendor_bitmap = &ext_list->all_harts_isa_bitmap; + else + vendor_bitmap = &ext_list->per_hart_isa_bitmap[cpu]; + + for (int j = 0; j < ext_list->ext_data_count; j++) { + if (!__riscv_isa_extension_available(vendor_bitmap->isa, ext_data[j].id)) + continue; + + seq_printf(f, "_%s", ext_data[j].name); + } + } +} + +static void print_isa(struct seq_file *f, const unsigned long *isa_bitmap, int cpu) { if (IS_ENABLED(CONFIG_32BIT)) @@ -254,6 +281,8 @@ static void print_isa(struct seq_file *f, const unsigned long *isa_bitmap) seq_printf(f, "%s", riscv_isa_ext[i].name); } + print_vendor_isa(f, cpu); + seq_puts(f, "\n"); } @@ -316,7 +345,7 @@ static int c_show(struct seq_file *m, void *v) * line. */ seq_puts(m, "isa\t\t: "); - print_isa(m, NULL); + print_isa(m, NULL, ALL_CPUS); print_mmu(m); if (acpi_disabled) { @@ -338,7 +367,7 @@ static int c_show(struct seq_file *m, void *v) * additional extensions not present across all harts. */ seq_puts(m, "hart isa\t: "); - print_isa(m, hart_isa[cpu_id].isa); + print_isa(m, hart_isa[cpu_id].isa, cpu_id); seq_puts(m, "\n"); return 0; -- cgit From 0f2425411101128a38ad0166268975ca11fbcaeb Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Fri, 19 Jul 2024 09:15:20 -0700 Subject: riscv: Introduce vendor variants of extension helpers Vendor extensions are maintained in per-vendor structs (separate from standard extensions which live in riscv_isa). Create vendor variants for the existing extension helpers to interface with the riscv_isa_vendor bitmaps. 
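[Editor's note: a short usage sketch mirroring the riscv_pmu_sbi.c hunk below; header names are those introduced by this series. Callers name the vendor ID and the vendor-relative extension number, and the helper falls back to the per-vendor bitmaps when alternatives are not patched in.]

    #include <asm/vendor_extensions.h>
    #include <asm/vendor_extensions/andes.h>

    if (riscv_has_vendor_extension_unlikely(ANDES_VENDOR_ID,
                                            RISCV_ISA_VENDOR_EXT_XANDESPMU)) {
            /* the Andes PMU facilities are available on every hart */
    }
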
Signed-off-by: Charlie Jenkins Reviewed-by: Conor Dooley Reviewed-by: Andy Chiu Link: https://lore.kernel.org/r/20240719-support_vendor_extensions-v3-3-0af7587bbec0@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/vendor_extensions.h | 83 ++++++++++++++++++++++++++++++ drivers/perf/riscv_pmu_sbi.c | 3 +- 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/vendor_extensions.h b/arch/riscv/include/asm/vendor_extensions.h index 5fca550fc1f6..04d72b02ae6b 100644 --- a/arch/riscv/include/asm/vendor_extensions.h +++ b/arch/riscv/include/asm/vendor_extensions.h @@ -42,8 +42,91 @@ extern const size_t riscv_isa_vendor_ext_list_size; #define VENDOR_EXT_ALL_CPUS -1 bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsigned int bit); +#define riscv_cpu_isa_vendor_extension_available(cpu, vendor, ext) \ + __riscv_isa_vendor_extension_available(cpu, vendor, RISCV_ISA_VENDOR_EXT_##ext) #define riscv_isa_vendor_extension_available(vendor, ext) \ __riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, \ RISCV_ISA_VENDOR_EXT_##ext) +static __always_inline bool __riscv_has_extension_likely(const unsigned long vendor, + const unsigned long ext) +{ + asm goto(ALTERNATIVE("j %l[l_no]", "nop", %[vendor], %[ext], 1) + : + : [vendor] "i" (vendor), [ext] "i" (ext) + : + : l_no); + + return true; +l_no: + return false; +} + +static __always_inline bool __riscv_has_extension_unlikely(const unsigned long vendor, + const unsigned long ext) +{ + asm goto(ALTERNATIVE("nop", "j %l[l_yes]", %[vendor], %[ext], 1) + : + : [vendor] "i" (vendor), [ext] "i" (ext) + : + : l_yes); + + return false; +l_yes: + return true; +} + +static __always_inline bool riscv_has_vendor_extension_likely(const unsigned long vendor, + const unsigned long ext) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT)) + return false; + + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) + return __riscv_has_extension_likely(vendor, + ext + RISCV_VENDOR_EXT_ALTERNATIVES_BASE); + + return __riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, ext); +} + +static __always_inline bool riscv_has_vendor_extension_unlikely(const unsigned long vendor, + const unsigned long ext) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT)) + return false; + + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) + return __riscv_has_extension_unlikely(vendor, + ext + RISCV_VENDOR_EXT_ALTERNATIVES_BASE); + + return __riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, ext); +} + +static __always_inline bool riscv_cpu_has_vendor_extension_likely(const unsigned long vendor, + int cpu, const unsigned long ext) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT)) + return false; + + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) && + __riscv_has_extension_likely(vendor, ext + RISCV_VENDOR_EXT_ALTERNATIVES_BASE)) + return true; + + return __riscv_isa_vendor_extension_available(cpu, vendor, ext); +} + +static __always_inline bool riscv_cpu_has_vendor_extension_unlikely(const unsigned long vendor, + int cpu, + const unsigned long ext) +{ + if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT)) + return false; + + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) && + __riscv_has_extension_unlikely(vendor, ext + RISCV_VENDOR_EXT_ALTERNATIVES_BASE)) + return true; + + return __riscv_isa_vendor_extension_available(cpu, vendor, ext); +} + #endif /* _ASM_VENDOR_EXTENSIONS_H */ diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 02719e0c6368..e1f0edfb7dcc 100644 --- 
a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -1064,7 +1064,8 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde riscv_cached_mimpid(0) == 0) { riscv_pmu_irq_num = THEAD_C9XX_RV_IRQ_PMU; riscv_pmu_use_irq = true; - } else if (riscv_isa_vendor_extension_available(ANDES_VENDOR_ID, XANDESPMU) && + } else if (riscv_has_vendor_extension_unlikely(ANDES_VENDOR_ID, + RISCV_ISA_VENDOR_EXT_XANDESPMU) && IS_ENABLED(CONFIG_ANDES_CUSTOM_PMU)) { riscv_pmu_irq_num = ANDES_SLI_CAUSE_BASE + ANDES_RV_IRQ_PMOVI; riscv_pmu_use_irq = true; -- cgit From d4c8d79f5199055da38f880f782a3e62c599ff5d Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Fri, 19 Jul 2024 09:15:21 -0700 Subject: riscv: cpufeature: Extract common elements from extension checking The __riscv_has_extension_likely() and __riscv_has_extension_unlikely() functions from the vendor_extensions.h can be used to simplify the standard extension checking code as well. Migrate those functions to cpufeature.h and reorganize the code in the file to use the functions. Signed-off-by: Charlie Jenkins Reviewed-by: Conor Dooley Reviewed-by: Andy Chiu Link: https://lore.kernel.org/r/20240719-support_vendor_extensions-v3-4-0af7587bbec0@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpufeature.h | 78 +++++++++++++++++------------- arch/riscv/include/asm/vendor_extensions.h | 28 ----------- 2 files changed, 44 insertions(+), 62 deletions(-) diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index bfe7c0b881e9..45f9c1171a48 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -104,59 +104,66 @@ extern bool riscv_isa_fallback; unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap); +#define STANDARD_EXT 0 + bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, unsigned int bit); #define riscv_isa_extension_available(isa_bitmap, ext) \ __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_##ext) -static __always_inline bool -riscv_has_extension_likely(const unsigned long ext) +static __always_inline bool __riscv_has_extension_likely(const unsigned long vendor, + const unsigned long ext) { - compiletime_assert(ext < RISCV_ISA_EXT_MAX, - "ext must be < RISCV_ISA_EXT_MAX"); - - if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { - asm goto( - ALTERNATIVE("j %l[l_no]", "nop", 0, %[ext], 1) - : - : [ext] "i" (ext) - : - : l_no); - } else { - if (!__riscv_isa_extension_available(NULL, ext)) - goto l_no; - } + asm goto(ALTERNATIVE("j %l[l_no]", "nop", %[vendor], %[ext], 1) + : + : [vendor] "i" (vendor), [ext] "i" (ext) + : + : l_no); return true; l_no: return false; } -static __always_inline bool -riscv_has_extension_unlikely(const unsigned long ext) +static __always_inline bool __riscv_has_extension_unlikely(const unsigned long vendor, + const unsigned long ext) { - compiletime_assert(ext < RISCV_ISA_EXT_MAX, - "ext must be < RISCV_ISA_EXT_MAX"); - - if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { - asm goto( - ALTERNATIVE("nop", "j %l[l_yes]", 0, %[ext], 1) - : - : [ext] "i" (ext) - : - : l_yes); - } else { - if (__riscv_isa_extension_available(NULL, ext)) - goto l_yes; - } + asm goto(ALTERNATIVE("nop", "j %l[l_yes]", %[vendor], %[ext], 1) + : + : [vendor] "i" (vendor), [ext] "i" (ext) + : + : l_yes); return false; l_yes: return true; } +static __always_inline bool riscv_has_extension_unlikely(const unsigned long ext) +{ + compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < 
RISCV_ISA_EXT_MAX"); + + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) + return __riscv_has_extension_unlikely(STANDARD_EXT, ext); + + return __riscv_isa_extension_available(NULL, ext); +} + +static __always_inline bool riscv_has_extension_likely(const unsigned long ext) +{ + compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX"); + + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) + return __riscv_has_extension_likely(STANDARD_EXT, ext); + + return __riscv_isa_extension_available(NULL, ext); +} + static __always_inline bool riscv_cpu_has_extension_likely(int cpu, const unsigned long ext) { - if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) && riscv_has_extension_likely(ext)) + compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX"); + + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) && + __riscv_has_extension_likely(STANDARD_EXT, ext)) return true; return __riscv_isa_extension_available(hart_isa[cpu].isa, ext); @@ -164,7 +171,10 @@ static __always_inline bool riscv_cpu_has_extension_likely(int cpu, const unsign static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsigned long ext) { - if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) && riscv_has_extension_unlikely(ext)) + compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX"); + + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) && + __riscv_has_extension_unlikely(STANDARD_EXT, ext)) return true; return __riscv_isa_extension_available(hart_isa[cpu].isa, ext); diff --git a/arch/riscv/include/asm/vendor_extensions.h b/arch/riscv/include/asm/vendor_extensions.h index 04d72b02ae6b..7437304a71b9 100644 --- a/arch/riscv/include/asm/vendor_extensions.h +++ b/arch/riscv/include/asm/vendor_extensions.h @@ -48,34 +48,6 @@ bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsig __riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, \ RISCV_ISA_VENDOR_EXT_##ext) -static __always_inline bool __riscv_has_extension_likely(const unsigned long vendor, - const unsigned long ext) -{ - asm goto(ALTERNATIVE("j %l[l_no]", "nop", %[vendor], %[ext], 1) - : - : [vendor] "i" (vendor), [ext] "i" (ext) - : - : l_no); - - return true; -l_no: - return false; -} - -static __always_inline bool __riscv_has_extension_unlikely(const unsigned long vendor, - const unsigned long ext) -{ - asm goto(ALTERNATIVE("nop", "j %l[l_yes]", %[vendor], %[ext], 1) - : - : [vendor] "i" (vendor), [ext] "i" (ext) - : - : l_yes); - - return false; -l_yes: - return true; -} - static __always_inline bool riscv_has_vendor_extension_likely(const unsigned long vendor, const unsigned long ext) { -- cgit From 65d284a38c0ded391ae04a29640c34221c934cd2 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 6 Jun 2024 18:41:57 +0200 Subject: ceph: use cap_wait_list only if debugfs is enabled Only debugfs uses this list. By omitting it, we save some memory and reduce lock contention on `caps_list_lock`. 
Signed-off-by: Max Kellermann Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 6 ++++++ fs/ceph/mds_client.c | 2 ++ fs/ceph/mds_client.h | 6 ++++++ 3 files changed, 14 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c4941ba245ac..e98aa8219303 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3067,10 +3067,13 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, flags, &_got); WARN_ON_ONCE(ret == -EAGAIN); if (!ret) { +#ifdef CONFIG_DEBUG_FS struct ceph_mds_client *mdsc = fsc->mdsc; struct cap_wait cw; +#endif DEFINE_WAIT_FUNC(wait, woken_wake_function); +#ifdef CONFIG_DEBUG_FS cw.ino = ceph_ino(inode); cw.tgid = current->tgid; cw.need = need; @@ -3079,6 +3082,7 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, spin_lock(&mdsc->caps_list_lock); list_add(&cw.list, &mdsc->cap_wait_list); spin_unlock(&mdsc->caps_list_lock); +#endif /* make sure used fmode not timeout */ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); @@ -3097,9 +3101,11 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, remove_wait_queue(&ci->i_cap_wq, &wait); ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); +#ifdef CONFIG_DEBUG_FS spin_lock(&mdsc->caps_list_lock); list_del(&cw.list); spin_unlock(&mdsc->caps_list_lock); +#endif if (ret == -EAGAIN) continue; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c2157f6e0c69..62238f3e6e19 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5505,7 +5505,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; INIT_LIST_HEAD(&mdsc->cap_delay_list); +#ifdef CONFIG_DEBUG_FS INIT_LIST_HEAD(&mdsc->cap_wait_list); +#endif spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); INIT_LIST_HEAD(&mdsc->snap_flush_list); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index cfa18cf915a0..9bcc7f181bfe 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -416,6 +416,8 @@ struct ceph_quotarealm_inode { struct inode *inode; }; +#ifdef CONFIG_DEBUG_FS + struct cap_wait { struct list_head list; u64 ino; @@ -424,6 +426,8 @@ struct cap_wait { int want; }; +#endif + enum { CEPH_MDSC_STOPPING_BEGIN = 1, CEPH_MDSC_STOPPING_FLUSHING = 2, @@ -512,7 +516,9 @@ struct ceph_mds_client { spinlock_t caps_list_lock; struct list_head caps_list; /* unused (reserved or unreserved) */ +#ifdef CONFIG_DEBUG_FS struct list_head cap_wait_list; +#endif int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ int caps_use_max; /* max used caps */ -- cgit From 77bb4a501a7756e6f5428b72fb0e420deb7ae562 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 9 Jul 2024 14:44:00 +0800 Subject: ceph: convert comma to semicolon in __ceph_dentry_dir_lease_touch() Replace a comma between expression statements by a semicolon. 
Signed-off-by: Chen Ni Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 82a2e2a06a65..e4e7a856e9c2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1589,7 +1589,7 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di) } spin_lock(&mdsc->dentry_list_lock); - __dentry_dir_lease_touch(mdsc, di), + __dentry_dir_lease_touch(mdsc, di); spin_unlock(&mdsc->dentry_list_lock); } -- cgit From 578eb54c4a16713d99eed736660e30ae6eb1877f Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 10 Jul 2024 20:16:54 +0800 Subject: ceph: periodically flush the cap releases In some corner cases the MDS could wait indefinitely for the cap releases and then report a stuck caps-revoke warning. To fix this, periodically flush the cap releases. Link: https://tracker.ceph.com/issues/57244 Signed-off-by: Xiubo Li Reviewed-by: Venky Shankar Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 62238f3e6e19..276e34ab3e2c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5446,6 +5446,8 @@ static void delayed_work(struct work_struct *work) } mutex_unlock(&mdsc->mutex); + ceph_flush_cap_releases(mdsc, s); + mutex_lock(&s->s_mutex); if (renew_caps) send_renew_caps(mdsc, s); -- cgit From 03230edb0bd831662a7c08b6fef66b2a9a817774 Mon Sep 17 00:00:00 2001 From: ethanwu Date: Thu, 11 Jul 2024 14:47:56 +0800 Subject: ceph: fix incorrect kmalloc size of pagevec mempool The kmalloc size of the pagevec mempool is calculated incorrectly: it accounts only for the number of array elements and omits the size of each page pointer. Fixes: a0102bda5bc0 ("ceph: move sb->wb_pagevec_pool to be a global mempool") Signed-off-by: ethanwu Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 885cb5d4e771..0cdf84cd1791 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -961,7 +961,8 @@ static int __init init_caches(void) if (!ceph_mds_request_cachep) goto bad_mds_req; - ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT); + ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, + (CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT) * sizeof(struct page *)); if (!ceph_wb_pagevec_pool) goto bad_pagevec_pool; -- cgit From 65564157ae64cec0f527583f96e32f484f730f92 Mon Sep 17 00:00:00 2001 From: Nitin Gote Date: Thu, 11 Jul 2024 22:02:08 +0530 Subject: drm/i915/gt: Do not consider preemption during execlists_dequeue for gen8 We're seeing a GPU hang issue on a CHV platform, which was caused by commit bac24f59f454 ("drm/i915/execlists: Enable coarse preemption boundaries for Gen8"). The Gen8 platform only supports timeslicing and doesn't have a preemption mechanism, as its engines do not have a preemption timer. Commit 751f82b353a6 ("drm/i915/gt: Only disable preemption on Gen8 render engines") addressed this issue only for render engines. This patch extends that fix by ensuring that preemption is not considered for any engine on Gen8 platforms. v4: - Use the correct Fixes tag (Rodrigo Vivi) - Reworded commit log (Andi Shyti) v3: - Inside need_preempt(), the condition on can_preempt() is not required, as the simplified can_preempt() is enough.
(Chris Wilson) v2: Simplify can_preempt() function (Tvrtko Ursulin) Fixes: 751f82b353a6 ("drm/i915/gt: Only disable preemption on gen8 render engines") Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11396 Suggested-by: Andi Shyti Signed-off-by: Nitin Gote Cc: Chris Wilson CC: # v5.12+ Reviewed-by: Jonathan Cavitt Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240711163208.1355736-1-nitin.r.gote@intel.com (cherry picked from commit 7df0be6e6280c6fca01d039864bb123e5e36604b) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/intel_execlists_submission.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 21829439e686..72090f52fb85 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -3315,11 +3315,7 @@ static void remove_from_engine(struct i915_request *rq) static bool can_preempt(struct intel_engine_cs *engine) { - if (GRAPHICS_VER(engine->i915) > 8) - return true; - - /* GPGPU on bdw requires extra w/a; not implemented */ - return engine->class != RENDER_CLASS; + return GRAPHICS_VER(engine->i915) > 8; } static void kick_execlists(const struct i915_request *rq, int prio) -- cgit From 26720dd2b5a1d088bff8f7e6355fca021c83718f Mon Sep 17 00:00:00 2001 From: Jonathan Cavitt Date: Fri, 12 Jul 2024 14:41:56 -0700 Subject: drm/i915: Allow NULL memory region Prevent a NULL pointer access in intel_memory_regions_hw_probe. Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning") Reported-by: Dan Carpenter Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11704 Signed-off-by: Jonathan Cavitt Reviewed-by: Nirmoy Das Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240712214156.3969584-1-jonathan.cavitt@intel.com (cherry picked from commit d75dec1fcbcb05b021c08b62551649567ab8955c) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/intel_memory_region.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 172dfa7c3588..d40ee1b42110 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private *i915) goto out_cleanup; } - mem->id = i; - i915->mm.regions[i] = mem; + if (mem) { /* Skip on non-fatal errors */ + mem->id = i; + i915->mm.regions[i] = mem; + } } for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) { -- cgit From 368990a7fe30737c990f628a60d26d9854a9e690 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 15 Jul 2024 09:47:08 +0200 Subject: xen: fix multicall debug data referencing The recent adding of multicall debug mixed up the referencing of the debug data. A __percpu tagged pointer can't be initialized with a plain pointer, so use another percpu variable for the pointer and set it on each new cpu via a function. 
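A reduced sketch of the fix (illustrative names, not the actual xen symbols): a __percpu-tagged pointer may only hold the result of alloc_percpu(), so the early static buffer is instead reached through a plain per-CPU pointer that each CPU points at its dynamic area once that area exists:

	static struct dbg early_dbg __initdata;			/* used before alloc_percpu() */
	static DEFINE_PER_CPU(struct dbg *, cur_dbg) = &early_dbg;	/* plain pointer per CPU */
	static struct dbg __percpu *dbg_area;			/* filled by alloc_percpu() */

	void dbg_percpu_init(unsigned int cpu)
	{
		per_cpu(cur_dbg, cpu) = per_cpu_ptr(dbg_area, cpu);
	}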
Fixes: 942d917cb92a ("xen: make multicall debug boot time selectable") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407151106.5s7Mnfpz-lkp@intel.com/ Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- arch/x86/xen/multicalls.c | 19 ++++++++++++------- arch/x86/xen/smp_pv.c | 1 + arch/x86/xen/xen-ops.h | 3 +++ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index d4cefd8a9af4..10c660fae8b3 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -54,8 +54,9 @@ struct mc_debug_data { static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); static struct mc_debug_data mc_debug_data_early __initdata; -static struct mc_debug_data __percpu *mc_debug_data __refdata = +static DEFINE_PER_CPU(struct mc_debug_data *, mc_debug_data) = &mc_debug_data_early; +static struct mc_debug_data __percpu *mc_debug_data_ptr; DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); static struct static_key mc_debug __ro_after_init; @@ -70,16 +71,20 @@ static int __init xen_parse_mc_debug(char *arg) } early_param("xen_mc_debug", xen_parse_mc_debug); +void mc_percpu_init(unsigned int cpu) +{ + per_cpu(mc_debug_data, cpu) = per_cpu_ptr(mc_debug_data_ptr, cpu); +} + static int __init mc_debug_enable(void) { - struct mc_debug_data __percpu *mcdb; unsigned long flags; if (!mc_debug_enabled) return 0; - mcdb = alloc_percpu(struct mc_debug_data); - if (!mcdb) { + mc_debug_data_ptr = alloc_percpu(struct mc_debug_data); + if (!mc_debug_data_ptr) { pr_err("xen_mc_debug inactive\n"); static_key_slow_dec(&mc_debug); return -ENOMEM; @@ -88,7 +93,7 @@ static int __init mc_debug_enable(void) /* Be careful when switching to percpu debug data. */ local_irq_save(flags); xen_mc_flush(); - mc_debug_data = mcdb; + mc_percpu_init(0); local_irq_restore(flags); pr_info("xen_mc_debug active\n"); @@ -150,7 +155,7 @@ void xen_mc_flush(void) trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx); if (static_key_false(&mc_debug)) { - mcdb = this_cpu_ptr(mc_debug_data); + mcdb = __this_cpu_read(mc_debug_data); memcpy(mcdb->entries, b->entries, b->mcidx * sizeof(struct multicall_entry)); } @@ -230,7 +235,7 @@ struct multicall_space __xen_mc_entry(size_t args) ret.mc = &b->entries[b->mcidx]; if (static_key_false(&mc_debug)) { - struct mc_debug_data *mcdb = this_cpu_ptr(mc_debug_data); + struct mc_debug_data *mcdb = __this_cpu_read(mc_debug_data); mcdb->caller[b->mcidx] = __builtin_return_address(0); mcdb->argsz[b->mcidx] = args; diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 7ea57f728b89..6863d3da7dec 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -305,6 +305,7 @@ static int xen_pv_kick_ap(unsigned int cpu, struct task_struct *idle) return rc; xen_pmu_init(cpu); + mc_percpu_init(cpu); /* * Why is this a BUG? If the hypercall fails then everything can be diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index e7775dff9452..0cf16fc79e0b 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -257,6 +257,9 @@ void xen_mc_callback(void (*fn)(void *), void *data); */ struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size); +/* Do percpu data initialization for multicalls. */ +void mc_percpu_init(unsigned int cpu); + extern bool is_xen_pmu; irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); -- cgit From e5f98896efb3b6350cb6f1c241394966dcbcf240 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 18 Jul 2024 21:00:35 +0200 Subject: thermal: trip: Split thermal_zone_device_set_mode() Pull a wrapper around thermal zone .change_mode() callback out of thermal_zone_device_set_mode() because it will be used elsewhere subsequently. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/2206793.irdbgypaU6@rjwysocki.net --- drivers/thermal/thermal_core.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index f6e700e48aad..48bf267d090d 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -272,6 +272,22 @@ static int __init thermal_register_governors(void) return ret; } +static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz, + enum thermal_device_mode mode) +{ + if (tz->ops.change_mode) { + int ret; + + ret = tz->ops.change_mode(tz, mode); + if (ret) + return ret; + } + + tz->mode = mode; + + return 0; +} + /* * Zone update section: main control loop applied to each zone while monitoring * in polling mode. The monitoring is done using a workqueue. @@ -540,7 +556,7 @@ monitor: static int thermal_zone_device_set_mode(struct thermal_zone_device *tz, enum thermal_device_mode mode) { - int ret = 0; + int ret; mutex_lock(&tz->lock); @@ -548,14 +564,15 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz, if (mode == tz->mode) { mutex_unlock(&tz->lock); - return ret; + return 0; } - if (tz->ops.change_mode) - ret = tz->ops.change_mode(tz, mode); + ret = __thermal_zone_device_set_mode(tz, mode); + if (ret) { + mutex_unlock(&tz->lock); - if (!ret) - tz->mode = mode; + return ret; + } __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); @@ -566,7 +583,7 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz, else thermal_notify_tz_disable(tz); - return ret; + return 0; } int thermal_zone_device_enable(struct thermal_zone_device *tz) -- cgit From b8a4518b5c2d1164f9bb2e586733a658c5239adf Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 23 Jul 2024 10:41:52 +0100 Subject: drbd: Add peer_device to Kernel doc Add missing documentation of peer_device parameter to Kernel doc. These parameters were added in commit 8164dd6c8ae1 ("drbd: Add peer device parameter to whole-bitmap I/O handlers") Flagged by W=1 builds. Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20240723-drbd-doc-v1-1-a04d9b7a9688@kernel.org Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f92673f05c7a..a9e49b212341 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3422,6 +3422,7 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local) /** * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() * @device: DRBD device. + * @peer_device: Peer DRBD device. * * Sets all bits in the bitmap and writes the whole bitmap to stable storage. */ @@ -3448,6 +3449,7 @@ int drbd_bmio_set_n_write(struct drbd_device *device, /** * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() * @device: DRBD device. + * @peer_device: Peer DRBD device. * * Clears all bits in the bitmap and writes the whole bitmap to stable storage. 
*/ @@ -3501,6 +3503,7 @@ static int w_bitmap_io(struct drbd_work *w, int unused) * @done: callback to be called after the bitmap IO was performed * @why: Descriptive text of the reason for doing the IO * @flags: Bitmap flags + * @peer_device: Peer DRBD device. * * While IO on the bitmap happens we freeze application IO thus we ensure * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be @@ -3549,6 +3552,7 @@ void drbd_queue_bitmap_io(struct drbd_device *device, * @io_fn: IO callback to be called when bitmap IO is possible * @why: Descriptive text of the reason for doing the IO * @flags: Bitmap flags + * @peer_device: Peer DRBD device. * * freezes application IO while that the actual IO operations runs. This * functions MAY NOT be called from worker context. -- cgit From 57e29f4c8919af7276c35c2da0ae5efb4c36c33e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 15 Jul 2024 10:33:06 +0200 Subject: s390: Add runtime constant support Implement the runtime constant infrastructure for s390, allowing the dcache d_hash() function to be generated using as a constant for hash table address followed by shift by a constant of the hash index. This is the s390 variant of commit 94a2bc0f611c ("arm64: add 'runtime constant' support") and commit e3c92e81711d ("runtime constants: add x86 architecture support"). Signed-off-by: Heiko Carstens Acked-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/runtime-const.h | 77 +++++++++++++++++++++++++++++++++++ arch/s390/kernel/vmlinux.lds.S | 3 ++ 2 files changed, 80 insertions(+) create mode 100644 arch/s390/include/asm/runtime-const.h diff --git a/arch/s390/include/asm/runtime-const.h b/arch/s390/include/asm/runtime-const.h new file mode 100644 index 000000000000..17878b1d048c --- /dev/null +++ b/arch/s390/include/asm/runtime-const.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_RUNTIME_CONST_H +#define _ASM_S390_RUNTIME_CONST_H + +#include + +#define runtime_const_ptr(sym) \ +({ \ + typeof(sym) __ret; \ + \ + asm_inline( \ + "0: iihf %[__ret],%[c1]\n" \ + " iilf %[__ret],%[c2]\n" \ + ".pushsection runtime_ptr_" #sym ",\"a\"\n" \ + ".long 0b - .\n" \ + ".popsection" \ + : [__ret] "=d" (__ret) \ + : [c1] "i" (0x01234567UL), \ + [c2] "i" (0x89abcdefUL)); \ + __ret; \ +}) + +#define runtime_const_shift_right_32(val, sym) \ +({ \ + unsigned int __ret = (val); \ + \ + asm_inline( \ + "0: srl %[__ret],12\n" \ + ".pushsection runtime_shift_" #sym ",\"a\"\n" \ + ".long 0b - .\n" \ + ".popsection" \ + : [__ret] "+d" (__ret)); \ + __ret; \ +}) + +#define runtime_const_init(type, sym) do { \ + extern s32 __start_runtime_##type##_##sym[]; \ + extern s32 __stop_runtime_##type##_##sym[]; \ + \ + runtime_const_fixup(__runtime_fixup_##type, \ + (unsigned long)(sym), \ + __start_runtime_##type##_##sym, \ + __stop_runtime_##type##_##sym); \ +} while (0) + +/* 32-bit immediate for iihf and iilf in bits in I2 field */ +static inline void __runtime_fixup_32(u32 *p, unsigned int val) +{ + s390_kernel_write(p, &val, sizeof(val)); +} + +static inline void __runtime_fixup_ptr(void *where, unsigned long val) +{ + __runtime_fixup_32(where + 2, val >> 32); + __runtime_fixup_32(where + 8, val); +} + +/* Immediate value is lower 12 bits of D2 field of srl */ +static inline void __runtime_fixup_shift(void *where, unsigned long val) +{ + u32 insn = *(u32 *)where; + + insn &= 0xfffff000; + insn |= (val & 63); + s390_kernel_write(where, &insn, sizeof(insn)); +} + +static inline void runtime_const_fixup(void (*fn)(void 
*, unsigned long), + unsigned long val, s32 *start, s32 *end) +{ + while (start < end) { + fn(*start + (void *)start, val); + start++; + } +} + +#endif /* _ASM_S390_RUNTIME_CONST_H */ diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index a1ce3925ec71..5128ccee9c67 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -190,6 +190,9 @@ SECTIONS . = ALIGN(PAGE_SIZE); INIT_DATA_SECTION(0x100) + RUNTIME_CONST(shift, d_hash_shift) + RUNTIME_CONST(ptr, dentry_hashtable) + PERCPU_SECTION(0x100) . = ALIGN(PAGE_SIZE); -- cgit From 5fd11b96b43708f2f6e3964412c301c1bd20ec0f Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Thu, 11 Jul 2024 15:45:26 +0200 Subject: s390/pci: Refactor arch_setup_msi_irqs() Factor out adapter interrupt allocation from arch_setup_msi_irqs() in preparation for enabling registration of multiple MSIs. Code movement only, no change of functionality intended. Signed-off-by: Gerd Bayer Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_irq.c | 54 +++++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 0ef83b6ac0db..979f776b09b8 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -268,33 +268,20 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, } } -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs, + unsigned long *bit) { - struct zpci_dev *zdev = to_zpci(pdev); - unsigned int hwirq, msi_vecs, cpu; - unsigned long bit; - struct msi_desc *msi; - struct msi_msg msg; - int cpu_addr; - int rc, irq; - - zdev->aisb = -1UL; - zdev->msi_first_bit = -1U; - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; - msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); - if (irq_delivery == DIRECTED) { /* Allocate cpu vector bits */ - bit = airq_iv_alloc(zpci_ibv[0], msi_vecs); - if (bit == -1UL) + *bit = airq_iv_alloc(zpci_ibv[0], msi_vecs); + if (*bit == -1UL) return -EIO; } else { /* Allocate adapter summary indicator bit */ - bit = airq_iv_alloc_bit(zpci_sbv); - if (bit == -1UL) + *bit = airq_iv_alloc_bit(zpci_sbv); + if (*bit == -1UL) return -EIO; - zdev->aisb = bit; + zdev->aisb = *bit; /* Create adapter interrupt vector */ zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL); @@ -302,10 +289,33 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) return -ENOMEM; /* Wire up shortcut pointer */ - zpci_ibv[bit] = zdev->aibv; + zpci_ibv[*bit] = zdev->aibv; /* Each function has its own interrupt vector */ - bit = 0; + *bit = 0; } + return 0; +} + +int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ + struct zpci_dev *zdev = to_zpci(pdev); + unsigned int hwirq, msi_vecs, cpu; + struct msi_desc *msi; + struct msi_msg msg; + unsigned long bit; + int cpu_addr; + int rc, irq; + + zdev->aisb = -1UL; + zdev->msi_first_bit = -1U; + + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); + + rc = __alloc_airq(zdev, msi_vecs, &bit); + if (rc < 0) + return rc; /* Request MSI interrupts */ hwirq = bit; -- cgit From ab42fcb511fd9d241bbab7cc3ca04e34e9fc0666 Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Thu, 11 Jul 2024 15:45:27 +0200 Subject: s390/pci: Allow allocation of more than 1 MSI interrupt On a PCI adapter that provides up to 8 MSI interrupt sources the s390 implementation of PCI 
interrupts rejected to accommodate them, although the underlying hardware is able to support that. For MSI-X it is sufficient to allocate a single irq_desc per msi_desc, but for MSI multiple irq descriptors are attached to and controlled by a single msi descriptor. Add the appropriate loops to maintain multiple irq descriptors and tie/untie them to/from the appropriate AIBV bit, if a device driver allocates more than 1 MSI interrupt. Common PCI code passes on requests to allocate a number of interrupt vectors based on the device drivers' demand and the PCI functions' capabilities. However, the root-complex of s390 systems support just a limited number of interrupt vectors per PCI function. Produce a kernel log message to inform about any architecture-specific capping that might be done. With this change, we had a PCI adapter successfully raising interrupts to its device driver via all 8 sources. Fixes: a384c8924a8b ("s390/PCI: Fix single MSI only check") Signed-off-by: Gerd Bayer Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_irq.c | 62 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 979f776b09b8..84482a921332 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -298,8 +298,8 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs, int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) { + unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu; struct zpci_dev *zdev = to_zpci(pdev); - unsigned int hwirq, msi_vecs, cpu; struct msi_desc *msi; struct msi_msg msg; unsigned long bit; @@ -309,30 +309,46 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) zdev->aisb = -1UL; zdev->msi_first_bit = -1U; - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); + if (msi_vecs < nvec) { + pr_info("%s requested %d irqs, allocate system limit of %d", + pci_name(pdev), nvec, zdev->max_msi); + } rc = __alloc_airq(zdev, msi_vecs, &bit); if (rc < 0) return rc; - /* Request MSI interrupts */ + /* + * Request MSI interrupts: + * When using MSI, nvec_used interrupt sources and their irq + * descriptors are controlled through one msi descriptor. + * Thus the outer loop over msi descriptors shall run only once, + * while two inner loops iterate over the interrupt vectors. + * When using MSI-X, each interrupt vector/irq descriptor + * is bound to exactly one msi descriptor (nvec_used is one). + * So the inner loops are executed once, while the outer iterates + * over the MSI-X descriptors. + */ hwirq = bit; msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) { - rc = -EIO; if (hwirq - bit >= msi_vecs) break; - irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, - (irq_delivery == DIRECTED) ? - msi->affinity : NULL); + irqs_per_msi = min_t(unsigned int, msi_vecs, msi->nvec_used); + irq = __irq_alloc_descs(-1, 0, irqs_per_msi, 0, THIS_MODULE, + (irq_delivery == DIRECTED) ? 
+ msi->affinity : NULL); if (irq < 0) return -ENOMEM; - rc = irq_set_msi_desc(irq, msi); - if (rc) - return rc; - irq_set_chip_and_handler(irq, &zpci_irq_chip, - handle_percpu_irq); + + for (i = 0; i < irqs_per_msi; i++) { + rc = irq_set_msi_desc_off(irq, i, msi); + if (rc) + return rc; + irq_set_chip_and_handler(irq + i, &zpci_irq_chip, + handle_percpu_irq); + } + msg.data = hwirq - bit; if (irq_delivery == DIRECTED) { if (msi->affinity) @@ -345,31 +361,35 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) msg.address_lo |= (cpu_addr << 8); for_each_possible_cpu(cpu) { - airq_iv_set_data(zpci_ibv[cpu], hwirq, irq); + for (i = 0; i < irqs_per_msi; i++) + airq_iv_set_data(zpci_ibv[cpu], + hwirq + i, irq + i); } } else { msg.address_lo = zdev->msi_addr & 0xffffffff; - airq_iv_set_data(zdev->aibv, hwirq, irq); + for (i = 0; i < irqs_per_msi; i++) + airq_iv_set_data(zdev->aibv, hwirq + i, irq + i); } msg.address_hi = zdev->msi_addr >> 32; pci_write_msi_msg(irq, &msg); - hwirq++; + hwirq += irqs_per_msi; } zdev->msi_first_bit = bit; - zdev->msi_nr_irqs = msi_vecs; + zdev->msi_nr_irqs = hwirq - bit; rc = zpci_set_irq(zdev); if (rc) return rc; - return (msi_vecs == nvec) ? 0 : msi_vecs; + return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs; } void arch_teardown_msi_irqs(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); struct msi_desc *msi; + unsigned int i; int rc; /* Disable interrupts */ @@ -379,8 +399,10 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) /* Release MSI interrupts */ msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) { - irq_set_msi_desc(msi->irq, NULL); - irq_free_desc(msi->irq); + for (i = 0; i < msi->nvec_used; i++) { + irq_set_msi_desc(msi->irq + i, NULL); + irq_free_desc(msi->irq + i); + } msi->msg.address_lo = 0; msi->msg.address_hi = 0; msi->msg.data = 0; -- cgit From e188e5d5ffd01d484b5255b88739fcf67b300223 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 11 Jul 2024 15:50:26 +0200 Subject: s390/setup: Fix __pa/__va for modules under non-GPL licenses The struct vm_layout contains fields used in __pa/__va calculations. Such fundamental things have to be exported with EXPORT_SYMBOL to avoid breakages of out-of-tree modules under non-GPL licenses. Fixes: 7de0446f0b26 ("s390/boot: Make identity mapping base address explicit") Acked-by: Heiko Carstens Acked-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 3993f4caf224..1faba11d5f0b 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -149,7 +149,7 @@ unsigned long __bootdata_preserved(max_mappable); struct physmem_info __bootdata(physmem_info); struct vm_layout __bootdata_preserved(vm_layout); -EXPORT_SYMBOL_GPL(vm_layout); +EXPORT_SYMBOL(vm_layout); int __bootdata_preserved(__kaslr_enabled); unsigned int __bootdata_preserved(zlib_dfltcc_support); EXPORT_SYMBOL(zlib_dfltcc_support); -- cgit From ec25f99cc834644e6577fa11582f7691589ed8cc Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 23 Jul 2024 14:44:11 +0200 Subject: s390/kmsan: Fix merge conflict with get_lowcore() introduction Resolve the conflict between commit 2a48c8c9cf87 ("s390/kmsan: implement the architecture-specific functions") and commit 39976f1278a9 ("s390: Remove S390_lowcore"). 
Fixes: 2a48c8c9cf87 ("s390/kmsan: implement the architecture-specific functions") Signed-off-by: Ilya Leoshkevich Reviewed-by: Heiko Carstens Link: https://lore.kernel.org/r/20240723124441.120044-2-iii@linux.ibm.com Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/kmsan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/kmsan.h b/arch/s390/include/asm/kmsan.h index 27db65fbf3f6..f73e181d09ae 100644 --- a/arch/s390/include/asm/kmsan.h +++ b/arch/s390/include/asm/kmsan.h @@ -12,8 +12,8 @@ static inline bool is_lowcore_addr(void *addr) { - return addr >= (void *)&S390_lowcore && - addr < (void *)(&S390_lowcore + 1); + return addr >= (void *)get_lowcore() && + addr < (void *)(get_lowcore() + 1); } static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin) @@ -25,7 +25,7 @@ static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin) * order to get a distinct struct page. */ addr += (void *)lowcore_ptr[raw_smp_processor_id()] - - (void *)&S390_lowcore; + (void *)get_lowcore(); if (KMSAN_WARN_ON(is_lowcore_addr(addr))) return NULL; return kmsan_get_metadata(addr, is_origin); -- cgit From 19af288706b25f2213e85b8b2df140c04fd7c63d Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 23 Jul 2024 14:44:12 +0200 Subject: s390/ptdump: Add KMSAN page markers Add KMSAN vmalloc metadata areas to /sys/kernel/debug/kernel_page_tables. Example output: 0x000003a95fff9000-0x000003a960000000 28K PTE I ---[ vmalloc Area End ]--- ---[ Kmsan vmalloc Shadow Start ]--- 0x000003a960000000-0x000003a960010000 64K PTE RW NX [...] 0x000003d3dfff9000-0x000003d3e0000000 28K PTE I ---[ Kmsan vmalloc Shadow End ]--- ---[ Kmsan vmalloc Origins Start ]--- 0x000003d3e0000000-0x000003d3e0010000 64K PTE RW NX [...] 0x000003fe5fff9000-0x000003fe60000000 28K PTE I ---[ Kmsan vmalloc Origins End ]--- ---[ Kmsan Modules Shadow Start ]--- 0x000003fe60000000-0x000003fe60001000 4K PTE RW NX [...] 0x000003fe60100000-0x000003fee0000000 2047M PMD I ---[ Kmsan Modules Shadow End ]--- ---[ Kmsan Modules Origins Start ]--- 0x000003fee0000000-0x000003fee0001000 4K PTE RW NX [...] 
0x000003fee0100000-0x000003ff60000000 2047M PMD I ---[ Kmsan Modules Origins End ]--- ---[ Modules Area Start ]--- 0x000003ff60000000-0x000003ff60001000 4K PTE RO X Signed-off-by: Ilya Leoshkevich Reviewed-by: Heiko Carstens Link: https://lore.kernel.org/r/20240723124441.120044-3-iii@linux.ibm.com Signed-off-by: Vasily Gorbik --- arch/s390/mm/dump_pagetables.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 45db5f47b22d..98dab3e049de 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -36,6 +36,16 @@ enum address_markers_idx { VMEMMAP_END_NR, VMALLOC_NR, VMALLOC_END_NR, +#ifdef CONFIG_KMSAN + KMSAN_VMALLOC_SHADOW_START_NR, + KMSAN_VMALLOC_SHADOW_END_NR, + KMSAN_VMALLOC_ORIGIN_START_NR, + KMSAN_VMALLOC_ORIGIN_END_NR, + KMSAN_MODULES_SHADOW_START_NR, + KMSAN_MODULES_SHADOW_END_NR, + KMSAN_MODULES_ORIGIN_START_NR, + KMSAN_MODULES_ORIGIN_END_NR, +#endif MODULES_NR, MODULES_END_NR, ABS_LOWCORE_NR, @@ -65,6 +75,16 @@ static struct addr_marker address_markers[] = { [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, [VMALLOC_NR] = {0, "vmalloc Area Start"}, [VMALLOC_END_NR] = {0, "vmalloc Area End"}, +#ifdef CONFIG_KMSAN + [KMSAN_VMALLOC_SHADOW_START_NR] = {0, "Kmsan vmalloc Shadow Start"}, + [KMSAN_VMALLOC_SHADOW_END_NR] = {0, "Kmsan vmalloc Shadow End"}, + [KMSAN_VMALLOC_ORIGIN_START_NR] = {0, "Kmsan vmalloc Origins Start"}, + [KMSAN_VMALLOC_ORIGIN_END_NR] = {0, "Kmsan vmalloc Origins End"}, + [KMSAN_MODULES_SHADOW_START_NR] = {0, "Kmsan Modules Shadow Start"}, + [KMSAN_MODULES_SHADOW_END_NR] = {0, "Kmsan Modules Shadow End"}, + [KMSAN_MODULES_ORIGIN_START_NR] = {0, "Kmsan Modules Origins Start"}, + [KMSAN_MODULES_ORIGIN_END_NR] = {0, "Kmsan Modules Origins End"}, +#endif [MODULES_NR] = {0, "Modules Area Start"}, [MODULES_END_NR] = {0, "Modules Area End"}, [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, @@ -306,6 +326,16 @@ static int pt_dump_init(void) #ifdef CONFIG_KFENCE address_markers[KFENCE_START_NR].start_address = kfence_start; address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE; +#endif +#ifdef CONFIG_KMSAN + address_markers[KMSAN_VMALLOC_SHADOW_START_NR].start_address = KMSAN_VMALLOC_SHADOW_START; + address_markers[KMSAN_VMALLOC_SHADOW_END_NR].start_address = KMSAN_VMALLOC_SHADOW_END; + address_markers[KMSAN_VMALLOC_ORIGIN_START_NR].start_address = KMSAN_VMALLOC_ORIGIN_START; + address_markers[KMSAN_VMALLOC_ORIGIN_END_NR].start_address = KMSAN_VMALLOC_ORIGIN_END; + address_markers[KMSAN_MODULES_SHADOW_START_NR].start_address = KMSAN_MODULES_SHADOW_START; + address_markers[KMSAN_MODULES_SHADOW_END_NR].start_address = KMSAN_MODULES_SHADOW_END; + address_markers[KMSAN_MODULES_ORIGIN_START_NR].start_address = KMSAN_MODULES_ORIGIN_START; + address_markers[KMSAN_MODULES_ORIGIN_END_NR].start_address = KMSAN_MODULES_ORIGIN_END; #endif sort_address_markers(); #ifdef CONFIG_PTDUMP_DEBUGFS -- cgit From e6ce1f12d777f6ee22b20e10ae6a771e7e6f44f5 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 15 Jul 2024 12:07:29 +0200 Subject: s390/cpum_cf: Fix endless loop in CF_DIAG event stop Event CF_DIAG reads out complete counter sets using stcctm instruction. This is done at event start time when the process starts execution and at event stop time when the process is removed from the CPU. During removal the difference of each counter in the counter sets is calculated and saved as raw data in the ring buffer. 
This works fine unless the number of counters in a counter set is zero. This may happen for the extended counter set. This set is machine specific and the size of the counter set can be zero even when extended counter set is authorized for read access. This case is not handled. cfdiag_diffctr() checks authorization of the extended counter set. If true the functions assumes the extended counter set has been saved in a data buffer. However this is not the case, cfdiag_getctrset() does not save a counter set with counter set size of zero. This mismatch causes an endless loop in the counter set readout during event stop handling. The calculation of the difference of the counters in each counter now verifies the size of the counter set is non-zero. A counter set with size zero is skipped. Fixes: a029a4eab39e ("s390/cpumf: Allow concurrent access for CPU Measurement Counter Facility") Signed-off-by: Thomas Richter Acked-by: Sumanth Korikkar Acked-by: Heiko Carstens Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_cpum_cf.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 1434642e9cba..6968be98af11 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -556,25 +556,31 @@ static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) struct cf_trailer_entry *trailer_start, *trailer_stop; struct cf_ctrset_entry *ctrstart, *ctrstop; size_t offset = 0; + int i; - auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; - do { + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); + /* Counter set not authorized */ + if (!(auth & cpumf_ctr_ctl[i])) + continue; + /* Counter set size zero was not saved */ + if (!cpum_cf_read_setsize(i)) + continue; + if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { pr_err_once("cpum_cf_diag counter set compare error " "in set %i\n", ctrstart->set); return 0; } - auth &= ~cpumf_ctr_ctl[ctrstart->set]; if (ctrstart->def == CF_DIAG_CTRSET_DEF) { cfdiag_diffctrset((u64 *)(ctrstart + 1), (u64 *)(ctrstop + 1), ctrstart->ctr); offset += ctrstart->ctr * sizeof(u64) + sizeof(*ctrstart); } - } while (ctrstart->def && auth); + } /* Save time_stamp from start of event in stop's trailer */ trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); -- cgit From b798b685b42c9dbe508e59a74250d97c41bec35e Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 17 Jul 2024 21:43:22 +0200 Subject: s390/boot: Do not assume the decompressor range is reserved When allocating a random memory range for .amode31 sections the minimal randomization address is 0. That does not lead to a possible overlap with the decompressor image (which also starts from 0) since by that time the image range is already reserved. Do not assume the decompressor range is reserved and always provide the minimal randomization address for .amode31 sections beyond the decompressor. That is a prerequisite for moving the lowcore memory address from NULL elsewhere. 
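A condensed before/after view of the change (the helper and symbols are taken from the call site in the patch below; this is a sketch, not the full hunk):

	/* before: start at 0 and rely on the decompressor range being reserved */
	amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G);

	/* after: never hand out addresses below the end of the decompressor image */
	amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE,
					     (unsigned long)_decompressor_end, SZ_2G);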
Signed-off-by: Alexander Gordeev Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/boot/startup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index c59014945af0..cc8753c0c121 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -478,8 +478,12 @@ void startup_kernel(void) * before the kernel started. Therefore, in case the two sections * overlap there is no risk of corrupting any data. */ - if (kaslr_enabled()) - amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G); + if (kaslr_enabled()) { + unsigned long amode31_min; + + amode31_min = (unsigned long)_decompressor_end; + amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, amode31_min, SZ_2G); + } if (!amode31_lma) amode31_lma = __kaslr_offset_phys - vmlinux.amode31_size; physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size); -- cgit From a795eeaf851b91c79fc357a245ca72fd1e7df906 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 16 Jul 2024 09:26:13 +0200 Subject: s390/smp: Handle restart interrupt on ipl cpu The current smp code allows to trigger a restart interrupt on CPUs offline in linux. To allow using the percpu infrastructure instead of the pcpu_devices array, switch to the ipl cpu which is always online before calling do_restart(). Reviewed-by: Heiko Carstens Reviewed-by: Alexander Gordeev Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/smp.h | 1 - arch/s390/kernel/ipl.c | 2 +- arch/s390/kernel/smp.c | 15 --------------- 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index c13c79025348..cd835f4fb11a 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -24,7 +24,6 @@ extern int __cpu_up(unsigned int cpu, struct task_struct *tidle); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); -extern void smp_call_online_cpu(void (*func)(void *), void *); extern void smp_call_ipl_cpu(void (*func)(void *), void *); extern void smp_emergency_stop(void); diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 3a7d6e172211..f17bb7bf9392 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -2112,7 +2112,7 @@ void do_restart(void *arg) tracing_off(); debug_locks_off(); lgr_info_log(); - smp_call_online_cpu(__do_restart, arg); + smp_call_ipl_cpu(__do_restart, arg); } /* on halt */ diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index c3c54adf67bc..1e1290525423 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -382,21 +382,6 @@ static int pcpu_set_smt(unsigned int mtid) return cc; } -/* - * Call function on an online CPU. - */ -void smp_call_online_cpu(void (*func)(void *), void *data) -{ - struct pcpu *pcpu; - - /* Use the current cpu if it is online. */ - pcpu = pcpu_find_address(cpu_online_mask, stap()); - if (!pcpu) - /* Use the first online cpu. */ - pcpu = pcpu_devices + cpumask_first(cpu_online_mask); - pcpu_delegate(pcpu, func, data, (unsigned long) restart_stack); -} - /* * Call function on the ipl CPU. 
*/ -- cgit From 90fc5ac28235843b1d5070c5dac9b01e7d39b24c Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 16 Jul 2024 09:26:14 +0200 Subject: s390/smp: Switch pcpu_devices to percpu In preparation of moving the CIF flags from lowcore to pcpu_devices, convert the pcpu_devices array to use the percpu infrastructure. This is required because using the pcpu_devices array as it is would introduce a performance penalty due to the fact that CPU flags for multiple CPUs would end up in the same cacheline. Note that a pointer to the pcpu struct of the IPL CPU is still required. This is because a restart interrupt can be triggered on an offline CPU. s390 stores the percpu offset in lowcore, but offline CPUs have no lowcore area allocated. So percpu data cannot be used from an offline CPU and we need to get the pcpu pointer for the IPL cpu from somewhere else. Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/smp.c | 113 +++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 56 deletions(-) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 1e1290525423..b36b089b9a26 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -83,7 +83,14 @@ struct pcpu { }; static u8 boot_core_type; -static struct pcpu pcpu_devices[NR_CPUS]; +static DEFINE_PER_CPU(struct pcpu, pcpu_devices); +/* + * Pointer to the pcpu area of the boot CPU. This is required when a restart + * interrupt is triggered on an offline CPU. For that case accessing percpu + * data with the common primitives does not work, since the percpu offset is + * stored in a non existent lowcore. + */ +static struct pcpu *ipl_pcpu; unsigned int smp_cpu_mt_shift; EXPORT_SYMBOL(smp_cpu_mt_shift); @@ -174,8 +181,8 @@ static struct pcpu *pcpu_find_address(const struct cpumask *mask, u16 address) int cpu; for_each_cpu(cpu, mask) - if (pcpu_devices[cpu].address == address) - return pcpu_devices + cpu; + if (per_cpu(pcpu_devices, cpu).address == address) + return &per_cpu(pcpu_devices, cpu); return NULL; } @@ -230,13 +237,11 @@ out: return -ENOMEM; } -static void pcpu_free_lowcore(struct pcpu *pcpu) +static void pcpu_free_lowcore(struct pcpu *pcpu, int cpu) { unsigned long async_stack, nodat_stack, mcck_stack; struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; nodat_stack = lc->nodat_stack - STACK_INIT_OFFSET; async_stack = lc->async_stack - STACK_INIT_OFFSET; @@ -277,12 +282,10 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) arch_spin_lock_setup(cpu); } -static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) +static void pcpu_attach_task(int cpu, struct task_struct *tsk) { struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; lc->kernel_stack = (unsigned long)task_stack_page(tsk) + STACK_INIT_OFFSET; lc->current_task = (unsigned long)tsk; @@ -296,18 +299,16 @@ static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) lc->steal_timer = 0; } -static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data) +static void pcpu_start_fn(int cpu, void (*func)(void *), void *data) { struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; lc->restart_stack = lc->kernel_stack; lc->restart_fn = (unsigned long) func; lc->restart_data = (unsigned long) data; lc->restart_source = -1U; - pcpu_sigp_retry(pcpu, SIGP_RESTART, 0); + pcpu_sigp_retry(per_cpu_ptr(&pcpu_devices, cpu), SIGP_RESTART, 0); } typedef 
void (pcpu_delegate_fn)(void *); @@ -320,14 +321,14 @@ static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) func(data); /* should not return */ } -static void pcpu_delegate(struct pcpu *pcpu, +static void pcpu_delegate(struct pcpu *pcpu, int cpu, pcpu_delegate_fn *func, void *data, unsigned long stack) { struct lowcore *lc, *abs_lc; unsigned int source_cpu; - lc = lowcore_ptr[pcpu - pcpu_devices]; + lc = lowcore_ptr[cpu]; source_cpu = stap(); if (pcpu->address == source_cpu) { @@ -377,7 +378,7 @@ static int pcpu_set_smt(unsigned int mtid) smp_cpu_mt_shift = 0; while (smp_cpu_mtid >= (1U << smp_cpu_mt_shift)) smp_cpu_mt_shift++; - pcpu_devices[0].address = stap(); + per_cpu(pcpu_devices, 0).address = stap(); } return cc; } @@ -389,11 +390,10 @@ void smp_call_ipl_cpu(void (*func)(void *), void *data) { struct lowcore *lc = lowcore_ptr[0]; - if (pcpu_devices[0].address == stap()) + if (ipl_pcpu->address == stap()) lc = get_lowcore(); - pcpu_delegate(&pcpu_devices[0], func, data, - lc->nodat_stack); + pcpu_delegate(ipl_pcpu, 0, func, data, lc->nodat_stack); } int smp_find_processor_id(u16 address) @@ -401,21 +401,21 @@ int smp_find_processor_id(u16 address) int cpu; for_each_present_cpu(cpu) - if (pcpu_devices[cpu].address == address) + if (per_cpu(pcpu_devices, cpu).address == address) return cpu; return -1; } void schedule_mcck_handler(void) { - pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending); + pcpu_ec_call(this_cpu_ptr(&pcpu_devices), ec_mcck_pending); } bool notrace arch_vcpu_is_preempted(int cpu) { if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu)) return false; - if (pcpu_running(pcpu_devices + cpu)) + if (pcpu_running(per_cpu_ptr(&pcpu_devices, cpu))) return false; return true; } @@ -427,7 +427,7 @@ void notrace smp_yield_cpu(int cpu) return; diag_stat_inc_norecursion(DIAG_STAT_X09C); asm volatile("diag %0,0,0x9c" - : : "d" (pcpu_devices[cpu].address)); + : : "d" (per_cpu(pcpu_devices, cpu).address)); } EXPORT_SYMBOL_GPL(smp_yield_cpu); @@ -448,7 +448,7 @@ void notrace smp_emergency_stop(void) end = get_tod_clock() + (1000000UL << 12); for_each_cpu(cpu, &cpumask) { - struct pcpu *pcpu = pcpu_devices + cpu; + struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu); set_bit(ec_stop_cpu, &pcpu->ec_mask); while (__pcpu_sigp(pcpu->address, SIGP_EMERGENCY_SIGNAL, 0, NULL) == SIGP_CC_BUSY && @@ -457,7 +457,7 @@ void notrace smp_emergency_stop(void) } while (get_tod_clock() < end) { for_each_cpu(cpu, &cpumask) - if (pcpu_stopped(pcpu_devices + cpu)) + if (pcpu_stopped(per_cpu_ptr(&pcpu_devices, cpu))) cpumask_clear_cpu(cpu, &cpumask); if (cpumask_empty(&cpumask)) break; @@ -472,6 +472,7 @@ NOKPROBE_SYMBOL(smp_emergency_stop); */ void smp_send_stop(void) { + struct pcpu *pcpu; int cpu; /* Disable all interrupts/machine checks */ @@ -487,8 +488,9 @@ void smp_send_stop(void) for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; - pcpu_sigp_retry(pcpu_devices + cpu, SIGP_STOP, 0); - while (!pcpu_stopped(pcpu_devices + cpu)) + pcpu = per_cpu_ptr(&pcpu_devices, cpu); + pcpu_sigp_retry(pcpu, SIGP_STOP, 0); + while (!pcpu_stopped(pcpu)) cpu_relax(); } } @@ -502,7 +504,7 @@ static void smp_handle_ext_call(void) unsigned long bits; /* handle bit signal external calls */ - bits = xchg(&pcpu_devices[smp_processor_id()].ec_mask, 0); + bits = this_cpu_xchg(pcpu_devices.ec_mask, 0); if (test_bit(ec_stop_cpu, &bits)) smp_stop_cpu(); if (test_bit(ec_schedule, &bits)) @@ -527,12 +529,12 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) int cpu; 
for_each_cpu(cpu, mask) - pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_call_function_single); } void arch_send_call_function_single_ipi(int cpu) { - pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_call_function_single); } /* @@ -542,13 +544,13 @@ void arch_send_call_function_single_ipi(int cpu) */ void arch_smp_send_reschedule(int cpu) { - pcpu_ec_call(pcpu_devices + cpu, ec_schedule); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_schedule); } #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { - pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work); + pcpu_ec_call(this_cpu_ptr(&pcpu_devices), ec_irq_work); } #endif @@ -560,7 +562,7 @@ int smp_store_status(int cpu) struct pcpu *pcpu; unsigned long pa; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); lc = lowcore_ptr[cpu]; pa = __pa(&lc->floating_pt_save_area); if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS, @@ -668,17 +670,17 @@ void __init smp_save_dump_secondary_cpus(void) void smp_cpu_set_polarization(int cpu, int val) { - pcpu_devices[cpu].polarization = val; + per_cpu(pcpu_devices, cpu).polarization = val; } int smp_cpu_get_polarization(int cpu) { - return pcpu_devices[cpu].polarization; + return per_cpu(pcpu_devices, cpu).polarization; } int smp_cpu_get_cpu_address(int cpu) { - return pcpu_devices[cpu].address; + return per_cpu(pcpu_devices, cpu).address; } static void __ref smp_get_core_info(struct sclp_core_info *info, int early) @@ -717,7 +719,7 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, for (i = 0; (i <= smp_cpu_mtid) && (cpu < nr_cpu_ids); i++) { if (pcpu_find_address(cpu_present_mask, address + i)) continue; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); pcpu->address = address + i; if (configured) pcpu->state = CPU_STATE_CONFIGURED; @@ -752,7 +754,7 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) * that all SMT threads get subsequent logical CPU numbers. 
*/ if (early) { - core_id = pcpu_devices[0].address >> smp_cpu_mt_shift; + core_id = per_cpu(pcpu_devices, 0).address >> smp_cpu_mt_shift; for (i = 0; i < info->configured; i++) { core = &info->core[i]; if (core->core_id == core_id) { @@ -852,7 +854,7 @@ static void smp_start_secondary(void *cpuvoid) /* Upping and downing of CPUs */ int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - struct pcpu *pcpu = pcpu_devices + cpu; + struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu); int rc; if (pcpu->state != CPU_STATE_CONFIGURED) @@ -870,8 +872,8 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) */ system_ctlreg_lock(); pcpu_prepare_secondary(pcpu, cpu); - pcpu_attach_task(pcpu, tidle); - pcpu_start_fn(pcpu, smp_start_secondary, NULL); + pcpu_attach_task(cpu, tidle); + pcpu_start_fn(cpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ while (!cpu_online(cpu)) cpu_relax(); @@ -916,10 +918,10 @@ void __cpu_die(unsigned int cpu) struct pcpu *pcpu; /* Wait until target cpu is down */ - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); while (!pcpu_stopped(pcpu)) cpu_relax(); - pcpu_free_lowcore(pcpu); + pcpu_free_lowcore(pcpu, cpu); cpumask_clear_cpu(cpu, mm_cpumask(&init_mm)); cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask); } @@ -927,7 +929,7 @@ void __cpu_die(unsigned int cpu) void __noreturn cpu_die(void) { idle_task_exit(); - pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0); + pcpu_sigp_retry(this_cpu_ptr(&pcpu_devices), SIGP_STOP, 0); for (;;) ; } @@ -957,10 +959,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { - struct pcpu *pcpu = pcpu_devices; - WARN_ON(!cpu_present(0) || !cpu_online(0)); - pcpu->state = CPU_STATE_CONFIGURED; + ipl_pcpu = per_cpu_ptr(&pcpu_devices, 0); + ipl_pcpu->state = CPU_STATE_CONFIGURED; get_lowcore()->percpu_offset = __per_cpu_offset[0]; smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); } @@ -969,8 +970,8 @@ void __init smp_setup_processor_id(void) { struct lowcore *lc = get_lowcore(); - pcpu_devices[0].address = stap(); lc->cpu_nr = 0; + per_cpu(pcpu_devices, 0).address = stap(); lc->spinlock_lockval = arch_spin_lockval(0); lc->spinlock_index = 0; } @@ -992,7 +993,7 @@ static ssize_t cpu_configure_show(struct device *dev, ssize_t count; mutex_lock(&smp_cpu_state_mutex); - count = sprintf(buf, "%d\n", pcpu_devices[dev->id].state); + count = sprintf(buf, "%d\n", per_cpu(pcpu_devices, dev->id).state); mutex_unlock(&smp_cpu_state_mutex); return count; } @@ -1018,7 +1019,7 @@ static ssize_t cpu_configure_store(struct device *dev, for (i = 0; i <= smp_cpu_mtid; i++) if (cpu_online(cpu + i)) goto out; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); rc = 0; switch (val) { case 0: @@ -1030,7 +1031,7 @@ static ssize_t cpu_configure_store(struct device *dev, for (i = 0; i <= smp_cpu_mtid; i++) { if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) continue; - pcpu[i].state = CPU_STATE_STANDBY; + per_cpu(pcpu_devices, cpu + i).state = CPU_STATE_STANDBY; smp_cpu_set_polarization(cpu + i, POLARIZATION_UNKNOWN); } @@ -1045,7 +1046,7 @@ static ssize_t cpu_configure_store(struct device *dev, for (i = 0; i <= smp_cpu_mtid; i++) { if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) continue; - pcpu[i].state = CPU_STATE_CONFIGURED; + per_cpu(pcpu_devices, cpu + i).state = CPU_STATE_CONFIGURED; smp_cpu_set_polarization(cpu + i, POLARIZATION_UNKNOWN); } @@ -1064,7 +1065,7 @@ static DEVICE_ATTR(configure, 0644, 
cpu_configure_show, cpu_configure_store); static ssize_t show_cpu_address(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pcpu_devices[dev->id].address); + return sprintf(buf, "%d\n", per_cpu(pcpu_devices, dev->id).address); } static DEVICE_ATTR(address, 0444, show_cpu_address, NULL); @@ -1090,14 +1091,14 @@ static struct attribute_group cpu_online_attr_group = { static int smp_cpu_online(unsigned int cpu) { - struct cpu *c = &per_cpu(cpu_devices, cpu); + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); return sysfs_create_group(&c->dev.kobj, &cpu_online_attr_group); } static int smp_cpu_pre_down(unsigned int cpu) { - struct cpu *c = &per_cpu(cpu_devices, cpu); + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); sysfs_remove_group(&c->dev.kobj, &cpu_online_attr_group); return 0; @@ -1110,7 +1111,7 @@ bool arch_cpu_is_hotpluggable(int cpu) int arch_register_cpu(int cpu) { - struct cpu *c = &per_cpu(cpu_devices, cpu); + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); int rc; c->hotpluggable = arch_cpu_is_hotpluggable(cpu); -- cgit From d3604ffba1521f59f312be3f19999084dddef446 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 16 Jul 2024 09:26:15 +0200 Subject: s390: Move CIF flags to struct pcpu To allow testing flags for offline CPUs, move the CIF flags to struct pcpu. To avoid having to calculate the array index for each access, add a pointer to the pcpu member for the current cpu to lowcore. Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/lowcore.h | 3 +-- arch/s390/include/asm/processor.h | 26 ++++++++++++++++++++------ arch/s390/kernel/asm-offsets.c | 4 +++- arch/s390/kernel/entry.S | 3 ++- arch/s390/kernel/setup.c | 1 + arch/s390/kernel/smp.c | 17 +++++++---------- 6 files changed, 34 insertions(+), 20 deletions(-) diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index c724e71e1785..bce3a69ab2a3 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -97,8 +97,7 @@ struct lowcore { __u64 save_area_async[8]; /* 0x0240 */ __u64 save_area_restart[1]; /* 0x0280 */ - /* CPU flags. */ - __u64 cpu_flags; /* 0x0288 */ + __u64 pcpu; /* 0x0288 */ /* Return psws. 
*/ psw_t return_psw; /* 0x0290 */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index c87cf2b8e81a..5debb12614ad 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -42,21 +42,37 @@ #include #include +struct pcpu { + unsigned long ec_mask; /* bit mask for ec_xxx functions */ + unsigned long ec_clk; /* sigp timestamp for ec_xxx */ + unsigned long flags; /* per CPU flags */ + signed char state; /* physical cpu state */ + signed char polarization; /* physical polarization */ + u16 address; /* physical cpu address */ +}; + +DECLARE_PER_CPU(struct pcpu, pcpu_devices); + typedef long (*sys_call_ptr_t)(struct pt_regs *regs); +static __always_inline struct pcpu *this_pcpu(void) +{ + return (struct pcpu *)(get_lowcore()->pcpu); +} + static __always_inline void set_cpu_flag(int flag) { - get_lowcore()->cpu_flags |= (1UL << flag); + this_pcpu()->flags |= (1UL << flag); } static __always_inline void clear_cpu_flag(int flag) { - get_lowcore()->cpu_flags &= ~(1UL << flag); + this_pcpu()->flags &= ~(1UL << flag); } static __always_inline bool test_cpu_flag(int flag) { - return get_lowcore()->cpu_flags & (1UL << flag); + return this_pcpu()->flags & (1UL << flag); } static __always_inline bool test_and_set_cpu_flag(int flag) @@ -81,9 +97,7 @@ static __always_inline bool test_and_clear_cpu_flag(int flag) */ static __always_inline bool test_cpu_flag_of(int flag, int cpu) { - struct lowcore *lc = lowcore_ptr[cpu]; - - return lc->cpu_flags & (1UL << flag); + return per_cpu(pcpu_devices, cpu).flags & (1UL << flag); } #define arch_needs_cpu() test_cpu_flag(CIF_NOHZ_DELAY) diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 26bb45d0e6f1..58fc6b93b475 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -114,7 +114,7 @@ int main(void) OFFSET(__LC_SAVE_AREA_SYNC, lowcore, save_area_sync); OFFSET(__LC_SAVE_AREA_ASYNC, lowcore, save_area_async); OFFSET(__LC_SAVE_AREA_RESTART, lowcore, save_area_restart); - OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags); + OFFSET(__LC_PCPU, lowcore, pcpu); OFFSET(__LC_RETURN_PSW, lowcore, return_psw); OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw); OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer); @@ -186,5 +186,7 @@ int main(void) #endif OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs); DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs)); + + OFFSET(__PCPU_FLAGS, pcpu, flags); return 0; } diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 454b6b92c7f8..fa58bd2c48c9 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -480,7 +480,8 @@ SYM_CODE_START(mcck_int_handler) clgrjl %r9,%r14, 4f larl %r14,.Lsie_leave clgrjhe %r9,%r14, 4f - oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST + lg %r10,__LC_PCPU + oi __PCPU_FLAGS+7(%r10), _CIF_MCCK_GUEST 4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST SIEEXIT __SF_SIE_CONTROL(%r15) #endif diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 1faba11d5f0b..178daf4e3563 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -406,6 +406,7 @@ static void __init setup_lowcore(void) panic("%s: Failed to allocate %zu bytes align=%zx\n", __func__, sizeof(*lc), sizeof(*lc)); + lc->pcpu = (unsigned long)per_cpu_ptr(&pcpu_devices, 0); lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; lc->restart_psw.addr = __pa(restart_int_handler); lc->external_new_psw.mask = PSW_KERNEL_BITS; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c 
index b36b089b9a26..fbba37ec53cf 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -74,16 +74,8 @@ enum { CPU_STATE_CONFIGURED, }; -struct pcpu { - unsigned long ec_mask; /* bit mask for ec_xxx functions */ - unsigned long ec_clk; /* sigp timestamp for ec_xxx */ - signed char state; /* physical cpu state */ - signed char polarization; /* physical polarization */ - u16 address; /* physical cpu address */ -}; - static u8 boot_core_type; -static DEFINE_PER_CPU(struct pcpu, pcpu_devices); +DEFINE_PER_CPU(struct pcpu, pcpu_devices); /* * Pointer to the pcpu area of the boot CPU. This is required when a restart * interrupt is triggered on an offline CPU. For that case accessing percpu @@ -264,6 +256,7 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); lc->cpu_nr = cpu; + lc->pcpu = (unsigned long)pcpu; lc->restart_flags = RESTART_FLAG_CTLREGS; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; @@ -924,6 +917,7 @@ void __cpu_die(unsigned int cpu) pcpu_free_lowcore(pcpu, cpu); cpumask_clear_cpu(cpu, mm_cpumask(&init_mm)); cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask); + pcpu->flags = 0; } void __noreturn cpu_die(void) @@ -959,10 +953,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { + struct lowcore *lc = get_lowcore(); + WARN_ON(!cpu_present(0) || !cpu_online(0)); + lc->percpu_offset = __per_cpu_offset[0]; ipl_pcpu = per_cpu_ptr(&pcpu_devices, 0); ipl_pcpu->state = CPU_STATE_CONFIGURED; - get_lowcore()->percpu_offset = __per_cpu_offset[0]; + lc->pcpu = (unsigned long)ipl_pcpu; smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); } -- cgit From 035248a7843242d51f249444fbad7340b7336f68 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 16 Jul 2024 13:50:48 +0200 Subject: s390/alternatives: Remove noaltinstr option The current Kernel doesn't boot without alternative patching on z16 machines. To avoid such bugs in the future, remove the option disable alternative patching. Signed-off-by: Sven Schnelle Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- Documentation/admin-guide/kernel-parameters.txt | 3 --- arch/s390/kernel/alternative.c | 21 ++------------------- 2 files changed, 2 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c1134ad5f06d..f1384c7b59c9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3830,9 +3830,6 @@ noalign [KNL,ARM] - noaltinstr [S390,EARLY] Disables alternative instructions - patching (CPU alternatives feature). - noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any IOAPICs that may be present in the system. 
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index 1ac5f707dd70..7971bc1bf496 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -7,18 +7,8 @@ #include #include -static int __initdata_or_module alt_instr_disabled; - -static int __init disable_alternative_instructions(char *str) -{ - alt_instr_disabled = 1; - return 0; -} - -early_param("noaltinstr", disable_alternative_instructions); - -static void __init_or_module __apply_alternatives(struct alt_instr *start, - struct alt_instr *end) +void __init_or_module apply_alternatives(struct alt_instr *start, + struct alt_instr *end) { struct alt_instr *a; u8 *instr, *replacement; @@ -37,13 +27,6 @@ static void __init_or_module __apply_alternatives(struct alt_instr *start, } } -void __init_or_module apply_alternatives(struct alt_instr *start, - struct alt_instr *end) -{ - if (!alt_instr_disabled) - __apply_alternatives(start, end); -} - extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; void __init apply_alternative_instructions(void) { -- cgit From 9be999a61232fd6748ebe8654c71bcde1a0fbed3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Jul 2024 13:50:49 +0200 Subject: s390/alternatives: Use consistent naming The alternative code is using the words facility and feature for the same. Rename facility to more generic feature everywhere to have consistent naming. Reviewed-by: Alexander Gordeev Tested-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/alternative.h | 30 +++++++++++++++--------------- arch/s390/kernel/alternative.c | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index dd93b92c3ab6..07459553d64f 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -11,7 +11,7 @@ struct alt_instr { s32 instr_offset; /* original instruction */ s32 repl_offset; /* offset to replacement instruction */ - u16 facility; /* facility bit set for replacement */ + u16 feature; /* feature required for replacement */ u8 instrlen; /* length of original instruction */ } __packed; @@ -48,10 +48,10 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); #define OLDINSTR(oldinstr) \ "661:\n\t" oldinstr "\n662:\n" -#define ALTINSTR_ENTRY(facility, num) \ +#define ALTINSTR_ENTRY(feature, num) \ "\t.long 661b - .\n" /* old instruction */ \ "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \ - "\t.word " __stringify(facility) "\n" /* facility bit */ \ + "\t.word " __stringify(feature) "\n" /* feature */ \ "\t.byte " oldinstr_len "\n" /* instruction len */ \ "\t.org . - (" oldinstr_len ") & 1\n" \ "\t.org . 
- (" oldinstr_len ") + (" altinstr_len(num) ")\n" \ @@ -61,24 +61,24 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" /* alternative assembly primitive: */ -#define ALTERNATIVE(oldinstr, altinstr, facility) \ +#define ALTERNATIVE(oldinstr, altinstr, feature) \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(altinstr, 1) \ ".popsection\n" \ OLDINSTR(oldinstr) \ ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(facility, 1) \ + ALTINSTR_ENTRY(feature, 1) \ ".popsection\n" -#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2)\ +#define ALTERNATIVE_2(oldinstr, altinstr1, feature1, altinstr2, feature2)\ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(altinstr1, 1) \ ALTINSTR_REPLACEMENT(altinstr2, 2) \ ".popsection\n" \ OLDINSTR(oldinstr) \ ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(facility1, 1) \ - ALTINSTR_ENTRY(facility2, 2) \ + ALTINSTR_ENTRY(feature1, 1) \ + ALTINSTR_ENTRY(feature2, 2) \ ".popsection\n" /* @@ -93,12 +93,12 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); * For non barrier like inlines please define new variants * without volatile and memory clobber. */ -#define alternative(oldinstr, altinstr, facility) \ - asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility) : : : "memory") +#define alternative(oldinstr, altinstr, feature) \ + asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, feature) : : : "memory") -#define alternative_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \ - asm_inline volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \ - altinstr2, facility2) ::: "memory") +#define alternative_2(oldinstr, altinstr1, feature1, altinstr2, feature2) \ + asm_inline volatile(ALTERNATIVE_2(oldinstr, altinstr1, feature1, \ + altinstr2, feature2) ::: "memory") /* Alternative inline assembly with input. */ #define alternative_input(oldinstr, newinstr, feature, input...) \ @@ -106,8 +106,8 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); : : input) /* Like alternative_input, but with a single output argument */ -#define alternative_io(oldinstr, altinstr, facility, output, input...) \ - asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility) \ +#define alternative_io(oldinstr, altinstr, feature, output, input...) \ + asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, feature) \ : output : input) /* Use this macro if more than one output parameter is needed. */ diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index 7971bc1bf496..e2b6549f8eaf 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -21,7 +21,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - if (!__test_facility(a->facility, alt_stfle_fac_list)) + if (!__test_facility(a->feature, alt_stfle_fac_list)) continue; s390_kernel_write(instr, replacement, a->instrlen); } -- cgit From c77f7354c4478bf4560b546913e097b3d4ab50c1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Jul 2024 13:50:50 +0200 Subject: s390/alternatives: Merge both alternative header files The two alternative header files must stay in sync. This is easier to achieve within one header file. Therefore merge both of them and have only one file, like most other architectures. 
Reviewed-by: Alexander Gordeev Tested-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/alternative-asm.h | 57 --------------------------------- arch/s390/include/asm/alternative.h | 50 +++++++++++++++++++++++++++++ arch/s390/kernel/entry.S | 2 +- 3 files changed, 51 insertions(+), 58 deletions(-) delete mode 100644 arch/s390/include/asm/alternative-asm.h diff --git a/arch/s390/include/asm/alternative-asm.h b/arch/s390/include/asm/alternative-asm.h deleted file mode 100644 index 608f6287ca9c..000000000000 --- a/arch/s390/include/asm/alternative-asm.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390_ALTERNATIVE_ASM_H -#define _ASM_S390_ALTERNATIVE_ASM_H - -#ifdef __ASSEMBLY__ - -/* - * Issue one struct alt_instr descriptor entry (need to put it into - * the section .altinstructions, see below). This entry contains - * enough information for the alternatives patching code to patch an - * instruction. See apply_alternatives(). - */ -.macro alt_entry orig_start, orig_end, alt_start, alt_end, feature - .long \orig_start - . - .long \alt_start - . - .word \feature - .byte \orig_end - \orig_start - .org . - ( \orig_end - \orig_start ) & 1 - .org . - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start ) - .org . - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start ) -.endm - -/* - * Define an alternative between two instructions. If @feature is - * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. - */ -.macro ALTERNATIVE oldinstr, newinstr, feature - .pushsection .altinstr_replacement,"ax" -770: \newinstr -771: .popsection -772: \oldinstr -773: .pushsection .altinstructions,"a" - alt_entry 772b, 773b, 770b, 771b, \feature - .popsection -.endm - -/* - * Define an alternative between two instructions. If @feature is - * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. - */ -.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 - .pushsection .altinstr_replacement,"ax" -770: \newinstr1 -771: \newinstr2 -772: .popsection -773: \oldinstr -774: .pushsection .altinstructions,"a" - alt_entry 773b, 774b, 770b, 771b,\feature1 - alt_entry 773b, 774b, 771b, 772b,\feature2 - .popsection -.endm - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_S390_ALTERNATIVE_ASM_H */ diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 07459553d64f..16de33750d6c 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -116,6 +116,56 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); /* Use this macro if clobbers are needed without inputs. */ #define ASM_NO_INPUT_CLOBBER(clobber...) : clobber +#else /* __ASSEMBLY__ */ + +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ +.macro alt_entry orig_start, orig_end, alt_start, alt_end, feature + .long \orig_start - . + .long \alt_start - . + .word \feature + .byte \orig_end - \orig_start + .org . - ( \orig_end - \orig_start ) & 1 + .org . - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start ) + .org . - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start ) +.endm + +/* + * Define an alternative between two instructions. 
If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. + */ +.macro ALTERNATIVE oldinstr, newinstr, feature + .pushsection .altinstr_replacement,"ax" +770: \newinstr +771: .popsection +772: \oldinstr +773: .pushsection .altinstructions,"a" + alt_entry 772b, 773b, 770b, 771b, \feature + .popsection +.endm + +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. + */ +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 + .pushsection .altinstr_replacement,"ax" +770: \newinstr1 +771: \newinstr2 +772: .popsection +773: \oldinstr +774: .pushsection .altinstructions,"a" + alt_entry 773b, 774b, 770b, 771b,\feature1 + alt_entry 773b, 774b, 771b, 772b,\feature2 + .popsection +.endm + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_ALTERNATIVE_H */ diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index fa58bd2c48c9..866917ff013f 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include -- cgit From ace76fac944b06e46427e0406f315609f278ef91 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Jul 2024 13:50:51 +0200 Subject: s390/alternatives: Move text sync functions Move all text sync functions from alternative.c to processor.c. This way there is only minimal code left in alternative.c left, which is a prerequisite to use the C file within boot code as well. Reviewed-by: Alexander Gordeev Tested-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/alternative.c | 20 -------------------- arch/s390/kernel/processor.c | 20 +++++++++++++++++++- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index e2b6549f8eaf..33debc2a26c9 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -1,8 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include -#include -#include #include #include #include @@ -32,20 +29,3 @@ void __init apply_alternative_instructions(void) { apply_alternatives(__alt_instructions, __alt_instructions_end); } - -static void do_sync_core(void *info) -{ - sync_core(); -} - -void text_poke_sync(void) -{ - on_each_cpu(do_sync_core, NULL, 1); -} - -void text_poke_sync_lock(void) -{ - cpus_read_lock(); - text_poke_sync(); - cpus_read_unlock(); -} diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 65c1464eea4f..5ce9a795a0fe 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -17,7 +17,8 @@ #include #include #include - +#include +#include #include #include #include @@ -79,6 +80,23 @@ void notrace stop_machine_yield(const struct cpumask *cpumask) } } +static void do_sync_core(void *info) +{ + sync_core(); +} + +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +void text_poke_sync_lock(void) +{ + cpus_read_lock(); + text_poke_sync(); + cpus_read_unlock(); +} + /* * cpu_init - initializes state that is per-CPU. 
*/ -- cgit From 030f7951c5b293739301fb616add0f1d3fb46073 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Jul 2024 13:50:52 +0200 Subject: s390/uaccess: Make s390_kernel_write() usable for decompressor To avoid lots of ifdefs in C code make s390_kernel_write() usable for the decompressor: simply use memcpy() for this case since there is no write protection enabled that early. Reviewed-by: Alexander Gordeev Tested-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/uaccess.h | 9 ++++++++- arch/s390/mm/maccess.c | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 9213be0529ee..a81f897a81ce 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -332,7 +332,14 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned lo return __clear_user(to, n); } -void *s390_kernel_write(void *dst, const void *src, size_t size); +void *__s390_kernel_write(void *dst, const void *src, size_t size); + +static inline void *s390_kernel_write(void *dst, const void *src, size_t size) +{ + if (__is_defined(__DECOMPRESSOR)) + return memcpy(dst, src, size); + return __s390_kernel_write(dst, src, size); +} int __noreturn __put_kernel_bad(void); diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 632c3a55feed..28a18c42ba99 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -48,7 +48,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz } /* - * s390_kernel_write - write to kernel memory bypassing DAT + * __s390_kernel_write - write to kernel memory bypassing DAT * @dst: destination address * @src: source address * @size: number of bytes to copy @@ -61,7 +61,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz */ static DEFINE_SPINLOCK(s390_kernel_write_lock); -notrace void *s390_kernel_write(void *dst, const void *src, size_t size) +notrace void *__s390_kernel_write(void *dst, const void *src, size_t size) { void *tmp = dst; unsigned long flags; -- cgit From b3e0c5f734f934dab1cfdef669e3baa165a0cbfe Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Jul 2024 13:50:53 +0200 Subject: s390/alternatives: Rework to allow for callbacks Rework alternatives to allow for callbacks. With this every alternative entry has additional data encoded: - When (aka context) an alternative is supposed to be applied - The type of an alternative, which allows for type specific handling and callbacks - Extra type specific payload (patch information), which can be passed to callbacks in order to decide if an alternative should be applied or not With this only the "late" context is implemented, which means there is no change to the previous behaviour. All code is just converted to the more generic new infrastructure. 
Reviewed-by: Alexander Gordeev Tested-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/alternative.h | 66 ++++++++++++++++++++++++++++++++++--- arch/s390/include/asm/processor.h | 2 +- arch/s390/include/asm/spinlock.h | 2 +- arch/s390/kernel/alternative.c | 30 +++++++++-------- arch/s390/kernel/entry.S | 22 ++++++------- arch/s390/lib/spinlock.c | 4 +-- 6 files changed, 92 insertions(+), 34 deletions(-) diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 16de33750d6c..5b931070be16 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -2,6 +2,44 @@ #ifndef _ASM_S390_ALTERNATIVE_H #define _ASM_S390_ALTERNATIVE_H +/* + * Each alternative comes with a 32 bit feature field: + * union { + * u32 feature; + * struct { + * u32 ctx : 4; + * u32 type : 8; + * u32 data : 20; + * }; + * } + * + * @ctx is a bitfield, where only one bit must be set. Each bit defines + * in which context an alternative is supposed to be applied to the + * kernel image: + * + * - from the decompressor before the kernel itself is executed + * - from early kernel code from within the kernel + * + * @type is a number which defines the type and with that the type + * specific alternative patching. + * + * @data is additional type specific information which defines if an + * alternative should be applied. + */ + +#define ALT_CTX_LATE 1 +#define ALT_CTX_ALL ALT_CTX_LATE + +#define ALT_TYPE_FACILITY 0 + +#define ALT_DATA_SHIFT 0 +#define ALT_TYPE_SHIFT 20 +#define ALT_CTX_SHIFT 28 + +#define ALT_FACILITY(facility) (ALT_CTX_LATE << ALT_CTX_SHIFT | \ + ALT_TYPE_FACILITY << ALT_TYPE_SHIFT | \ + (facility) << ALT_DATA_SHIFT) + #ifndef __ASSEMBLY__ #include @@ -11,12 +49,30 @@ struct alt_instr { s32 instr_offset; /* original instruction */ s32 repl_offset; /* offset to replacement instruction */ - u16 feature; /* feature required for replacement */ + union { + u32 feature; /* feature required for replacement */ + struct { + u32 ctx : 4; /* context */ + u32 type : 8; /* type of alternative */ + u32 data : 20; /* patching information */ + }; + }; u8 instrlen; /* length of original instruction */ } __packed; -void apply_alternative_instructions(void); -void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; + +void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsigned int ctx); + +static inline void apply_alternative_instructions(void) +{ + __apply_alternatives(__alt_instructions, __alt_instructions_end, ALT_CTX_LATE); +} + +static inline void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +{ + __apply_alternatives(start, end, ALT_CTX_ALL); +} /* * +---------------------------------+ @@ -51,7 +107,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); #define ALTINSTR_ENTRY(feature, num) \ "\t.long 661b - .\n" /* old instruction */ \ "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \ - "\t.word " __stringify(feature) "\n" /* feature */ \ + "\t.long " __stringify(feature) "\n" /* feature */ \ "\t.byte " oldinstr_len "\n" /* instruction len */ \ "\t.org . - (" oldinstr_len ") & 1\n" \ "\t.org . - (" oldinstr_len ") + (" altinstr_len(num) ")\n" \ @@ -127,7 +183,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end); .macro alt_entry orig_start, orig_end, alt_start, alt_end, feature .long \orig_start - . .long \alt_start - . 
- .word \feature + .long \feature .byte \orig_end - \orig_start .org . - ( \orig_end - \orig_start ) & 1 .org . - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start ) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 5debb12614ad..8a52554f49f0 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -419,7 +419,7 @@ static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) static __always_inline void bpon(void) { - asm volatile(ALTERNATIVE("nop", ".insn rrf,0xb2e80000,0,0,13,0", 82)); + asm volatile(ALTERNATIVE("nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82))); } #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 3e43c90ff135..77d5e804af93 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -79,7 +79,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) typecheck(int, lp->lock); kcsan_release(); asm_inline volatile( - ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", 49) /* NIAI 7 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", ALT_FACILITY(49)) /* NIAI 7 */ " sth %1,%0\n" : "=R" (((unsigned short *) &lp->lock)[1]) : "d" (0) : "cc", "memory"); diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index 33debc2a26c9..ecabdff89bce 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -1,31 +1,33 @@ // SPDX-License-Identifier: GPL-2.0 -#include + +#include #include #include -#include -void __init_or_module apply_alternatives(struct alt_instr *start, - struct alt_instr *end) +void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsigned int ctx) { - struct alt_instr *a; u8 *instr, *replacement; + struct alt_instr *a; + bool replace; /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. 
*/ for (a = start; a < end; a++) { + if (!(a->ctx & ctx)) + continue; + switch (a->type) { + case ALT_TYPE_FACILITY: + replace = __test_facility(a->data, alt_stfle_fac_list); + break; + default: + replace = false; + } + if (!replace) + continue; instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - - if (!__test_facility(a->feature, alt_stfle_fac_list)) - continue; s390_kernel_write(instr, replacement, a->instrlen); } } - -extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -void __init apply_alternative_instructions(void) -{ - apply_alternatives(__alt_instructions, __alt_instructions_end); -} diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 866917ff013f..90027a57a524 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -32,19 +32,19 @@ _LPP_OFFSET = __LC_LPP .macro STBEAR address - ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193 + ALTERNATIVE "nop", ".insn s,0xb2010000,\address", ALT_FACILITY(193) .endm .macro LBEAR address - ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193 + ALTERNATIVE "nop", ".insn s,0xb2000000,\address", ALT_FACILITY(193) .endm .macro LPSWEY address,lpswe - ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193 + ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY(193) .endm .macro MBEAR reg - ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 + ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), ALT_FACILITY(193) .endm .macro CHECK_STACK savearea @@ -100,22 +100,22 @@ _LPP_OFFSET = __LC_LPP .endm .macro BPOFF - ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", ALT_FACILITY(82) .endm .macro BPON - ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82) .endm .macro BPENTER tif_ptr,tif_mask ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \ - "j .+12; nop; nop", 82 + "j .+12; nop; nop", ALT_FACILITY(82) .endm .macro BPEXIT tif_ptr,tif_mask TSTMSK \tif_ptr,\tif_mask ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \ - "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82 + "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82) .endm #if IS_ENABLED(CONFIG_KVM) @@ -169,7 +169,7 @@ SYM_FUNC_START(__switch_to_asm) aghi %r3,__TASK_pid mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task - ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 + ALTERNATIVE "nop", "lpp _LPP_OFFSET", ALT_FACILITY(40) BR_EX %r14 SYM_FUNC_END(__switch_to_asm) @@ -515,7 +515,7 @@ SYM_CODE_START(mcck_int_handler) jno 0f BPON stpt __LC_EXIT_TIMER -0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 +0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA), ALT_FACILITY(193) LBEAR 0(%r12) lmg %r11,%r15,__PT_R11(%r11) LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE @@ -551,7 +551,7 @@ SYM_CODE_START(mcck_int_handler) SYM_CODE_END(mcck_int_handler) SYM_CODE_START(restart_int_handler) - ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 + ALTERNATIVE "nop", "lpp _LPP_OFFSET", ALT_FACILITY(40) stg %r15,__LC_SAVE_AREA_RESTART TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4 jz 0f diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index 0c9a73a18826..9f86ad8fa8b4 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c 
@@ -75,7 +75,7 @@ static inline int arch_load_niai4(int *lock) int owner; asm_inline volatile( - ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", 49) /* NIAI 4 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", ALT_FACILITY(49)) /* NIAI 4 */ " l %0,%1\n" : "=d" (owner) : "Q" (*lock) : "memory"); return owner; @@ -86,7 +86,7 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new) int expected = old; asm_inline volatile( - ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", 49) /* NIAI 8 */ + ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", ALT_FACILITY(49)) /* NIAI 8 */ " cs %0,%3,%1\n" : "=d" (old), "=Q" (*lock) : "0" (old), "d" (new), "Q" (*lock) -- cgit From 7f9d85998f6c5b989796470fd1ac066232c60723 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 16 Jul 2024 13:50:54 +0200 Subject: s390/alternatives: Allow early alternative patching in decompressor Add the required code to patch alternatives early in the decompressor. This is required for the upcoming lowcore relocation changes, where alternatives for facility 193 need to get patched before lowcore alternatives. Reviewed-by: Alexander Gordeev Co-developed-by: Heiko Carstens Signed-off-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/boot/Makefile | 2 +- arch/s390/boot/alternative.c | 3 +++ arch/s390/boot/boot.h | 2 ++ arch/s390/boot/startup.c | 5 +++++ arch/s390/include/asm/alternative.h | 12 +++++++++--- arch/s390/kernel/alternative.c | 5 +++++ arch/s390/kernel/vmlinux.lds.S | 2 ++ 7 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 arch/s390/boot/alternative.c diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index e7658997452b..5d8cb7e3b096 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -39,7 +39,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o -obj-y += version.o pgm_check_info.o ctype.o ipl_data.o relocs.o +obj-y += version.o pgm_check_info.o ctype.o ipl_data.o relocs.o alternative.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o diff --git a/arch/s390/boot/alternative.c b/arch/s390/boot/alternative.c new file mode 100644 index 000000000000..abc08d2c873d --- /dev/null +++ b/arch/s390/boot/alternative.c @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "../kernel/alternative.c" diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 18027fdc92b0..ed2f0ec24f0d 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -30,6 +30,8 @@ struct vmlinux_info { unsigned long init_mm_off; unsigned long swapper_pg_dir_off; unsigned long invalid_pg_dir_off; + unsigned long alt_instructions; + unsigned long alt_instructions_end; #ifdef CONFIG_KASAN unsigned long kasan_early_shadow_page_off; unsigned long kasan_early_shadow_pte_off; diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index cc8753c0c121..cca2f1bad33c 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -376,6 +376,8 @@ static void kaslr_adjust_vmlinux_info(long offset) vmlinux.init_mm_off += offset; vmlinux.swapper_pg_dir_off += offset; vmlinux.invalid_pg_dir_off += offset; + vmlinux.alt_instructions += offset; + vmlinux.alt_instructions_end += offset; #ifdef CONFIG_KASAN 
vmlinux.kasan_early_shadow_page_off += offset; vmlinux.kasan_early_shadow_pte_off += offset; @@ -507,6 +509,9 @@ void startup_kernel(void) kaslr_adjust_got(__kaslr_offset); setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit); copy_bootdata(); + __apply_alternatives((struct alt_instr *)_vmlinux_info.alt_instructions, + (struct alt_instr *)_vmlinux_info.alt_instructions_end, + ALT_CTX_EARLY); /* * Save KASLR offset for early dumps, before vmcore_info is set. diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 5b931070be16..32c208332e57 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -27,15 +27,21 @@ * alternative should be applied. */ -#define ALT_CTX_LATE 1 -#define ALT_CTX_ALL ALT_CTX_LATE +#define ALT_CTX_EARLY 1 +#define ALT_CTX_LATE 2 +#define ALT_CTX_ALL (ALT_CTX_EARLY | ALT_CTX_LATE) -#define ALT_TYPE_FACILITY 0 +#define ALT_TYPE_FACILITY_EARLY 0 +#define ALT_TYPE_FACILITY 1 #define ALT_DATA_SHIFT 0 #define ALT_TYPE_SHIFT 20 #define ALT_CTX_SHIFT 28 +#define ALT_FACILITY_EARLY(facility) (ALT_CTX_EARLY << ALT_CTX_SHIFT | \ + ALT_TYPE_FACILITY_EARLY << ALT_TYPE_SHIFT | \ + (facility) << ALT_DATA_SHIFT) + #define ALT_FACILITY(facility) (ALT_CTX_LATE << ALT_CTX_SHIFT | \ ALT_TYPE_FACILITY << ALT_TYPE_SHIFT | \ (facility) << ALT_DATA_SHIFT) diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index ecabdff89bce..de89c9e8b1a3 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -18,9 +18,14 @@ void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsign if (!(a->ctx & ctx)) continue; switch (a->type) { + case ALT_TYPE_FACILITY_EARLY: + replace = test_facility(a->data); + break; +#ifndef __DECOMPRESSOR case ALT_TYPE_FACILITY: replace = __test_facility(a->data, alt_stfle_fac_list); break; +#endif default: replace = false; } diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 5128ccee9c67..975c654cf5a5 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -222,6 +222,8 @@ SECTIONS QUAD(init_mm) QUAD(swapper_pg_dir) QUAD(invalid_pg_dir) + QUAD(__alt_instructions) + QUAD(__alt_instructions_end) #ifdef CONFIG_KASAN QUAD(kasan_early_shadow_page) QUAD(kasan_early_shadow_pte) -- cgit From 47837a5c74f432ad992239cfa5966543f466d4df Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Jul 2024 13:50:55 +0200 Subject: s390/nospec: Push down alternative handling The nospec implementation is deeply integrated into the alternatives code: only for nospec an alternative facility list is implemented and used by the alternative code, while it is modified by nospec specific needs. Push down the nospec alternative handling into the nospec by introducing a new alternative type and a specific nospec callback to decide if alternatives should be applied. Also introduce a new global nobp variable which together with facility 82 can be used to decide if nobp is enabled or not. 
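As an illustration of the new decision logic (a user-space sketch, not part of the patch, with the kernel's test_facility() replaced by a stub), nobp-style alternatives are applied only when both the nobp knob and facility 82 agree:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stub standing in for the kernel's test_facility(); assume facility 82
     * is installed on this machine for the purpose of the example. */
    static bool test_facility(int nr)
    {
            return nr == 82;
    }

    /* Mirrors the new global: defaults to IS_ENABLED(CONFIG_KERNEL_NOBP)
     * and is overridden by the nobp=/nospec= early parameters. */
    static int nobp = 1;

    static bool nobp_enabled(void)
    {
            return nobp && test_facility(82);
    }

    int main(void)
    {
            /* This is the question the ALT_TYPE_SPEC callback answers. */
            printf("apply ALT_SPEC(82) alternatives: %s\n",
                   nobp_enabled() ? "yes" : "no");
            return 0;
    }

With nobp set to 0 (for example via the nospec= early parameter) or without facility 82 installed, the branch prediction alternatives stay unpatched.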
Acked-by: Alexander Gordeev Tested-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/alternative.h | 5 +++++ arch/s390/include/asm/nospec-branch.h | 9 +++++++++ arch/s390/include/asm/processor.h | 2 +- arch/s390/kernel/alternative.c | 4 ++++ arch/s390/kernel/early.c | 2 -- arch/s390/kernel/entry.S | 8 ++++---- arch/s390/kernel/nospec-branch.c | 16 +++++++++------- arch/s390/kernel/nospec-sysfs.c | 2 +- 8 files changed, 33 insertions(+), 15 deletions(-) diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 32c208332e57..5f56a2f3aba6 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -33,6 +33,7 @@ #define ALT_TYPE_FACILITY_EARLY 0 #define ALT_TYPE_FACILITY 1 +#define ALT_TYPE_SPEC 2 #define ALT_DATA_SHIFT 0 #define ALT_TYPE_SHIFT 20 @@ -46,6 +47,10 @@ ALT_TYPE_FACILITY << ALT_TYPE_SHIFT | \ (facility) << ALT_DATA_SHIFT) +#define ALT_SPEC(facility) (ALT_CTX_LATE << ALT_CTX_SHIFT | \ + ALT_TYPE_SPEC << ALT_TYPE_SHIFT | \ + (facility) << ALT_DATA_SHIFT) + #ifndef __ASSEMBLY__ #include diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h index b9c1f3cae842..192835a3e24d 100644 --- a/arch/s390/include/asm/nospec-branch.h +++ b/arch/s390/include/asm/nospec-branch.h @@ -5,8 +5,17 @@ #ifndef __ASSEMBLY__ #include +#include extern int nospec_disable; +extern int nobp; + +static inline bool nobp_enabled(void) +{ + if (__is_defined(__DECOMPRESSOR)) + return false; + return nobp && test_facility(82); +} void nospec_init_branches(void); void nospec_auto_detect(void); diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 8a52554f49f0..3063488014eb 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -419,7 +419,7 @@ static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) static __always_inline void bpon(void) { - asm volatile(ALTERNATIVE("nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82))); + asm volatile(ALTERNATIVE("nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82))); } #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index de89c9e8b1a3..05545669552f 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include @@ -26,6 +27,9 @@ void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsign replace = __test_facility(a->data, alt_stfle_fac_list); break; #endif + case ALT_TYPE_SPEC: + replace = nobp_enabled(); + break; default: replace = false; } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 467ed4dba817..d142598e0532 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -193,8 +193,6 @@ static noinline __init void setup_lowcore_early(void) static noinline __init void setup_facility_list(void) { memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list)); - if (!IS_ENABLED(CONFIG_KERNEL_NOBP)) - __clear_facility(82, alt_stfle_fac_list); } static __init void detect_diag9c(void) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 90027a57a524..8caf893d1b59 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -100,22 +100,22 @@ _LPP_OFFSET = __LC_LPP .endm .macro BPOFF - ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", ALT_FACILITY(82) + ALTERNATIVE "nop", ".insn 
rrf,0xb2e80000,0,0,12,0", ALT_SPEC(82) .endm .macro BPON - ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82) + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82) .endm .macro BPENTER tif_ptr,tif_mask ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \ - "j .+12; nop; nop", ALT_FACILITY(82) + "j .+12; nop; nop", ALT_SPEC(82) .endm .macro BPEXIT tif_ptr,tif_mask TSTMSK \tif_ptr,\tif_mask ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \ - "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82) + "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82) .endm #if IS_ENABLED(CONFIG_KVM) diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index 9b8c24ebb008..e11ec15960a1 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -4,6 +4,8 @@ #include #include +int nobp = IS_ENABLED(CONFIG_KERNEL_NOBP); + static int __init nobp_setup_early(char *str) { bool enabled; @@ -17,11 +19,11 @@ static int __init nobp_setup_early(char *str) * The user explicitly requested nobp=1, enable it and * disable the expoline support. */ - __set_facility(82, alt_stfle_fac_list); + nobp = 1; if (IS_ENABLED(CONFIG_EXPOLINE)) nospec_disable = 1; } else { - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } return 0; } @@ -29,7 +31,7 @@ early_param("nobp", nobp_setup_early); static int __init nospec_setup_early(char *str) { - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; return 0; } early_param("nospec", nospec_setup_early); @@ -40,7 +42,7 @@ static int __init nospec_report(void) pr_info("Spectre V2 mitigation: etokens\n"); if (nospec_uses_trampoline()) pr_info("Spectre V2 mitigation: execute trampolines\n"); - if (__test_facility(82, alt_stfle_fac_list)) + if (nobp_enabled()) pr_info("Spectre V2 mitigation: limited branch prediction\n"); return 0; } @@ -66,14 +68,14 @@ void __init nospec_auto_detect(void) */ if (__is_defined(CC_USING_EXPOLINE)) nospec_disable = 1; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } else if (__is_defined(CC_USING_EXPOLINE)) { /* * The kernel has been compiled with expolines. * Keep expolines enabled and disable nobp. */ nospec_disable = 0; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } /* * If the kernel has not been compiled with expolines the @@ -86,7 +88,7 @@ static int __init spectre_v2_setup_early(char *str) { if (str && !strncmp(str, "on", 2)) { nospec_disable = 0; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } if (str && !strncmp(str, "off", 3)) nospec_disable = 1; diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c index 52d4353188ad..a95188818637 100644 --- a/arch/s390/kernel/nospec-sysfs.c +++ b/arch/s390/kernel/nospec-sysfs.c @@ -17,7 +17,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, return sprintf(buf, "Mitigation: etokens\n"); if (nospec_uses_trampoline()) return sprintf(buf, "Mitigation: execute trampolines\n"); - if (__test_facility(82, alt_stfle_fac_list)) + if (nobp_enabled()) return sprintf(buf, "Mitigation: limited branch prediction\n"); return sprintf(buf, "Vulnerable\n"); } -- cgit From beb8cee06f9b8726616ba87783116cb8fb889c7a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Jul 2024 13:50:56 +0200 Subject: s390/alternatives: Remove alternative facility list The alternative and the normal facility list are always identical. Remove the alternative facility list, which allows to simplify the alternatives code. 
Acked-by: Alexander Gordeev Tested-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/alternative.h | 9 ++++----- arch/s390/include/asm/facility.h | 1 - arch/s390/kernel/alternative.c | 7 +------ arch/s390/kernel/early.c | 6 ------ arch/s390/kernel/setup.c | 1 - 5 files changed, 5 insertions(+), 19 deletions(-) diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 5f56a2f3aba6..3ddd6dbe5635 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -31,16 +31,15 @@ #define ALT_CTX_LATE 2 #define ALT_CTX_ALL (ALT_CTX_EARLY | ALT_CTX_LATE) -#define ALT_TYPE_FACILITY_EARLY 0 -#define ALT_TYPE_FACILITY 1 -#define ALT_TYPE_SPEC 2 +#define ALT_TYPE_FACILITY 0 +#define ALT_TYPE_SPEC 1 #define ALT_DATA_SHIFT 0 #define ALT_TYPE_SHIFT 20 #define ALT_CTX_SHIFT 28 -#define ALT_FACILITY_EARLY(facility) (ALT_CTX_EARLY << ALT_CTX_SHIFT | \ - ALT_TYPE_FACILITY_EARLY << ALT_TYPE_SHIFT | \ +#define ALT_FACILITY_EARLY(facility) (ALT_CTX_EARLY << ALT_CTX_SHIFT | \ + ALT_TYPE_FACILITY << ALT_TYPE_SHIFT | \ (facility) << ALT_DATA_SHIFT) #define ALT_FACILITY(facility) (ALT_CTX_LATE << ALT_CTX_SHIFT | \ diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h index d46cc725f024..b7d234838a36 100644 --- a/arch/s390/include/asm/facility.h +++ b/arch/s390/include/asm/facility.h @@ -20,7 +20,6 @@ #define MAX_FACILITY_BIT (sizeof(stfle_fac_list) * 8) extern u64 stfle_fac_list[16]; -extern u64 alt_stfle_fac_list[16]; static inline void __set_facility(unsigned long nr, void *facilities) { diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index 05545669552f..eae254466192 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -19,14 +19,9 @@ void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsign if (!(a->ctx & ctx)) continue; switch (a->type) { - case ALT_TYPE_FACILITY_EARLY: - replace = test_facility(a->data); - break; -#ifndef __DECOMPRESSOR case ALT_TYPE_FACILITY: - replace = __test_facility(a->data, alt_stfle_fac_list); + replace = test_facility(a->data); break; -#endif case ALT_TYPE_SPEC: replace = nobp_enabled(); break; diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index d142598e0532..3ce77cee272d 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -190,11 +190,6 @@ static noinline __init void setup_lowcore_early(void) get_lowcore()->preempt_count = INIT_PREEMPT_COUNT; } -static noinline __init void setup_facility_list(void) -{ - memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list)); -} - static __init void detect_diag9c(void) { unsigned int cpu_address; @@ -289,7 +284,6 @@ void __init startup_init(void) lockdep_off(); sort_amode31_extable(); setup_lowcore_early(); - setup_facility_list(); detect_machine_type(); setup_arch_string(); setup_boot_command_line(); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 178daf4e3563..700003e1bc76 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -155,7 +155,6 @@ unsigned int __bootdata_preserved(zlib_dfltcc_support); EXPORT_SYMBOL(zlib_dfltcc_support); u64 __bootdata_preserved(stfle_fac_list[16]); EXPORT_SYMBOL(stfle_fac_list); -u64 alt_stfle_fac_list[16]; struct oldmem_data __bootdata_preserved(oldmem_data); unsigned long VMALLOC_START; -- cgit From 213400c4afd5c89fd8bd17d06addf145f6c8f0d5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Jul 
2024 15:41:11 +0200 Subject: s390/nmi: Simplify ptregs setup The low level machine check handler code fills the ptregs structure partially with the register contents present at machine check handler entry and partially with contents from the machine check save area. In case of a machine check the contents of all general purpose registers are saved by the CPU to the machine check save area. Therefore simplify the code and fill the ptregs structure by only using the machine check save area as source. Signed-off-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 8caf893d1b59..a72d6494701d 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -491,8 +491,8 @@ SYM_CODE_START(mcck_int_handler) stctg %c1,%c1,__PT_CR1(%r11) lctlg %c1,%c1,__LC_KERNEL_ASCE xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - lghi %r14,__LC_GPREGS_SAVE_AREA+64 - stmg %r0,%r7,__PT_R0(%r11) + lghi %r14,__LC_GPREGS_SAVE_AREA + mvc __PT_R0(128,%r11),0(%r14) # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 @@ -502,7 +502,6 @@ SYM_CODE_START(mcck_int_handler) xgr %r6,%r6 xgr %r7,%r7 xgr %r10,%r10 - mvc __PT_R8(64,%r11),0(%r14) stmg %r8,%r9,__PT_PSW(%r11) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) -- cgit From fc8eac33ad93420b4e51cdd811e12d3fc9b531a5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Jul 2024 15:41:12 +0200 Subject: s390/entry: Move SIE indicator flag to thread info CIF_SIE indicates if a thread is running in SIE context. This is the state of a thread and not the CPU. Therefore move this indicator to thread info. 
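For illustration only (names and layout simplified, not taken from the kernel headers): with the indicator in thread_info, the question "is this task running in SIE?" is answered from the task itself instead of from a per-CPU flags word:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified model of the relevant part of struct thread_info after this
     * patch; in the kernel the structure is embedded in struct task_struct
     * (see the asm-offsets.c hunk). */
    struct thread_info {
            unsigned long flags;
            unsigned char sie;      /* running in SIE context */
    };

    struct task {
            struct thread_info thread_info;
    };

    static bool task_in_sie(const struct task *tsk)
    {
            return tsk->thread_info.sie;
    }

    int main(void)
    {
            struct task vcpu_task = { .thread_info = { .flags = 0, .sie = 1 } };

            printf("task running in SIE: %d\n", task_in_sie(&vcpu_task));
            return 0;
    }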
Signed-off-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/processor.h | 2 -- arch/s390/include/asm/thread_info.h | 1 + arch/s390/kernel/asm-offsets.c | 1 + arch/s390/kernel/entry.S | 20 ++++++++++++-------- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 3063488014eb..5ecd442535b9 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -14,13 +14,11 @@ #include -#define CIF_SIE 0 /* CPU needs SIE exit cleanup */ #define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */ #define CIF_ENABLED_WAIT 5 /* in enabled wait state */ #define CIF_MCCK_GUEST 6 /* machine check happening in guest */ #define CIF_DEDICATED_CPU 7 /* this CPU is dedicated */ -#define _CIF_SIE BIT(CIF_SIE) #define _CIF_NOHZ_DELAY BIT(CIF_NOHZ_DELAY) #define _CIF_ENABLED_WAIT BIT(CIF_ENABLED_WAIT) #define _CIF_MCCK_GUEST BIT(CIF_MCCK_GUEST) diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index d02a709717b8..00ac01874a12 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -40,6 +40,7 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ unsigned int cpu; /* current CPU */ + unsigned char sie; /* running in SIE context */ }; /* diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 58fc6b93b475..ffa0dd2dbaac 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -28,6 +28,7 @@ int main(void) BLANK(); /* thread info offsets */ OFFSET(__TI_flags, task_struct, thread_info.flags); + OFFSET(__TI_sie, task_struct, thread_info.sie); BLANK(); /* pt_regs offsets */ OFFSET(__PT_PSW, pt_regs, psw); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index a72d6494701d..df351622c94c 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -123,7 +123,8 @@ _LPP_OFFSET = __LC_LPP lg %r9,\sie_control # get control block pointer ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce - ni __LC_CPU_FLAGS+7,255-_CIF_SIE + lg %r9,__LC_CURRENT + mvi __TI_sie(%r9),0 larl %r9,sie_exit # skip forward to sie_exit .endm #endif @@ -183,15 +184,15 @@ SYM_FUNC_END(__switch_to_asm) */ SYM_FUNC_START(__sie64a) stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers - lg %r12,__LC_CURRENT + lg %r14,__LC_CURRENT stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical.. 
stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area stg %r5,__SF_SIE_GUEST_ASCE(%r15) # save guest asce xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 - mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags + mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r14) # copy thread flags lmg %r0,%r13,0(%r4) # load guest gprs 0-13 - oi __LC_CPU_FLAGS+7,_CIF_SIE + mvi __TI_sie(%r14),1 lctlg %c1,%c1,__SF_SIE_GUEST_ASCE(%r15) # load primary asce lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now @@ -211,7 +212,8 @@ SYM_FUNC_START(__sie64a) lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce - ni __LC_CPU_FLAGS+7,255-_CIF_SIE + lg %r14,__LC_CURRENT + mvi __TI_sie(%r14),0 # some program checks are suppressing. C code (e.g. do_protection_exception) # will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. @@ -394,7 +396,8 @@ SYM_CODE_START(\name) tmhh %r8,0x0001 # interrupting from user ? jnz 1f #if IS_ENABLED(CONFIG_KVM) - TSTMSK __LC_CPU_FLAGS,_CIF_SIE + lg %r10,__LC_CURRENT + tm __TI_sie(%r10),0xff jz 0f BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST SIEEXIT __SF_SIE_CONTROL(%r15) @@ -469,9 +472,10 @@ SYM_CODE_START(mcck_int_handler) TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID jno .Lmcck_panic #if IS_ENABLED(CONFIG_KVM) - TSTMSK __LC_CPU_FLAGS,_CIF_SIE + lg %r10,__LC_CURRENT + tm __TI_sie(%r10),0xff jz .Lmcck_user - # Need to compare the address instead of a CIF_SIE* flag. + # Need to compare the address instead of __TI_SIE flag. # Otherwise there would be a race between setting the flag # and entering SIE (or leaving and clearing the flag). This # would cause machine checks targeted at the guest to be -- cgit From 13be21f39ab58184fb91844d2242e33805dda40e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Jul 2024 15:41:13 +0200 Subject: s390/atomic_ops: Disable flag outputs constraint for GCC versions below 14.2.0 GCC may die with an ICE if the flag outputs constraint is used in combination with other inline assemblies. This will be fixed with GCC 14.2.0. Therefore disable the use of the constraint for now. Link: https://gcc.gnu.org/git?p=gcc.git;a=commit;h=cd11413ff7c4353a3e336db415304f788d23a393 Signed-off-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/atomic_ops.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h index 2b379d1d9046..742c7919cbcd 100644 --- a/arch/s390/include/asm/atomic_ops.h +++ b/arch/s390/include/asm/atomic_ops.h @@ -188,7 +188,8 @@ static __always_inline long __atomic64_cmpxchg(long *ptr, long old, long new) return old; } -#ifdef __GCC_ASM_FLAG_OUTPUTS__ +/* GCC versions before 14.2.0 may die with an ICE in some configurations. 
*/ +#if defined(__GCC_ASM_FLAG_OUTPUTS__) && !(IS_ENABLED(CONFIG_CC_IS_GCC) && (GCC_VERSION < 140200)) static __always_inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new) { -- cgit From 5ade5be4edf855245955108860d2016af3065a37 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:14 +0200 Subject: s390: Add infrastructure to patch lowcore accesses The s390 architecture defines two special per-CPU data pages called the "prefix area". In s390-linux terminology this is usually called "lowcore". This memory area contains system configuration data like old/new PSW's for system call/interrupt/machine check handlers and lots of other data. It is normally mapped to logical address 0. This area can only be accessed when in supervisor mode. This means that kernel code can dereference NULL pointers, because accesses to address 0 are allowed. Parts of lowcore can be write protected, but read accesses and write accesses outside of the write protected areas are not caught. To remove this limitation for debugging and testing, remap lowcore to another address and define a function get_lowcore() which simply returns the address where lowcore is mapped at. This would normally introduce a pointer dereference (=memory read). As lowcore is used for several very often used variables, add code to patch this function during runtime, so we avoid the memory reads. For C code get_lowcore() has to be used, for assembly code it is the GET_LC macro. When using this macro/function a reference is added to alternative patching. All these locations will be patched to the actual lowcore location when the kernel is booted or a module is loaded. To make debugging/bisecting problems easier, this patch adds all the infrastructure but the lowcore address is still hardwired to 0. This way the code can be converted on a per function basis, and the functionality is enabled in a patch after all the functions have been converted. Note that this requires at least z16 because the old lpsw instruction only allowed a 12 bit displacement. z16 introduced lpswey which allows 20 bits (signed), so the lowcore can effectively be mapped from address 0 - 0x7e000. To use 0x7e000 as address, a 6 byte lgfi instruction would have to be used in the alternative. To save two bytes, llilh can be used, but this only allows to set bits 16-31 of the address. In order to use the llilh instruction, use 0x70000 as alternative lowcore address. This is still large enough to catch NULL pointer dereferences into large arrays. 
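The address arithmetic behind the 0x70000 choice can be checked in isolation. A small userspace sketch (assumptions: the lowcore spans LC_PAGES = 2 pages, llilh can only set bits 16-31, lpsw has a 12-bit and lpswey a signed 20-bit displacement, as described above):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned long alt = 0x70000UL;           /* LOWCORE_ALT_ADDRESS */
            unsigned long llilh_imm = alt >> 16;     /* immediate encoded in llilh */
            unsigned long lc_size = 2 * 4096UL;      /* two pages of lowcore */

            assert(llilh_imm <= 0xffffUL);           /* fits the 16-bit immediate */
            assert((llilh_imm << 16) == alt);        /* llilh reproduces the address exactly */
            assert(alt > 0xfffUL);                   /* unreachable for lpsw's 12-bit displacement */
            assert(alt + lc_size - 1 <= 0x7ffffUL);  /* still within lpswey's 20-bit signed range */
            printf("llilh immediate 0x%lx -> lowcore base 0x%lx\n", llilh_imm, alt);
            return 0;
    }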
Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/boot/boot.h | 2 ++ arch/s390/boot/ipl_parm.c | 1 + arch/s390/boot/startup.c | 1 + arch/s390/boot/vmem.c | 11 ++++++++++- arch/s390/include/asm/abs_lowcore.h | 8 ++++++++ arch/s390/include/asm/alternative.h | 4 ++++ arch/s390/include/asm/lowcore.h | 23 ++++++++++++++++++++++- arch/s390/kernel/abs_lowcore.c | 1 + arch/s390/kernel/alternative.c | 4 ++++ arch/s390/kernel/alternative.h | 0 arch/s390/kernel/early.c | 1 + arch/s390/kernel/setup.c | 3 +++ 12 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 arch/s390/kernel/alternative.h diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index ed2f0ec24f0d..83e2ce050b6c 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -91,8 +91,10 @@ extern char _end[], _decompressor_end[]; extern unsigned char _compressed_start[]; extern unsigned char _compressed_end[]; extern struct vmlinux_info _vmlinux_info; + #define vmlinux _vmlinux_info +#define __lowcore_pa(x) ((unsigned long)(x) % sizeof(struct lowcore)) #define __abs_lowcore_pa(x) (((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore)) #define __kernel_va(x) ((void *)((unsigned long)(x) - __kaslr_offset_phys + __kaslr_offset)) #define __kernel_pa(x) ((unsigned long)(x) - __kaslr_offset + __kaslr_offset_phys) diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index a21f301acd29..337c14931ccb 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index cca2f1bad33c..ce232552bc1c 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -30,6 +30,7 @@ unsigned long __bootdata_preserved(vmemmap_size); unsigned long __bootdata_preserved(MODULES_VADDR); unsigned long __bootdata_preserved(MODULES_END); unsigned long __bootdata_preserved(max_mappable); +int __bootdata_preserved(relocate_lowcore); u64 __bootdata_preserved(stfle_fac_list[16]); struct oldmem_data __bootdata_preserved(oldmem_data); diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index a255ca189aaa..2847cc059ab7 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -26,6 +26,7 @@ atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); enum populate_mode { POPULATE_NONE, POPULATE_DIRECT, + POPULATE_LOWCORE, POPULATE_ABS_LOWCORE, POPULATE_IDENTITY, POPULATE_KERNEL, @@ -242,6 +243,8 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m return -1; case POPULATE_DIRECT: return addr; + case POPULATE_LOWCORE: + return __lowcore_pa(addr); case POPULATE_ABS_LOWCORE: return __abs_lowcore_pa(addr); case POPULATE_KERNEL: @@ -418,6 +421,7 @@ static void pgtable_populate(unsigned long addr, unsigned long end, enum populat void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit) { + unsigned long lowcore_address = 0; unsigned long start, end; unsigned long asce_type; unsigned long asce_bits; @@ -455,12 +459,17 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l __arch_set_page_dat((void *)swapper_pg_dir, 1UL << CRST_ALLOC_ORDER); __arch_set_page_dat((void *)invalid_pg_dir, 1UL << CRST_ALLOC_ORDER); + if (relocate_lowcore) + lowcore_address = LOWCORE_ALT_ADDRESS; + /* * To allow prefixing the lowcore must be mapped with 4KB pages. 
* To prevent creation of a large page at address 0 first map * the lowcore and create the identity mapping only afterwards. */ - pgtable_populate(0, sizeof(struct lowcore), POPULATE_DIRECT); + pgtable_populate(lowcore_address, + lowcore_address + sizeof(struct lowcore), + POPULATE_LOWCORE); for_each_physmem_usable_range(i, &start, &end) { pgtable_populate((unsigned long)__identity_va(start), (unsigned long)__identity_va(end), diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h index 6f264b79e377..d20df8c923fc 100644 --- a/arch/s390/include/asm/abs_lowcore.h +++ b/arch/s390/include/asm/abs_lowcore.h @@ -2,6 +2,7 @@ #ifndef _ASM_S390_ABS_LOWCORE_H #define _ASM_S390_ABS_LOWCORE_H +#include #include #define ABS_LOWCORE_MAP_SIZE (NR_CPUS * sizeof(struct lowcore)) @@ -24,4 +25,11 @@ static inline void put_abs_lowcore(struct lowcore *lc) put_cpu(); } +extern int __bootdata_preserved(relocate_lowcore); + +static inline int have_relocated_lowcore(void) +{ + return relocate_lowcore; +} + #endif /* _ASM_S390_ABS_LOWCORE_H */ diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 3ddd6dbe5635..de980c938a3e 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -33,6 +33,7 @@ #define ALT_TYPE_FACILITY 0 #define ALT_TYPE_SPEC 1 +#define ALT_TYPE_LOWCORE 2 #define ALT_DATA_SHIFT 0 #define ALT_TYPE_SHIFT 20 @@ -50,6 +51,9 @@ ALT_TYPE_SPEC << ALT_TYPE_SHIFT | \ (facility) << ALT_DATA_SHIFT) +#define ALT_LOWCORE (ALT_CTX_EARLY << ALT_CTX_SHIFT | \ + ALT_TYPE_LOWCORE << ALT_TYPE_SHIFT) + #ifndef __ASSEMBLY__ #include diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index bce3a69ab2a3..52c90b65a2b8 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -14,10 +14,15 @@ #include #include #include +#include #define LC_ORDER 1 #define LC_PAGES 2 +#define LOWCORE_ALT_ADDRESS _AC(0x70000, UL) + +#ifndef __ASSEMBLY__ + struct pgm_tdb { u64 data[32]; }; @@ -214,7 +219,14 @@ struct lowcore { static __always_inline struct lowcore *get_lowcore(void) { - return NULL; + struct lowcore *lc; + + if (__is_defined(__DECOMPRESSOR)) + return NULL; + asm(ALTERNATIVE("llilh %[lc],0", "llilh %[lc],%[alt]", ALT_LOWCORE) + : [lc] "=d" (lc) + : [alt] "i" (LOWCORE_ALT_ADDRESS >> 16)); + return lc; } extern struct lowcore *lowcore_ptr[]; @@ -224,4 +236,13 @@ static inline void set_prefix(__u32 address) asm volatile("spx %0" : : "Q" (address) : "memory"); } +#else /* __ASSEMBLY__ */ + +.macro GET_LC reg + ALTERNATIVE "llilh \reg,0", \ + __stringify(llilh \reg, LOWCORE_ALT_ADDRESS >> 16), \ + ALT_LOWCORE +.endm + +#endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_LOWCORE_H */ diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c index f9efc54ec4b7..09cd24cbe74e 100644 --- a/arch/s390/kernel/abs_lowcore.c +++ b/arch/s390/kernel/abs_lowcore.c @@ -4,6 +4,7 @@ #include unsigned long __bootdata_preserved(__abs_lowcore); +int __bootdata_preserved(relocate_lowcore); int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc) { diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index eae254466192..8d5d0de35de0 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -25,6 +26,9 @@ void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsign case ALT_TYPE_SPEC: replace = nobp_enabled(); break; + case ALT_TYPE_LOWCORE: + 
replace = have_relocated_lowcore(); + break; default: replace = false; } diff --git a/arch/s390/kernel/alternative.h b/arch/s390/kernel/alternative.h new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 3ce77cee272d..14d324865e33 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -48,6 +48,7 @@ decompressor_handled_param(dfltcc); decompressor_handled_param(facilities); decompressor_handled_param(nokaslr); decompressor_handled_param(cmma); +decompressor_handled_param(relocate_lowcore); #if IS_ENABLED(CONFIG_KVM) decompressor_handled_param(prot_virt); #endif diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 700003e1bc76..4ec99f73fa27 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -889,6 +889,9 @@ void __init setup_arch(char **cmdline_p) else pr_info("Linux is running as a guest in 64-bit mode\n"); + if (have_relocated_lowcore()) + pr_info("Lowcore relocated to 0x%px\n", get_lowcore()); + log_component_list(); /* Have one command line that is parsed and saved in /proc/cmdline */ -- cgit From 39e8c5d6a4ce6512af5178f70c0c5d735141fc10 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:15 +0200 Subject: s390/head64: Make startup code ready for lowcore relocation In preparation of having lowcore at different address than zero, add the base register to all lowcore accesses in startup_continue(). Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/head64.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 45413b04efc5..396034b2fe67 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -18,14 +19,15 @@ __HEAD SYM_CODE_START(startup_continue) larl %r1,tod_clock_base - mvc 0(16,%r1),__LC_BOOT_CLOCK + GET_LC %r2 + mvc 0(16,%r1),__LC_BOOT_CLOCK(%r2) # # Setup stack # larl %r14,init_task - stg %r14,__LC_CURRENT + stg %r14,__LC_CURRENT(%r2) larl %r15,init_thread_union+STACK_INIT_OFFSET - stg %r15,__LC_KERNEL_STACK + stg %r15,__LC_KERNEL_STACK(%r2) brasl %r14,sclp_early_adjust_va # allow sclp_early_printk brasl %r14,startup_init # s390 specific early init brasl %r14,start_kernel # common init code -- cgit From 12184a46767b40c1c9b022cd96a9b4019ebd368f Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:16 +0200 Subject: s390/entry: Make __sie64a() ready for lowcore relocation In preparation of having lowcore at different address than zero, add the base register to all lowcore accesses in __sie64a(). Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index df351622c94c..618b8b774932 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -28,6 +28,7 @@ #include #include #include +#include _LPP_OFFSET = __LC_LPP @@ -184,7 +185,8 @@ SYM_FUNC_END(__switch_to_asm) */ SYM_FUNC_START(__sie64a) stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers - lg %r14,__LC_CURRENT + GET_LC %r13 + lg %r14,__LC_CURRENT(%r13) stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical.. 
stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area @@ -211,8 +213,9 @@ SYM_FUNC_START(__sie64a) .Lsie_skip: lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce - lg %r14,__LC_CURRENT + GET_LC %r14 + lctlg %c1,%c1,__LC_KERNEL_ASCE(%r14) # load primary asce + lg %r14,__LC_CURRENT(%r14) mvi __TI_sie(%r14),0 # some program checks are suppressing. C code (e.g. do_protection_exception) # will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There -- cgit From ca2f0a26c498c42fcccdf09527e8755481801eea Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:17 +0200 Subject: s390/entry: Add base register to MBEAR macro In preparation of having lowcore at different address than zero, add the base register to MBEAR. No functional change, because %r0 is passed to the macro. Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 618b8b774932..0d624045f2a6 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -44,8 +44,9 @@ _LPP_OFFSET = __LC_LPP ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY(193) .endm - .macro MBEAR reg - ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), ALT_FACILITY(193) + .macro MBEAR reg, lowcore + ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK(\lowcore)),\ + ALT_FACILITY(193) .endm .macro CHECK_STACK savearea @@ -282,7 +283,7 @@ SYM_CODE_START(system_call) xgr %r11,%r11 la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC - MBEAR %r2 + MBEAR %r2,%r0 lgr %r3,%r14 brasl %r14,__do_syscall STACKLEAK_ERASE @@ -424,7 +425,7 @@ SYM_CODE_START(\name) xgr %r10,%r10 xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC - MBEAR %r11 + MBEAR %r11,%r0 stmg %r8,%r9,__PT_PSW(%r11) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,\handler -- cgit From 6908f8f916f24636621a5b4c300bdf9a0155f07e Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:18 +0200 Subject: s390/entry: Add base register to SIEEXIT macro In preparation of having lowcore at different address than zero, add the base register to SIEEXIT. No functional change, because %r0 is passed to the macro. 
Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 0d624045f2a6..7a5e11d9db8b 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -121,11 +121,11 @@ _LPP_OFFSET = __LC_LPP .endm #if IS_ENABLED(CONFIG_KVM) - .macro SIEEXIT sie_control - lg %r9,\sie_control # get control block pointer - ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce - lg %r9,__LC_CURRENT + .macro SIEEXIT sie_control,lowcore + lg %r9,\sie_control # get control block pointer + ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_KERNEL_ASCE(\lowcore) # load primary asce + lg %r9,__LC_CURRENT(\lowcore) mvi __TI_sie(%r9),0 larl %r9,sie_exit # skip forward to sie_exit .endm @@ -349,7 +349,7 @@ SYM_CODE_START(pgm_check_handler) clc __GMAP_ASCE(8,%r12), __PT_CR1(%r11) jne 5f BPENTER __SF_SIE_FLAGS(%r10),_TIF_ISOLATE_BP_GUEST - SIEEXIT __SF_SIE_CONTROL(%r10) + SIEEXIT __SF_SIE_CONTROL(%r10),%r0 #endif 5: stmg %r8,%r9,__PT_PSW(%r11) # clear user controlled registers to prevent speculative use @@ -404,7 +404,7 @@ SYM_CODE_START(\name) tm __TI_sie(%r10),0xff jz 0f BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST - SIEEXIT __SF_SIE_CONTROL(%r15) + SIEEXIT __SF_SIE_CONTROL(%r15),%r0 #endif 0: CHECK_STACK __LC_SAVE_AREA_ASYNC aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) @@ -491,7 +491,7 @@ SYM_CODE_START(mcck_int_handler) lg %r10,__LC_PCPU oi __PCPU_FLAGS+7(%r10), _CIF_MCCK_GUEST 4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST - SIEEXIT __SF_SIE_CONTROL(%r15) + SIEEXIT __SF_SIE_CONTROL(%r15),%r0 #endif .Lmcck_user: lg %r15,__LC_MCCK_STACK -- cgit From 86e08d64eec35cbe6a85798add4bfc1218ca9513 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:19 +0200 Subject: s390/entry: Add base register to CHECK_VMAP_STACK/CHECK_STACK macro In preparation of having lowcore at different address than zero, add the base register to CHECK_VMAP_STACK and CHECK_STACK. No functional change, because %r0 is passed to the macro. 
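For reference, general register 0 used as a base register contributes no value on s390, so passing %r0 preserves the old displacement-only addressing. A rough C rendering of what CHECK_VMAP_STACK tests, written against get_lowcore() from earlier in this series (a sketch only; the header locations and the assumption that the __LC_*_STACK offsets correspond to like-named struct lowcore fields are not taken from the patch):

    #include <linux/types.h>
    #include <asm/lowcore.h>
    #include <asm/thread_info.h>
    #include <asm/stacktrace.h>

    /* Sketch: round the stack pointer to the expected initial offset of its
     * THREAD_SIZE-aligned area, then compare against the per-CPU stacks
     * recorded in the (possibly relocated) lowcore. */
    static bool on_known_stack(unsigned long sp)
    {
            unsigned long base = (sp & ~(THREAD_SIZE - 1UL)) + STACK_INIT_OFFSET;
            struct lowcore *lc = get_lowcore();

            return base == lc->kernel_stack || base == lc->async_stack ||
                   base == lc->mcck_stack || base == lc->nodat_stack ||
                   base == lc->restart_stack;
    }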
Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 7a5e11d9db8b..1d12f3c29a43 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -49,30 +49,30 @@ _LPP_OFFSET = __LC_LPP ALT_FACILITY(193) .endm - .macro CHECK_STACK savearea + .macro CHECK_STACK savearea, lowcore #ifdef CONFIG_CHECK_STACK tml %r15,THREAD_SIZE - CONFIG_STACK_GUARD - lghi %r14,\savearea + la %r14,\savearea(\lowcore) jz stack_overflow #endif .endm - .macro CHECK_VMAP_STACK savearea,oklabel + .macro CHECK_VMAP_STACK savearea, lowcore, oklabel #ifdef CONFIG_VMAP_STACK lgr %r14,%r15 nill %r14,0x10000 - THREAD_SIZE oill %r14,STACK_INIT_OFFSET - clg %r14,__LC_KERNEL_STACK + clg %r14,__LC_KERNEL_STACK(\lowcore) je \oklabel - clg %r14,__LC_ASYNC_STACK + clg %r14,__LC_ASYNC_STACK(\lowcore) je \oklabel - clg %r14,__LC_MCCK_STACK + clg %r14,__LC_MCCK_STACK(\lowcore) je \oklabel - clg %r14,__LC_NODAT_STACK + clg %r14,__LC_NODAT_STACK(\lowcore) je \oklabel - clg %r14,__LC_RESTART_STACK + clg %r14,__LC_RESTART_STACK(\lowcore) je \oklabel - lghi %r14,\savearea + la %r14,\savearea(\lowcore) j stack_overflow #else j \oklabel @@ -331,10 +331,10 @@ SYM_CODE_START(pgm_check_handler) jnz 2f # -> enabled, can't be a double fault tm __LC_PGM_ILC+3,0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc -2: CHECK_STACK __LC_SAVE_AREA_SYNC +2: CHECK_STACK __LC_SAVE_AREA_SYNC,%r0 aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) # CHECK_VMAP_STACK branches to stack_overflow or 4f - CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f + CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,%r0,4f 3: lg %r15,__LC_KERNEL_STACK 4: la %r11,STACK_FRAME_OVERHEAD(%r15) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) @@ -406,7 +406,7 @@ SYM_CODE_START(\name) BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST SIEEXIT __SF_SIE_CONTROL(%r15),%r0 #endif -0: CHECK_STACK __LC_SAVE_AREA_ASYNC +0: CHECK_STACK __LC_SAVE_AREA_ASYNC,%r0 aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) j 2f 1: lctlg %c1,%c1,__LC_KERNEL_ASCE -- cgit From 9e1e275fa28d5896ca7cdf8afa5eb58c0117a303 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:20 +0200 Subject: s390/entry: Make pgm_check_handler() ready for lowcore relocation In preparation of having lowcore at different address than zero, add the base register to all lowcore accesses in pgm_check_handler(). 
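The C side of the kernel goes through the same conversion by calling get_lowcore() instead of relying on the lowcore sitting at address 0; a minimal sketch of that access pattern (assuming the struct lowcore field behind __LC_KERNEL_STACK is named kernel_stack):

    #include <asm/lowcore.h>

    /* Sketch: get_lowcore() is patched via ALT_LOWCORE to return the real
     * base, so field accesses work whether or not the lowcore was moved. */
    static inline unsigned long current_kernel_stack_top(void)
    {
            return get_lowcore()->kernel_stack;
    }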
Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/lowcore.h | 6 ++++++ arch/s390/kernel/entry.S | 47 +++++++++++++++++++++++------------------ 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 52c90b65a2b8..183ac29afaf8 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -244,5 +244,11 @@ static inline void set_prefix(__u32 address) ALT_LOWCORE .endm +.macro STMG_LC start, end, savearea + ALTERNATIVE "stmg \start, \end, \savearea", \ + __stringify(stmg \start, \end, LOWCORE_ALT_ADDRESS + \savearea), \ + ALT_LOWCORE +.endm + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_LOWCORE_H */ diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 1d12f3c29a43..5f63f3fbb34c 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -40,8 +40,11 @@ _LPP_OFFSET = __LC_LPP ALTERNATIVE "nop", ".insn s,0xb2000000,\address", ALT_FACILITY(193) .endm - .macro LPSWEY address,lpswe - ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY(193) + .macro LPSWEY address, lpswe + ALTERNATIVE_2 "b \lpswe;nopr", \ + ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY_EARLY(193), \ + __stringify(.insn siy,0xeb0000000071,LOWCORE_ALT_ADDRESS+\address,0), \ + ALT_LOWCORE .endm .macro MBEAR reg, lowcore @@ -317,39 +320,40 @@ SYM_CODE_END(ret_from_fork) */ SYM_CODE_START(pgm_check_handler) - stpt __LC_SYS_ENTER_TIMER + STMG_LC %r8,%r15,__LC_SAVE_AREA_SYNC + GET_LC %r13 + stpt __LC_SYS_ENTER_TIMER(%r13) BPOFF - stmg %r8,%r15,__LC_SAVE_AREA_SYNC lgr %r10,%r15 - lmg %r8,%r9,__LC_PGM_OLD_PSW + lmg %r8,%r9,__LC_PGM_OLD_PSW(%r13) tmhh %r8,0x0001 # coming from user space? jno .Lpgm_skip_asce - lctlg %c1,%c1,__LC_KERNEL_ASCE + lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) j 3f # -> fault in user space .Lpgm_skip_asce: 1: tmhh %r8,0x4000 # PER bit set in old PSW ? jnz 2f # -> enabled, can't be a double fault - tm __LC_PGM_ILC+3,0x80 # check for per exception + tm __LC_PGM_ILC+3(%r13),0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc -2: CHECK_STACK __LC_SAVE_AREA_SYNC,%r0 +2: CHECK_STACK __LC_SAVE_AREA_SYNC,%r13 aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) # CHECK_VMAP_STACK branches to stack_overflow or 4f - CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,%r0,4f -3: lg %r15,__LC_KERNEL_STACK + CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,%r13,4f +3: lg %r15,__LC_KERNEL_STACK(%r13) 4: la %r11,STACK_FRAME_OVERHEAD(%r15) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,__PT_R0(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC(%r13) + mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK(%r13) stctg %c1,%c1,__PT_CR1(%r11) #if IS_ENABLED(CONFIG_KVM) - ltg %r12,__LC_GMAP + ltg %r12,__LC_GMAP(%r13) jz 5f clc __GMAP_ASCE(8,%r12), __PT_CR1(%r11) jne 5f BPENTER __SF_SIE_FLAGS(%r10),_TIF_ISOLATE_BP_GUEST - SIEEXIT __SF_SIE_CONTROL(%r10),%r0 + SIEEXIT __SF_SIE_CONTROL(%r10),%r13 #endif 5: stmg %r8,%r9,__PT_PSW(%r11) # clear user controlled registers to prevent speculative use @@ -365,11 +369,11 @@ SYM_CODE_START(pgm_check_handler) tmhh %r8,0x0001 # returning to user space? 
jno .Lpgm_exit_kernel STACKLEAK_ERASE - lctlg %c1,%c1,__LC_USER_ASCE + lctlg %c1,%c1,__LC_USER_ASCE(%r13) BPON - stpt __LC_EXIT_TIMER + stpt __LC_EXIT_TIMER(%r13) .Lpgm_exit_kernel: - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE @@ -378,11 +382,11 @@ SYM_CODE_START(pgm_check_handler) # single stepped system call # .Lpgm_svcper: - mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW + mvc __LC_RETURN_PSW(8,%r13),__LC_SVC_NEW_PSW(%r13) larl %r14,.Lsysc_per - stg %r14,__LC_RETURN_PSW+8 + stg %r14,__LC_RETURN_PSW+8(%r13) lghi %r14,1 - LBEAR __LC_PGM_LAST_BREAK + LBEAR __LC_PGM_LAST_BREAK(%r13) LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per SYM_CODE_END(pgm_check_handler) @@ -596,7 +600,8 @@ SYM_CODE_END(restart_int_handler) * Setup a pt_regs so that show_trace can provide a good call trace. */ SYM_CODE_START(stack_overflow) - lg %r15,__LC_NODAT_STACK # change to panic stack + GET_LC %r15 + lg %r15,__LC_NODAT_STACK(%r15) # change to panic stack la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) stmg %r8,%r9,__PT_PSW(%r11) -- cgit From bd2c55b307f77fbf19d76250672266ff06f4a324 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:21 +0200 Subject: s390/entry: Make int handlers ready for lowcore relocation In preparation of having lowcore at different address than zero, add the base register to all lowcore accesses in the ext/io interrupt handlers. Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 5f63f3fbb34c..5c303eff08e0 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -395,26 +395,27 @@ SYM_CODE_END(pgm_check_handler) */ .macro INT_HANDLER name,lc_old_psw,handler SYM_CODE_START(\name) - stckf __LC_INT_CLOCK - stpt __LC_SYS_ENTER_TIMER - STBEAR __LC_LAST_BREAK + STMG_LC %r8,%r15,__LC_SAVE_AREA_ASYNC + GET_LC %r13 + stckf __LC_INT_CLOCK(%r13) + stpt __LC_SYS_ENTER_TIMER(%r13) + STBEAR __LC_LAST_BREAK(%r13) BPOFF - stmg %r8,%r15,__LC_SAVE_AREA_ASYNC - lmg %r8,%r9,\lc_old_psw + lmg %r8,%r9,\lc_old_psw(%r13) tmhh %r8,0x0001 # interrupting from user ? 
jnz 1f #if IS_ENABLED(CONFIG_KVM) - lg %r10,__LC_CURRENT + lg %r10,__LC_CURRENT(%r13) tm __TI_sie(%r10),0xff jz 0f BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST - SIEEXIT __SF_SIE_CONTROL(%r15),%r0 + SIEEXIT __SF_SIE_CONTROL(%r15),%r13 #endif -0: CHECK_STACK __LC_SAVE_AREA_ASYNC,%r0 +0: CHECK_STACK __LC_SAVE_AREA_ASYNC,%r13 aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) j 2f -1: lctlg %c1,%c1,__LC_KERNEL_ASCE - lg %r15,__LC_KERNEL_STACK +1: lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) + lg %r15,__LC_KERNEL_STACK(%r13) 2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) @@ -428,18 +429,18 @@ SYM_CODE_START(\name) xgr %r7,%r7 xgr %r10,%r10 xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC - MBEAR %r11,%r0 + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC(%r13) + MBEAR %r11,%r13 stmg %r8,%r9,__PT_PSW(%r11) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,\handler - mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) + mvc __LC_RETURN_PSW(16,%r13),__PT_PSW(%r11) tmhh %r8,0x0001 # returning to user ? jno 2f STACKLEAK_ERASE - lctlg %c1,%c1,__LC_USER_ASCE + lctlg %c1,%c1,__LC_USER_ASCE(%r13) BPON - stpt __LC_EXIT_TIMER + stpt __LC_EXIT_TIMER(%r13) 2: LBEAR __PT_LAST_BREAK(%r11) lmg %r0,%r15,__PT_R0(%r11) LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE -- cgit From 0001b7bbc53aeb8d31f650701d2a55e498634a2d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:22 +0200 Subject: s390/entry: Make mchk_int_handler() ready for lowcore relocation In preparation of having lowcore at different address than zero, add the base register to all lowcore accesses in mcck_int_handler(). Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 5c303eff08e0..a855f901f6e6 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -455,33 +455,34 @@ INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq */ SYM_CODE_START(mcck_int_handler) BPOFF - lmg %r8,%r9,__LC_MCK_OLD_PSW - TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE + GET_LC %r13 + lmg %r8,%r9,__LC_MCK_OLD_PSW(%r13) + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_SYSTEM_DAMAGE jo .Lmcck_panic # yes -> rest of mcck code invalid - TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_CR_VALID jno .Lmcck_panic # control registers invalid -> panic ptlb - lghi %r14,__LC_CPU_TIMER_SAVE_AREA - mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) - TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID + lay %r14,__LC_CPU_TIMER_SAVE_AREA(%r13) + mvc __LC_MCCK_ENTER_TIMER(8,%r13),0(%r14) + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_CPU_TIMER_VALID jo 3f - la %r14,__LC_SYS_ENTER_TIMER - clc 0(8,%r14),__LC_EXIT_TIMER + la %r14,__LC_SYS_ENTER_TIMER(%r13) + clc 0(8,%r14),__LC_EXIT_TIMER(%r13) jl 1f - la %r14,__LC_EXIT_TIMER -1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER + la %r14,__LC_EXIT_TIMER(%r13) +1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER(%r13) jl 2f - la %r14,__LC_LAST_UPDATE_TIMER + la %r14,__LC_LAST_UPDATE_TIMER(%r13) 2: spt 0(%r14) - mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) -3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID + mvc __LC_MCCK_ENTER_TIMER(8,%r13),0(%r14) +3: TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_PSW_MWP_VALID jno .Lmcck_panic tmhh %r8,0x0001 # interrupting from user ? 
jnz .Lmcck_user - TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_PSW_IA_VALID jno .Lmcck_panic #if IS_ENABLED(CONFIG_KVM) - lg %r10,__LC_CURRENT + lg %r10,__LC_CURRENT(%r13) tm __TI_sie(%r10),0xff jz .Lmcck_user # Need to compare the address instead of __TI_SIE flag. @@ -496,15 +497,15 @@ SYM_CODE_START(mcck_int_handler) lg %r10,__LC_PCPU oi __PCPU_FLAGS+7(%r10), _CIF_MCCK_GUEST 4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST - SIEEXIT __SF_SIE_CONTROL(%r15),%r0 + SIEEXIT __SF_SIE_CONTROL(%r15),%r13 #endif .Lmcck_user: - lg %r15,__LC_MCCK_STACK + lg %r15,__LC_MCCK_STACK(%r13) la %r11,STACK_FRAME_OVERHEAD(%r15) stctg %c1,%c1,__PT_CR1(%r11) - lctlg %c1,%c1,__LC_KERNEL_ASCE + lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - lghi %r14,__LC_GPREGS_SAVE_AREA + lay %r14,__LC_GPREGS_SAVE_AREA(%r13) mvc __PT_R0(128,%r11),0(%r14) # clear user controlled registers to prevent speculative use xgr %r0,%r0 @@ -522,12 +523,13 @@ SYM_CODE_START(mcck_int_handler) brasl %r14,s390_do_machine_check lctlg %c1,%c1,__PT_CR1(%r11) lmg %r0,%r10,__PT_R0(%r11) - mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW - tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ? + mvc __LC_RETURN_MCCK_PSW(16,%r13),__PT_PSW(%r11) # move return PSW + tm __LC_RETURN_MCCK_PSW+1(%r13),0x01 # returning to user ? jno 0f BPON - stpt __LC_EXIT_TIMER -0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA), ALT_FACILITY(193) + stpt __LC_EXIT_TIMER(%r13) +0: ALTERNATIVE "brcl 0,0", __stringify(lay %r12,__LC_LAST_BREAK_SAVE_AREA(%r13)),\ + ALT_FACILITY(193) LBEAR 0(%r12) lmg %r11,%r15,__PT_R11(%r11) LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE -- cgit From 4064b711127e1eb5b5fd42d539fd45e3e33c9b6f Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:23 +0200 Subject: s390/entry: Make restart_int_handler() ready for lowcore relocation In preparation of having lowcore at different address than zero, add the base register to all lowcore accesses in restart_int_handler(). 
Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index a855f901f6e6..ca58b3da3916 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -573,15 +573,17 @@ SYM_CODE_START(restart_int_handler) 0: larl %r15,daton_psw lpswe 0(%r15) # turn dat on, keep irqs off .Ldaton: - lg %r15,__LC_RESTART_STACK + GET_LC %r15 + lg %r15,__LC_RESTART_STACK(%r15) xc STACK_FRAME_OVERHEAD(__PT_SIZE,%r15),STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r14,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - mvc STACK_FRAME_OVERHEAD+__PT_R15(8,%r15),__LC_SAVE_AREA_RESTART - mvc STACK_FRAME_OVERHEAD+__PT_PSW(16,%r15),__LC_RST_OLD_PSW + GET_LC %r13 + mvc STACK_FRAME_OVERHEAD+__PT_R15(8,%r15),__LC_SAVE_AREA_RESTART(%r13) + mvc STACK_FRAME_OVERHEAD+__PT_PSW(16,%r15),__LC_RST_OLD_PSW(%r13) xc 0(STACK_FRAME_OVERHEAD,%r15),0(%r15) - lg %r1,__LC_RESTART_FN # load fn, parm & source cpu - lg %r2,__LC_RESTART_DATA - lgf %r3,__LC_RESTART_SOURCE + lg %r1,__LC_RESTART_FN(%r13) # load fn, parm & source cpu + lg %r2,__LC_RESTART_DATA(%r13) + lgf %r3,__LC_RESTART_SOURCE(%r13) ltgr %r3,%r3 # test source cpu address jm 1f # negative -> skip source stop 0: sigp %r4,%r3,SIGP_SENSE # sigp sense to source cpu -- cgit From 7cc86dee44a47d961fd6195fd91f75ce176b992d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:24 +0200 Subject: s390/entry: Make __switch_to() ready for lowcore relocation In preparation of having lowcore at different address than zero, add the base register to all lowcore accesses in __switch_to(). Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index ca58b3da3916..bbdbe3c3a770 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -169,13 +169,14 @@ SYM_FUNC_START(__switch_to_asm) stg %r15,__THREAD_ksp(%r1,%r2) # store kernel stack of prev lg %r15,0(%r4,%r3) # start of kernel stack of next agr %r15,%r5 # end of kernel stack of next - stg %r3,__LC_CURRENT # store task struct of next - stg %r15,__LC_KERNEL_STACK # store end of kernel stack + GET_LC %r13 + stg %r3,__LC_CURRENT(%r13) # store task struct of next + stg %r15,__LC_KERNEL_STACK(%r13) # store end of kernel stack lg %r15,__THREAD_ksp(%r1,%r3) # load kernel stack of next aghi %r3,__TASK_pid - mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next + mvc __LC_CURRENT_PID(4,%r13),0(%r3) # store pid of next + ALTERNATIVE "nop", "lpp _LPP_OFFSET(%r13)", ALT_FACILITY(40) lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task - ALTERNATIVE "nop", "lpp _LPP_OFFSET", ALT_FACILITY(40) BR_EX %r14 SYM_FUNC_END(__switch_to_asm) -- cgit From 9b3dcae128f8803950d646329f2301cae3fe8f4d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:25 +0200 Subject: s390/entry: Make ret_from_fork() ready for lowcore relocation In preparation of having lowcore at different address than zero, add the base register to all lowcore accesses in ret_from_fork(). 
Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index bbdbe3c3a770..2bd9ef24ace3 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -307,12 +307,13 @@ SYM_CODE_START(ret_from_fork) lgr %r3,%r11 brasl %r14,__ret_from_fork STACKLEAK_ERASE - lctlg %c1,%c1,__LC_USER_ASCE - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + GET_LC %r13 + lctlg %c1,%c1,__LC_USER_ASCE(%r13) + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) BPON LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + stpt __LC_EXIT_TIMER(%r13) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - stpt __LC_EXIT_TIMER LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE SYM_CODE_END(ret_from_fork) -- cgit From 361f6ec2fe203760353c708480099e0325295b21 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:26 +0200 Subject: s390/entry: Make system_call() ready for lowcore relocation In preparation of having lowcore at different address than zero, add the base register to all lowcore accesses in system_call(). Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 2bd9ef24ace3..749410cfdbc0 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -264,14 +264,15 @@ EXPORT_SYMBOL(sie_exit) */ SYM_CODE_START(system_call) - stpt __LC_SYS_ENTER_TIMER - stmg %r8,%r15,__LC_SAVE_AREA_SYNC + STMG_LC %r8,%r15,__LC_SAVE_AREA_SYNC + GET_LC %r13 + stpt __LC_SYS_ENTER_TIMER(%r13) BPOFF lghi %r14,0 .Lsysc_per: - STBEAR __LC_LAST_BREAK - lctlg %c1,%c1,__LC_KERNEL_ASCE - lg %r15,__LC_KERNEL_STACK + STBEAR __LC_LAST_BREAK(%r13) + lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) + lg %r15,__LC_KERNEL_STACK(%r13) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15) # clear user controlled register to prevent speculative use @@ -286,17 +287,17 @@ SYM_CODE_START(system_call) xgr %r10,%r10 xgr %r11,%r11 la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs - mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC - MBEAR %r2,%r0 + mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC(%r13) + MBEAR %r2,%r13 lgr %r3,%r14 brasl %r14,__do_syscall STACKLEAK_ERASE - lctlg %c1,%c1,__LC_USER_ASCE - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lctlg %c1,%c1,__LC_USER_ASCE(%r13) + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) BPON LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + stpt __LC_EXIT_TIMER(%r13) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - stpt __LC_EXIT_TIMER LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE SYM_CODE_END(system_call) -- cgit From 97cee3dd4a07413a4175e247f550a4931d39cee1 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:27 +0200 Subject: s390/kdump: Make kdump ready for lowcore relocation In preparation of having lowcore at different address than zero, add the base register to all lowcore accesses in store_status() and __do_machine_kdump(). 
Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/machine_kexec.c | 2 +- arch/s390/kernel/reipl.S | 26 +++++++++++--------------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index f4cf65da6d49..8f681ccfb83a 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -62,7 +62,7 @@ static void __do_machine_kdump(void *data) * This need to be done *after* s390_reset_system set the * prefix register of this CPU to zero */ - memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA), + memcpy(absolute_pointer(get_lowcore()->floating_pt_save_area), phys_to_virt(prefix + __LC_FPREGS_SAVE_AREA), 512); call_nodat(1, int, purgatory, int, 1); diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S index 88087a32ebc6..69fcaf54d5ca 100644 --- a/arch/s390/kernel/reipl.S +++ b/arch/s390/kernel/reipl.S @@ -9,6 +9,7 @@ #include #include #include +#include GEN_BR_THUNK %r9 @@ -20,20 +21,15 @@ # r3 = Parameter for function # SYM_CODE_START(store_status) - /* Save register one and load save area base */ - stg %r1,__LC_SAVE_AREA_RESTART + STMG_LC %r0,%r15,__LC_GPREGS_SAVE_AREA /* General purpose registers */ - lghi %r1,__LC_GPREGS_SAVE_AREA - stmg %r0,%r15,0(%r1) - mvc 8(8,%r1),__LC_SAVE_AREA_RESTART + GET_LC %r13 /* Control registers */ - lghi %r1,__LC_CREGS_SAVE_AREA - stctg %c0,%c15,0(%r1) + stctg %c0,%c15,__LC_CREGS_SAVE_AREA(%r13) /* Access registers */ - lghi %r1,__LC_AREGS_SAVE_AREA - stam %a0,%a15,0(%r1) + stamy %a0,%a15,__LC_AREGS_SAVE_AREA(%r13) /* Floating point registers */ - lghi %r1,__LC_FPREGS_SAVE_AREA + lay %r1,__LC_FPREGS_SAVE_AREA(%r13) std %f0, 0x00(%r1) std %f1, 0x08(%r1) std %f2, 0x10(%r1) @@ -51,21 +47,21 @@ SYM_CODE_START(store_status) std %f14,0x70(%r1) std %f15,0x78(%r1) /* Floating point control register */ - lghi %r1,__LC_FP_CREG_SAVE_AREA + lay %r1,__LC_FP_CREG_SAVE_AREA(%r13) stfpc 0(%r1) /* CPU timer */ - lghi %r1,__LC_CPU_TIMER_SAVE_AREA + lay %r1,__LC_CPU_TIMER_SAVE_AREA(%r13) stpt 0(%r1) /* Store prefix register */ - lghi %r1,__LC_PREFIX_SAVE_AREA + lay %r1,__LC_PREFIX_SAVE_AREA(%r13) stpx 0(%r1) /* Clock comparator - seven bytes */ - lghi %r1,__LC_CLOCK_COMP_SAVE_AREA larl %r4,clkcmp stckc 0(%r4) + lay %r1,__LC_CLOCK_COMP_SAVE_AREA(%r13) mvc 1(7,%r1),1(%r4) /* Program status word */ - lghi %r1,__LC_PSW_SAVE_AREA + lay %r1,__LC_PSW_SAVE_AREA(%r13) epsw %r4,%r5 st %r4,0(%r1) st %r5,4(%r1) -- cgit From 8f1e70adb1a3ecb982bb6c475209b080bf985074 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 22 Jul 2024 15:41:28 +0200 Subject: s390/boot: Add cmdline option to relocate lowcore Now that everything has been converted, add the option 'relocate_lowcore' to enable relocating the lowcore. 
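A short sketch of how code can tell whether the option took effect, using the helpers added earlier in this series (the message text is illustrative; the real one is the pr_info added to setup_arch() above):

    #include <linux/printk.h>
    #include <asm/abs_lowcore.h>
    #include <asm/lowcore.h>

    /* Sketch: have_relocated_lowcore() reflects the "relocate_lowcore" boot
     * option (honoured only when facility 193 is installed), and
     * get_lowcore() already returns the patched base address at this point. */
    static void report_lowcore_base(void)
    {
            if (have_relocated_lowcore())
                    pr_info("lowcore relocated to %px\n", get_lowcore());
            else
                    pr_info("lowcore at address 0\n");
    }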
Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/boot/ipl_parm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 337c14931ccb..1773b72a6a7b 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -311,5 +311,7 @@ void parse_boot_command_line(void) prot_virt_host = 1; } #endif + if (!strcmp(param, "relocate_lowcore") && test_facility(193)) + relocate_lowcore = 1; } } -- cgit From 6dc2e98d5f1de162d1777aee97e59d75d70d07c5 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Thu, 4 Jul 2024 11:02:46 +0000 Subject: s390: Remove protvirt and kvm config guards for uv code Removing the CONFIG_PROTECTED_VIRTUALIZATION_GUEST ifdefs and config option as well as CONFIG_KVM ifdefs in uv files. Having this configurable has been more of a pain than a help. It's time to remove the ifdefs and the config option. Signed-off-by: Janosch Frank Acked-by: Christian Borntraeger Acked-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 11 ----------- arch/s390/boot/Makefile | 3 +-- arch/s390/boot/uv.c | 8 -------- arch/s390/boot/uv.h | 13 ------------- arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - arch/s390/include/asm/page.h | 2 -- arch/s390/include/asm/uv.h | 32 -------------------------------- arch/s390/kernel/Makefile | 3 +-- arch/s390/kernel/uv.c | 35 +++++++++++++++-------------------- drivers/s390/char/Kconfig | 2 +- 11 files changed, 18 insertions(+), 93 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 59e0d861e26f..a822f952f64a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -799,17 +799,6 @@ config HAVE_PNETID menu "Virtualization" -config PROTECTED_VIRTUALIZATION_GUEST - def_bool n - prompt "Protected virtualization guest support" - help - Select this option, if you want to be able to run this - kernel as a protected virtualization KVM guest. - Protected virtualization capable machines have a mini hypervisor - located at machine level (an ultravisor). With help of the - Ultravisor, KVM will be able to run "protected" VMs, special - VMs whose memory and management data are unavailable to KVM. 
- config PFAULT def_bool y prompt "Pseudo page fault support" diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 5d8cb7e3b096..4f476884d340 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -39,8 +39,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o -obj-y += version.o pgm_check_info.o ctype.o ipl_data.o relocs.o alternative.o -obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o +obj-y += version.o pgm_check_info.o ctype.o ipl_data.o relocs.o alternative.o uv.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index 1e66d2cbb096..318e6ba95bfd 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -8,12 +8,8 @@ #include "uv.h" /* will be used in arch/s390/kernel/uv.c */ -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); -#endif -#if IS_ENABLED(CONFIG_KVM) int __bootdata_preserved(prot_virt_host); -#endif struct uv_info __bootdata_preserved(uv_info); void uv_query_info(void) @@ -53,14 +49,11 @@ void uv_query_info(void) uv_info.max_secrets = uvcb.max_secrets; } -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) && test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list)) prot_virt_guest = 1; -#endif } -#if IS_ENABLED(CONFIG_KVM) unsigned long adjust_to_uv_max(unsigned long limit) { if (is_prot_virt_host() && uv_info.max_sec_stor_addr) @@ -92,4 +85,3 @@ void sanitize_prot_virt_host(void) { prot_virt_host = is_prot_virt_host_capable(); } -#endif diff --git a/arch/s390/boot/uv.h b/arch/s390/boot/uv.h index 0f3070856f8d..da4a4a8d48e0 100644 --- a/arch/s390/boot/uv.h +++ b/arch/s390/boot/uv.h @@ -2,21 +2,8 @@ #ifndef BOOT_UV_H #define BOOT_UV_H -#if IS_ENABLED(CONFIG_KVM) unsigned long adjust_to_uv_max(unsigned long limit); void sanitize_prot_virt_host(void); -#else -static inline unsigned long adjust_to_uv_max(unsigned long limit) -{ - return limit; -} -static inline void sanitize_prot_virt_host(void) {} -#endif - -#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) void uv_query_info(void); -#else -static inline void uv_query_info(void) {} -#endif #endif /* BOOT_UV_H */ diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index f3602414a961..ea63a7342f5f 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -55,7 +55,6 @@ CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m CONFIG_VFIO_AP=m -CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_S390_HYPFS_FS=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index d0d8925fdf09..d8b28ff8ff45 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -53,7 +53,6 @@ CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y CONFIG_VFIO_CCW=m CONFIG_VFIO_AP=m -CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_S390_HYPFS_FS=y diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 5ec41ec3d761..06416b3f94f5 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -174,12 +174,10 @@ static 
inline int devmem_is_allowed(unsigned long pfn) #define HAVE_ARCH_FREE_PAGE #define HAVE_ARCH_ALLOC_PAGE -#if IS_ENABLED(CONFIG_PGSTE) int arch_make_folio_accessible(struct folio *folio); #define HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE int arch_make_page_accessible(struct page *page); #define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE -#endif struct vm_layout { unsigned long kaslr_offset; diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 0679445cac0b..0b5f8f3e84f1 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -414,7 +414,6 @@ static inline bool uv_has_feature(u8 feature_bit) return test_bit_inv(feature_bit, &uv_info.uv_feature_indications); } -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST extern int prot_virt_guest; static inline int is_prot_virt_guest(void) @@ -466,13 +465,6 @@ static inline int uv_remove_shared(unsigned long addr) return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS); } -#else -#define is_prot_virt_guest() 0 -static inline int uv_set_shared(unsigned long addr) { return 0; } -static inline int uv_remove_shared(unsigned long addr) { return 0; } -#endif - -#if IS_ENABLED(CONFIG_KVM) extern int prot_virt_host; static inline int is_prot_virt_host(void) @@ -489,29 +481,5 @@ int uv_convert_from_secure_pte(pte_t pte); int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); void setup_uv(void); -#else -#define is_prot_virt_host() 0 -static inline void setup_uv(void) {} - -static inline int uv_pin_shared(unsigned long paddr) -{ - return 0; -} - -static inline int uv_destroy_folio(struct folio *folio) -{ - return 0; -} - -static inline int uv_destroy_pte(pte_t pte) -{ - return 0; -} - -static inline int uv_convert_from_secure_pte(pte_t pte) -{ - return 0; -} -#endif #endif /* _ASM_S390_UV_H */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 7241fa194709..e47a4be54ff8 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -43,7 +43,7 @@ obj-y += sysinfo.o lgr.o os_info.o ctlreg.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o -obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o +obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o uv.o extra-y += vmlinux.lds @@ -80,7 +80,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o obj-$(CONFIG_TRACEPOINTS) += trace.o -obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o # vdso obj-y += vdso64/ diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index fa62fa0e369f..36db065c7cf7 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -18,11 +18,22 @@ #include #include +#if !IS_ENABLED(CONFIG_KVM) +unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) +{ + return 0; +} + +int gmap_fault(struct gmap *gmap, unsigned long gaddr, + unsigned int fault_flags) +{ + return 0; +} +#endif + /* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */ -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); EXPORT_SYMBOL(prot_virt_guest); -#endif /* * uv_info contains both host and guest information but it's currently only @@ -35,7 +46,6 @@ EXPORT_SYMBOL(prot_virt_guest); struct uv_info __bootdata_preserved(uv_info); EXPORT_SYMBOL(uv_info); -#if IS_ENABLED(CONFIG_KVM) int 
__bootdata_preserved(prot_virt_host); EXPORT_SYMBOL(prot_virt_host); @@ -543,9 +553,6 @@ int arch_make_page_accessible(struct page *page) return arch_make_folio_accessible(page_folio(page)); } EXPORT_SYMBOL_GPL(arch_make_page_accessible); -#endif - -#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) static ssize_t uv_query_facilities(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -721,24 +728,13 @@ static struct attribute_group uv_query_attr_group = { static ssize_t uv_is_prot_virt_guest(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int val = 0; - -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST - val = prot_virt_guest; -#endif - return sysfs_emit(buf, "%d\n", val); + return sysfs_emit(buf, "%d\n", prot_virt_guest); } static ssize_t uv_is_prot_virt_host(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int val = 0; - -#if IS_ENABLED(CONFIG_KVM) - val = prot_virt_host; -#endif - - return sysfs_emit(buf, "%d\n", val); + return sysfs_emit(buf, "%d\n", prot_virt_host); } static struct kobj_attribute uv_prot_virt_guest = @@ -790,4 +786,3 @@ out_kobj: return rc; } device_initcall(uv_info_init); -#endif diff --git a/drivers/s390/char/Kconfig b/drivers/s390/char/Kconfig index 8a03af5ee5b3..80c4e5101c97 100644 --- a/drivers/s390/char/Kconfig +++ b/drivers/s390/char/Kconfig @@ -96,7 +96,7 @@ config SCLP_OFB config S390_UV_UAPI def_tristate m prompt "Ultravisor userspace API" - depends on S390 && (KVM || PROTECTED_VIRTUALIZATION_GUEST) + depends on S390 help Selecting exposes parts of the UV interface to userspace by providing a misc character device at /dev/uv. -- cgit From 62e2397c2203d41c8ebe2cca0c728293ff2af313 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 16 Jul 2024 04:28:17 +0900 Subject: arm64: remove redundant 'if HAVE_ARCH_KASAN' in Kconfig Since commit 0383808e4d99 ("arm64: kasan: Reduce minimum shadow alignment and enable 5 level paging"), HAVE_ARCH_KASAN is always 'y'. The condition 'if HAVE_ARCH_KASAN' is always met. Signed-off-by: Masahiro Yamada Reviewed-by: Randy Dunlap Acked-by: Mark Rutland Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240715192843.2201439-1-masahiroy@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 167e51067508..5776bf3acd69 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -168,9 +168,9 @@ config ARM64 select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN - select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN - select HAVE_ARCH_KASAN_SW_TAGS if HAVE_ARCH_KASAN - select HAVE_ARCH_KASAN_HW_TAGS if (HAVE_ARCH_KASAN && ARM64_MTE) + select HAVE_ARCH_KASAN_VMALLOC + select HAVE_ARCH_KASAN_SW_TAGS + select HAVE_ARCH_KASAN_HW_TAGS if ARM64_MTE # Some instrumentation may be unsound, hence EXPERT select HAVE_ARCH_KCSAN if EXPERT select HAVE_ARCH_KFENCE -- cgit From add6128fc7f0d24199e977d13ca724f011c03fc2 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 16 Jul 2024 10:39:15 +0530 Subject: arm64/Kconfig: Remove redundant 'if HAVE_FUNCTION_GRAPH_TRACER' Since the commit 819e50e25d0c ("arm64: Add ftrace support"), HAVE_FUNCTION_GRAPH_TRACER has always been enabled. Although a subsequent commit 364697032246 ("arm64: ftrace: Enable HAVE_FUNCTION_GRAPH_RETVAL") redundantly added check on HAVE_FUNCTION_GRAPH_TRACER, while enabling the config HAVE_FUNCTION_GRAPH_RETVAL. 
Let's just drop this redundant check. Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org CC: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240716050915.2657694-1-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5776bf3acd69..9823bf1ac018 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -211,8 +211,8 @@ config ARM64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_ERROR_INJECTION - select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_GRAPH_RETVAL select HAVE_GCC_PLUGINS select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && \ HW_PERF_EVENTS && HAVE_PERF_EVENTS_NMI -- cgit From 0c35e3bd412a6a2676adc35df950cfbb9f464b0c Mon Sep 17 00:00:00 2001 From: Remington Brasga Date: Fri, 12 Jul 2024 23:17:30 +0000 Subject: kselftest: missing arg in ptrace.c The string passed to ksft_test_result_skip is missing the `type_name` Signed-off-by: Remington Brasga Reviewed-by: Dev Jain Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240712231730.2794-1-rbrasga@uci.edu Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/abi/ptrace.c b/tools/testing/selftests/arm64/abi/ptrace.c index 4c941270d8de..e4fa507cbdd0 100644 --- a/tools/testing/selftests/arm64/abi/ptrace.c +++ b/tools/testing/selftests/arm64/abi/ptrace.c @@ -156,7 +156,7 @@ static void test_hw_debug(pid_t child, int type, const char *type_name) /* Zero is not currently architecturally valid */ ksft_test_result(arch, "%s_arch_set\n", type_name); } else { - ksft_test_result_skip("%s_arch_set\n"); + ksft_test_result_skip("%s_arch_set\n", type_name); } } -- cgit From 48f6430505c0b0498ee9020ce3cf9558b1caaaeb Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 18 Jul 2024 10:34:23 -0700 Subject: arm64/vdso: Remove --hash-style=sysv glibc added support for .gnu.hash in 2006 and .hash has been obsoleted for more than one decade in many Linux distributions. Using --hash-style=sysv might imply unaddressed issues and confuse readers. Just drop the option and rely on the linker default, which is likely "both", or "gnu" when the distribution really wants to eliminate sysv hash overhead. Similar to commit 6b7e26547fad ("x86/vdso: Emit a GNU hash"). Signed-off-by: Fangrui Song Link: https://lore.kernel.org/r/20240718173423.1574395-1-maskray@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/vdso/Makefile | 2 +- arch/arm64/kernel/vdso32/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index d63930c82839..d11da6461278 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -21,7 +21,7 @@ btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti # potential future proofing if we end up with internal calls to the exported # routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so # preparation in build-time C")). 
-ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \ +ldflags-y := -shared -soname=linux-vdso.so.1 \ -Bsymbolic --build-id=sha1 -n $(btildflags-y) ifdef CONFIG_LD_ORPHAN_WARN diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index cc4508c604b2..25a2cb6317f3 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -98,7 +98,7 @@ VDSO_AFLAGS += -D__ASSEMBLY__ # From arm vDSO Makefile VDSO_LDFLAGS += -Bsymbolic --no-undefined -soname=linux-vdso.so.1 VDSO_LDFLAGS += -z max-page-size=4096 -z common-page-size=4096 -VDSO_LDFLAGS += -shared --hash-style=sysv --build-id=sha1 +VDSO_LDFLAGS += -shared --build-id=sha1 VDSO_LDFLAGS += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) -- cgit From f3dfcd25455b1cbb3c7e2d19b0a06acc6c7472a5 Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Thu, 18 Jul 2024 21:55:32 +0000 Subject: arm64/sysreg: Correct the values for GICv4.1 Currently, sysreg has the value 0b0010 for the presence of GICv4.1 in ID_PFR1_EL1 and ID_AA64PFR0_EL1, instead of 0b0011 as per the ARM ARM. Hence, correct them to reflect the ARM ARM. Signed-off-by: Raghavendra Rao Ananta Reviewed-by: Zenghui Yu Reviewed-by: Anshuman Khandual Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240718215532.616447-1-rananta@google.com Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index a4c1dd4741a4..7ceaa1e0b4bc 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -149,7 +149,7 @@ Res0 63:32 UnsignedEnum 31:28 GIC 0b0000 NI 0b0001 GICv3 - 0b0010 GICv4p1 + 0b0011 GICv4p1 EndEnum UnsignedEnum 27:24 Virt_frac 0b0000 NI @@ -903,7 +903,7 @@ EndEnum UnsignedEnum 27:24 GIC 0b0000 NI 0b0001 IMP - 0b0010 V4P1 + 0b0011 V4P1 EndEnum SignedEnum 23:20 AdvSIMD 0b0000 IMP -- cgit From 193cc89ea0ca1da311877d2b4bb5e9f03bcc82a2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 21 Jul 2024 15:45:56 -0500 Subject: cifs: fix potential null pointer use in destroy_workqueue in init_cifs error path Dan Carpenter reported a Smatch static checker warning: fs/smb/client/cifsfs.c:1981 init_cifs() error: we previously assumed 'serverclose_wq' could be null (see line 1895) The patch which introduced the serverclose workqueue used the wrong ordering in error paths in init_cifs() for freeing it on errors.
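For illustration only, a minimal userspace sketch of the pattern at stake (generic helper names, not the cifs ones): cleanup labels in an init function must unwind in the reverse order of allocation, and an error jump taken before a resource exists must land below that resource's cleanup label.

  #include <stdio.h>

  /* three resources acquired in order; error paths release only what
   * already exists, in reverse order of acquisition */
  static int demo_init(void)
  {
          FILE *log, *data, *state;

          log = tmpfile();
          if (!log)
                  goto out;
          data = tmpfile();
          if (!data)
                  goto out_close_log;
          state = tmpfile();
          if (!state)
                  goto out_close_data;    /* not a label that closes state: it was never created */

          fclose(state);
          fclose(data);
          fclose(log);
          return 0;

  out_close_data:
          fclose(data);
  out_close_log:
          fclose(log);
  out:
          return -1;
  }

  int main(void)
  {
          return demo_init();
  }

Ordering the labels wrongly, as in the original init_cifs() unwind shown below, ends up releasing an object that was never created, which is exactly the potential null pointer use flagged above.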
Fixes: 173217bd7336 ("smb3: retrying on failed server close") Cc: stable@vger.kernel.org Cc: Ritvik Budhiraja Reported-by: Dan Carpenter Reviewed-by: Dan Carpenter Reviewed-by: David Howells Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index c92937bed133..2c4b357d85e2 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1894,12 +1894,12 @@ init_cifs(void) WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); if (!serverclose_wq) { rc = -ENOMEM; - goto out_destroy_serverclose_wq; + goto out_destroy_deferredclose_wq; } rc = cifs_init_inodecache(); if (rc) - goto out_destroy_deferredclose_wq; + goto out_destroy_serverclose_wq; rc = cifs_init_netfs(); if (rc) @@ -1967,6 +1967,8 @@ out_destroy_netfs: cifs_destroy_netfs(); out_destroy_inodecache: cifs_destroy_inodecache(); +out_destroy_serverclose_wq: + destroy_workqueue(serverclose_wq); out_destroy_deferredclose_wq: destroy_workqueue(deferredclose_wq); out_destroy_cifsoplockd_wq: @@ -1977,8 +1979,6 @@ out_destroy_decrypt_wq: destroy_workqueue(decrypt_wq); out_destroy_cifsiod_wq: destroy_workqueue(cifsiod_wq); -out_destroy_serverclose_wq: - destroy_workqueue(serverclose_wq); out_clean_proc: cifs_proc_clean(); return rc; -- cgit From 630482ee0653decf9e2482ac6181897eb6cde5b8 Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Tue, 16 Jul 2024 15:55:14 +0300 Subject: iommu: sprd: Avoid NULL deref in sprd_iommu_hw_en In sprd_iommu_cleanup() before calling function sprd_iommu_hw_en() dom->sdev is equal to NULL, which leads to null dereference. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 9afea57384d4 ("iommu/sprd: Release dma buffer to avoid memory leak") Signed-off-by: Artem Chernyshev Reviewed-by: Chunyan Zhang Link: https://lore.kernel.org/r/20240716125522.3690358-1-artem.chernyshev@red-soft.ru Signed-off-by: Will Deacon --- drivers/iommu/sprd-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index ba53571a8239..a2f4ffe6d949 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -232,8 +232,8 @@ static void sprd_iommu_cleanup(struct sprd_iommu_domain *dom) pgt_size = sprd_iommu_pgt_size(&dom->domain); dma_free_coherent(dom->sdev->dev, pgt_size, dom->pgt_va, dom->pgt_pa); - dom->sdev = NULL; sprd_iommu_hw_en(dom->sdev, false); + dom->sdev = NULL; } static void sprd_iommu_domain_free(struct iommu_domain *domain) -- cgit From 86c5eac3c4c4a2ee124d202af9a141bd0457ee68 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 16 Jul 2024 15:25:45 +0800 Subject: iommu/amd: Convert comma to semicolon Replace a comma between expression statements by a semicolon. 
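As a side note, a small standalone illustration (not from the patch) of why the commas were harmless: the comma operator evaluates its operands left to right, so a chain of assignments separated by commas is still one valid expression statement that performs every assignment; the change is purely about readability.

  #include <assert.h>

  struct cfg {
          int ias;
          int oas;
  };

  int main(void)
  {
          struct cfg c;

          /* one expression statement built with the comma operator:
           * both assignments are carried out */
          c.ias = 48,
          c.oas = 52;

          assert(c.ias == 48 && c.oas == 52);
          return 0;
  }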
Fixes: c9b258c6be09 ("iommu/amd: Prepare for generic IO page table framework") Signed-off-by: Chen Ni Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20240716072545.968690-1-nichen@iscas.ac.cn Signed-off-by: Will Deacon --- drivers/iommu/amd/io_pgtable.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 9d9a7fde59e7..1074ee25064d 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -588,9 +588,9 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo { struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES, - cfg->ias = IOMMU_IN_ADDR_BIT_SIZE, - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE, + cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES; + cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; + cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; cfg->tlb = &v1_flush_ops; pgtable->iop.ops.map_pages = iommu_v1_map_pages; -- cgit From 8031b001da700474c11d28629581480b12a0d8d4 Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Thu, 18 Jul 2024 16:46:16 +0530 Subject: HID: amd_sfh: Move sensor discovery before HID device initialization Sensors discovery is independent of HID device initialization. If sensor discovery fails after HID initialization, then the HID device needs to be deinitialized. Therefore, sensors discovery should be moved before HID device initialization. Fixes: 7bcfdab3f0c6 ("HID: amd_sfh: if no sensors are enabled, clean up") Tested-by: Aurinko Signed-off-by: Basavaraj Natikar Link: https://patch.msgid.link/20240718111616.3012155-1-Basavaraj.Natikar@amd.com Signed-off-by: Benjamin Tissoires --- drivers/hid/amd-sfh-hid/amd_sfh_client.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_client.c b/drivers/hid/amd-sfh-hid/amd_sfh_client.c index bdb578e0899f..4b59687ff5d8 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_client.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_client.c @@ -288,12 +288,22 @@ int amd_sfh_hid_client_init(struct amd_mp2_dev *privdata) mp2_ops->start(privdata, info); cl_data->sensor_sts[i] = amd_sfh_wait_for_response (privdata, cl_data->sensor_idx[i], SENSOR_ENABLED); + + if (cl_data->sensor_sts[i] == SENSOR_ENABLED) + cl_data->is_any_sensor_enabled = true; + } + + if (!cl_data->is_any_sensor_enabled || + (mp2_ops->discovery_status && mp2_ops->discovery_status(privdata) == 0)) { + dev_warn(dev, "Failed to discover, sensors not enabled is %d\n", + cl_data->is_any_sensor_enabled); + rc = -EOPNOTSUPP; + goto cleanup; } for (i = 0; i < cl_data->num_hid_devices; i++) { cl_data->cur_hid_dev = i; if (cl_data->sensor_sts[i] == SENSOR_ENABLED) { - cl_data->is_any_sensor_enabled = true; rc = amdtp_hid_probe(i, cl_data); if (rc) goto cleanup; @@ -305,12 +315,6 @@ int amd_sfh_hid_client_init(struct amd_mp2_dev *privdata) cl_data->sensor_sts[i]); } - if (!cl_data->is_any_sensor_enabled || - (mp2_ops->discovery_status && mp2_ops->discovery_status(privdata) == 0)) { - dev_warn(dev, "Failed to discover, sensors not enabled is %d\n", cl_data->is_any_sensor_enabled); - rc = -EOPNOTSUPP; - goto cleanup; - } schedule_delayed_work(&cl_data->work_buffer, msecs_to_jiffies(AMD_SFH_IDLE_LOOP)); return 0; -- cgit From f99b3feb3b0e9fca2257c90fc8317be8ee44c19a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 22 Jul 2024 08:33:09 +0200 Subject: clk: samsung: fix getting Exynos4 fin_pll rate from external clocks Commit 0dc83ad8bfc9 ("clk: samsung: 
Don't register clkdev lookup for the fixed rate clocks") claimed registering clkdev lookup is not necessary anymore, but that was not entirely true: Exynos4210/4212/4412 clock code still relied on it to get the clock rate of xxti or xusbxti external clocks. Drop that requirement by accessing already registered clk_hw when looking up the xxti/xusbxti rate. Reported-by: Artur Weber Closes: https://lore.kernel.org/all/6227c1fb-d769-462a-b79b-abcc15d3db8e@gmail.com/ Fixes: 0dc83ad8bfc9 ("clk: samsung: Don't register clkdev lookup for the fixed rate clocks") Cc: Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240722063309.60054-1-krzysztof.kozlowski@linaro.org Tested-by: Artur Weber # Exynos4212 Signed-off-by: Stephen Boyd --- drivers/clk/samsung/clk-exynos4.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/clk/samsung/clk-exynos4.c b/drivers/clk/samsung/clk-exynos4.c index a026ccca7315..28945b6b0ee1 100644 --- a/drivers/clk/samsung/clk-exynos4.c +++ b/drivers/clk/samsung/clk-exynos4.c @@ -1040,19 +1040,20 @@ static unsigned long __init exynos4_get_xom(void) static void __init exynos4_clk_register_finpll(struct samsung_clk_provider *ctx) { struct samsung_fixed_rate_clock fclk; - struct clk *clk; - unsigned long finpll_f = 24000000; + unsigned long finpll_f; + unsigned int parent; char *parent_name; unsigned int xom = exynos4_get_xom(); parent_name = xom & 1 ? "xusbxti" : "xxti"; - clk = clk_get(NULL, parent_name); - if (IS_ERR(clk)) { + parent = xom & 1 ? CLK_XUSBXTI : CLK_XXTI; + + finpll_f = clk_hw_get_rate(ctx->clk_data.hws[parent]); + if (!finpll_f) { pr_err("%s: failed to lookup parent clock %s, assuming " "fin_pll clock frequency is 24MHz\n", __func__, parent_name); - } else { - finpll_f = clk_get_rate(clk); + finpll_f = 24000000; } fclk.id = CLK_FIN_PLL; -- cgit From a214384ce26b6111ea8c8d58fa82a1ca63996c38 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 22 Jul 2024 23:40:08 -0500 Subject: cifs: fix reconnect with SMB1 UNIX Extensions When mounting with the SMB1 Unix Extensions (e.g. mounts to Samba with vers=1.0), reconnects no longer reset the Unix Extensions (SetFSInfo SET_FILE_UNIX_BASIC) after tcon so most operations (e.g. stat, ls, open, statfs) will fail continuously with: "Operation not supported" if the connection ever resets (e.g. due to brief network disconnect) Cc: stable@vger.kernel.org Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 7a16e12f5da8..89d9f86cc29a 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3686,6 +3686,7 @@ error: } #endif +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* * Issue a TREE_CONNECT request. */ @@ -3807,11 +3808,25 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, else tcon->Flags = 0; cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags); - } + /* + * reset_cifs_unix_caps calls QFSInfo which requires + * need_reconnect to be false, but we would not need to call + * reset_caps if this were not a reconnect case so must check + * need_reconnect flag here. 
The caller will also clear + * need_reconnect when tcon was successful but needed to be + * cleared earlier in the case of unix extensions reconnect + */ + if (tcon->need_reconnect && tcon->unix_ext) { + cifs_dbg(FYI, "resetting caps for %s\n", tcon->tree_name); + tcon->need_reconnect = false; + reset_cifs_unix_caps(xid, tcon, NULL, NULL); + } + } cifs_buf_release(smb_buffer); return rc; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ static void delayed_free(struct rcu_head *p) { -- cgit From 0e314e452687ce0ec5874e42cdb993a34325d3d2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 23 Jul 2024 00:44:48 -0500 Subject: cifs: mount with "unix" mount option for SMB1 incorrectly handled Although by default we negotiate CIFS Unix Extensions for SMB1 mounts to Samba (and they work if the user does not specify "unix" or "posix" or "linux" on mount), and we do properly handle when a user turns them off with "nounix" mount parm. But with the changes to the mount API we broke cases where the user explicitly specifies the "unix" option (or equivalently "linux" or "posix") on mount with vers=1.0 to Samba or other servers which support the CIFS Unix Extensions. "mount error(95): Operation not supported" and logged: "CIFS: VFS: Check vers= mount option. SMB3.11 disabled but required for POSIX extensions" even though CIFS Unix Extensions are supported for vers=1.0 This patch fixes the case where the user specifies both "unix" (or equivalently "posix" or "linux") and "vers=1.0" on mount to a server which supports the CIFS Unix Extensions. Cc: stable@vger.kernel.org Reviewed-by: David Howells Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 89d9f86cc29a..d2307162a2de 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2614,6 +2614,13 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) cifs_dbg(VFS, "Server does not support mounting with posix SMB3.11 extensions\n"); rc = -EOPNOTSUPP; goto out_fail; + } else if (ses->server->vals->protocol_id == SMB10_PROT_ID) + if (cap_unix(ses)) + cifs_dbg(FYI, "Unix Extensions requested on SMB1 mount\n"); + else { + cifs_dbg(VFS, "SMB1 Unix Extensions not supported by server\n"); + rc = -EOPNOTSUPP; + goto out_fail; } else { cifs_dbg(VFS, "Check vers= mount option. SMB3.11 disabled but required for POSIX extensions\n"); -- cgit From ba6c664081afd18da86ac49cb22ceb266f89a561 Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Wed, 24 Jul 2024 10:46:55 +0200 Subject: kbuild: rpm-pkg: Fix C locale setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit semicolon separation in LC_ALL is wrong. Either variable needs to be exported before as a separate commit or set as part of the commit in the beginning. Used second variant. This fixes broken build on user's locale setup which makes 'date' binary to produce invalid characters in rpm changelog (e.g. 
cs_CZ.UTF-8 'čec'): $ make binrpm-pkg GEN rpmbuild/SPECS/kernel.spec rpmbuild -bb rpmbuild/SPECS/kernel.spec --define='_topdirlinux/rpmbuild' \ --target x86_64-linux --build-in-place --noprep --define='_smp_mflags \ %{nil}' $(rpm -q rpm >/dev/null 2>&1 || echo --nodeps) Building target platforms: x86_64-linux Building for target x86_64-linux error: bad date in %changelog: St čec 24 2024 user make[2]: *** [scripts/Makefile.package:71: binrpm-pkg] Error 1 make[1]: *** [linux/Makefile:1546: binrpm-pkg] Error 2 make: *** [Makefile:224: __sub-make] Error 2 Fixes: 301c10908e42 ("kbuild: rpm-pkg: introduce a simple changelog section for kernel.spec") Signed-off-by: Petr Vorel Reviewed-by: Miguel Ojeda Signed-off-by: Masahiro Yamada --- scripts/package/mkspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index ead54d67a024..4dc1466dfc81 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -50,6 +50,6 @@ fi cat << EOF %changelog -* $(LC_ALL=C; date +'%a %b %d %Y') ${name} <${email}> +* $(LC_ALL=C date +'%a %b %d %Y') ${name} <${email}> - Custom built Linux kernel. EOF -- cgit From ae67ed9010a7b52933ad1038d13df8a3aae34b83 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 24 Jul 2024 11:19:31 +0300 Subject: ASoC: SOF: ipc4-topology: Only handle dai_config with HW_PARAMS for ChainDMA The DMA Link ID is only valid in snd_sof_dai_config_data when the dai_config is called with HW_PARAMS. The commit that this patch fixes is actually moved a code section without changing it, the same bug exists in the original code, needing different patch to kernel prior to 6.9 kernels. Cc: stable@vger.kernel.org Fixes: 3858464de57b ("ASoC: SOF: ipc4-topology: change chain_dma handling in dai_config") Link: https://github.com/thesofproject/linux/issues/5116 Signed-off-by: Peter Ujfalusi Reviewed-by: Bard Liao Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20240724081932.24542-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-topology.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index 90f6856ee80c..4a4234d5c941 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -3095,8 +3095,14 @@ static int sof_ipc4_dai_config(struct snd_sof_dev *sdev, struct snd_sof_widget * return 0; if (pipeline->use_chain_dma) { - pipeline->msg.primary &= ~SOF_IPC4_GLB_CHAIN_DMA_LINK_ID_MASK; - pipeline->msg.primary |= SOF_IPC4_GLB_CHAIN_DMA_LINK_ID(data->dai_data); + /* + * Only configure the DMA Link ID for ChainDMA when this op is + * invoked with SOF_DAI_CONFIG_FLAGS_HW_PARAMS + */ + if (flags & SOF_DAI_CONFIG_FLAGS_HW_PARAMS) { + pipeline->msg.primary &= ~SOF_IPC4_GLB_CHAIN_DMA_LINK_ID_MASK; + pipeline->msg.primary |= SOF_IPC4_GLB_CHAIN_DMA_LINK_ID(data->dai_data); + } return 0; } -- cgit From e6fc5fcaeffa04a3fa1db8dfccdfd4b6001c0446 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 24 Jul 2024 11:19:32 +0300 Subject: ASoC: SOF: ipc4-topology: Preserve the DMA Link ID for ChainDMA on unprepare The DMA Link ID is set to the IPC message's primary during dai_config, which is only during hw_params. During xrun handling the hw_params is not called and the DMA Link ID information will be lost. 
All other fields in the message are expected to be 0 for re-configuration; only the DMA Link ID needs to be preserved, and in the case of a repeated dai_config it is correctly updated (masked and then set). Cc: stable@vger.kernel.org Fixes: ca5ce0caa67f ("ASoC: SOF: ipc4/intel: Add support for chained DMA") Link: https://github.com/thesofproject/linux/issues/5116 Signed-off-by: Peter Ujfalusi Reviewed-by: Bard Liao Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20240724081932.24542-3-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-topology.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index 4a4234d5c941..87be7f16e8c2 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -1358,7 +1358,13 @@ static void sof_ipc4_unprepare_copier_module(struct snd_sof_widget *swidget) ipc4_copier = dai->private; if (pipeline->use_chain_dma) { - pipeline->msg.primary = 0; + /* + * Preserve the DMA Link ID and clear other bits since + * the DMA Link ID is only configured once during + * dai_config, other fields are expected to be 0 for + * re-configuration + */ + pipeline->msg.primary &= SOF_IPC4_GLB_CHAIN_DMA_LINK_ID_MASK; pipeline->msg.extension = 0; } -- cgit From f7c1b0e4ae47e67c6f9af84568a5f4a80638ccd8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Jul 2024 21:01:14 +0200 Subject: thermal: core: Back off when polling thermal zones on errors Commit a8a261774466 ("thermal: core: Call monitor_thermal_zone() if zone temperature is invalid") introduced a polling mechanism by which the thermal core attempts to get a valid temperature value for thermal zones where the .get_temp() callback returns errors to start with (for example, due to initialization ordering woes). However, this polling is carried out periodically ad infinitum and every iteration of it causes a message to be printed to the kernel log, which means a lot of log noise on systems where there are thermal zones that never get ready for some reason. It is also not really useful to continuously poll thermal zones that never respond. To address this, modify the thermal core to increase the delay between consecutive thermal zone temperature checks after every check that fails until it reaches a certain maximum value. At that point, the thermal zone in question will be disabled, but user space will be able to reenable it if it believes that the failure is transient. Also change the code to print messages regarding failed temperature checks to the kernel log only twice, once when the thermal zone's .get_temp() callback returns an error for the first time and once when disabling the given thermal zone. In addition, a dev_crit() message will be printed at that point if the given thermal zone contains a critical trip point to notify the system operator about the situation. Fixes: a8a261774466 ("thermal: core: Call monitor_thermal_zone() if zone temperature is invalid") Link: https://lore.kernel.org/linux-acpi/CAGnHSE=RyPK++UG0-wAtVKgeJxe0uzFYgLxm+RUOKKoQquW=Ow@mail.gmail.com/ Reported-by: Tom Yan Signed-off-by: Rafael J.
Wysocki Link: https://patch.msgid.link/2962033.e9J7NaK4W3@rjwysocki.net --- drivers/thermal/thermal_core.c | 58 +++++++++++++++++++++++++++++++++++++++--- drivers/thermal/thermal_core.h | 10 +++++--- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 48bf267d090d..95c399f94744 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -288,6 +288,28 @@ static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz, return 0; } +static void thermal_zone_broken_disable(struct thermal_zone_device *tz) +{ + struct thermal_trip_desc *td; + + dev_err(&tz->device, "Unable to get temperature, disabling!\n"); + /* + * This function only runs for enabled thermal zones, so no need to + * check for the current mode. + */ + __thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED); + thermal_notify_tz_disable(tz); + + for_each_trip_desc(tz, td) { + if (td->trip.type == THERMAL_TRIP_CRITICAL && + td->trip.temperature > THERMAL_TEMP_INVALID) { + dev_crit(&tz->device, + "Disabled thermal zone with critical trip point\n"); + return; + } + } +} + /* * Zone update section: main control loop applied to each zone while monitoring * in polling mode. The monitoring is done using a workqueue. @@ -308,6 +330,34 @@ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz, cancel_delayed_work(&tz->poll_queue); } +static void thermal_zone_recheck(struct thermal_zone_device *tz, int error) +{ + if (error == -EAGAIN) { + thermal_zone_device_set_polling(tz, THERMAL_RECHECK_DELAY); + return; + } + + /* + * Print the message once to reduce log noise. It will be followed by + * another one if the temperature cannot be determined after multiple + * attempts. + */ + if (tz->recheck_delay_jiffies == THERMAL_RECHECK_DELAY) + dev_info(&tz->device, "Temperature check failed (%d)\n", error); + + thermal_zone_device_set_polling(tz, tz->recheck_delay_jiffies); + + tz->recheck_delay_jiffies += max(tz->recheck_delay_jiffies >> 1, 1ULL); + if (tz->recheck_delay_jiffies > THERMAL_MAX_RECHECK_DELAY) { + thermal_zone_broken_disable(tz); + /* + * Restore the original recheck delay value to allow the thermal + * zone to try to recover when it is reenabled by user space. 
+ */ + tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; + } +} + static void monitor_thermal_zone(struct thermal_zone_device *tz) { if (tz->mode != THERMAL_DEVICE_ENABLED) @@ -507,10 +557,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz, ret = __thermal_zone_get_temp(tz, &temp); if (ret) { - if (ret != -EAGAIN) - dev_info(&tz->device, "Temperature check failed (%d)\n", ret); - - thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS)); + thermal_zone_recheck(tz, ret); return; } else if (temp <= THERMAL_TEMP_INVALID) { /* @@ -522,6 +569,8 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz, goto monitor; } + tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; + tz->last_temperature = tz->temperature; tz->temperature = temp; @@ -1462,6 +1511,7 @@ thermal_zone_device_register_with_trips(const char *type, thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay); thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay); + tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; /* sys I/F */ /* Add nodes that are always present via .groups */ diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index ba8e6fc807ca..4cf2b7230d04 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -67,6 +67,8 @@ struct thermal_governor { * @polling_delay_jiffies: number of jiffies to wait between polls when * checking whether trip points have been crossed (0 for * interrupt driven systems) + * @recheck_delay_jiffies: delay after a failed attempt to determine the zone + * temperature before trying again * @temperature: current temperature. This is only for core code, * drivers should use thermal_zone_get_temp() to get the * current temperature @@ -108,6 +110,7 @@ struct thermal_zone_device { int num_trips; unsigned long passive_delay_jiffies; unsigned long polling_delay_jiffies; + unsigned long recheck_delay_jiffies; int temperature; int last_temperature; int emul_temperature; @@ -137,10 +140,11 @@ struct thermal_zone_device { #define THERMAL_TEMP_INIT INT_MIN /* - * Default delay after a failing thermal zone temperature check before - * attempting to check it again. + * Default and maximum delay after a failed thermal zone temperature check + * before attempting to check it again (in jiffies). */ -#define THERMAL_RECHECK_DELAY_MS 250 +#define THERMAL_RECHECK_DELAY msecs_to_jiffies(250) +#define THERMAL_MAX_RECHECK_DELAY (120 * HZ) /* Default Thermal Governor */ #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE) -- cgit From 3308172276db5d070975d9a321283fcd423ce465 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Thu, 13 Jun 2024 19:13:47 +0800 Subject: trace: riscv: Remove deprecated kprobe on ftrace support Since commit 7caa9765465f60 ("ftrace: riscv: move from REGS to ARGS"), kprobe on ftrace is not supported by riscv, because riscv's support for FTRACE_WITH_REGS has been replaced with support for FTRACE_WITH_ARGS, and KPROBES_ON_FTRACE will be supplanted by FPROBES. So remove the deprecated kprobe on ftrace support, which is misunderstood. 
Signed-off-by: Jinjie Ruan Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20240613111347.1745379-1-ruanjinjie@huawei.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 - arch/riscv/kernel/probes/Makefile | 1 - arch/riscv/kernel/probes/ftrace.c | 65 --------------------------------------- 3 files changed, 67 deletions(-) delete mode 100644 arch/riscv/kernel/probes/ftrace.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 45eafb3306e9..34ae39ecb898 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -154,7 +154,6 @@ config RISCV select HAVE_KERNEL_UNCOMPRESSED if !XIP_KERNEL && !EFI_ZBOOT select HAVE_KERNEL_ZSTD if !XIP_KERNEL && !EFI_ZBOOT select HAVE_KPROBES if !XIP_KERNEL - select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL select HAVE_KRETPROBES if !XIP_KERNEL # https://github.com/ClangBuiltLinux/linux/issues/1881 select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if !LD_IS_LLD diff --git a/arch/riscv/kernel/probes/Makefile b/arch/riscv/kernel/probes/Makefile index 8265ff497977..d2129f2c61b8 100644 --- a/arch/riscv/kernel/probes/Makefile +++ b/arch/riscv/kernel/probes/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o simulate-insn.o obj-$(CONFIG_RETHOOK) += rethook.o rethook_trampoline.o -obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o obj-$(CONFIG_UPROBES) += uprobes.o decode-insn.o simulate-insn.o CFLAGS_REMOVE_simulate-insn.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE) diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c deleted file mode 100644 index a69dfa610aa8..000000000000 --- a/arch/riscv/kernel/probes/ftrace.c +++ /dev/null @@ -1,65 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include - -/* Ftrace callback handler for kprobes -- called under preepmt disabled */ -void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) -{ - struct kprobe *p; - struct pt_regs *regs; - struct kprobe_ctlblk *kcb; - int bit; - - if (unlikely(kprobe_ftrace_disabled)) - return; - - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) - return; - - p = get_kprobe((kprobe_opcode_t *)ip); - if (unlikely(!p) || kprobe_disabled(p)) - goto out; - - regs = ftrace_get_regs(fregs); - kcb = get_kprobe_ctlblk(); - if (kprobe_running()) { - kprobes_inc_nmissed_count(p); - } else { - unsigned long orig_ip = instruction_pointer(regs); - - instruction_pointer_set(regs, ip); - - __this_cpu_write(current_kprobe, p); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (!p->pre_handler || !p->pre_handler(p, regs)) { - /* - * Emulate singlestep (and also recover regs->pc) - * as if there is a nop - */ - instruction_pointer_set(regs, - (unsigned long)p->addr + MCOUNT_INSN_SIZE); - if (unlikely(p->post_handler)) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - instruction_pointer_set(regs, orig_ip); - } - - /* - * If pre_handler returns !0, it changes regs->pc. We have to - * skip emulating post_handler. 
- */ - __this_cpu_write(current_kprobe, NULL); - } -out: - ftrace_test_recursion_unlock(bit); -} -NOKPROBE_SYMBOL(kprobe_ftrace_handler); - -int arch_prepare_kprobe_ftrace(struct kprobe *p) -{ - p->ainsn.api.insn = NULL; - return 0; -} -- cgit From 8d22d0db5bbcec95c70e025dce3d00821da8be7d Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 13 Jun 2024 23:30:53 +0800 Subject: riscv: boot: remove duplicated targets line The "targets:" is duplicated in another line, remove the one with less targets. Signed-off-by: Jisheng Zhang Reviewed-by: Emil Renner Berthing Link: https://lore.kernel.org/r/20240613153053.3835-1-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/boot/Makefile b/arch/riscv/boot/Makefile index 869c0345b908..4e9e7a28bf9b 100644 --- a/arch/riscv/boot/Makefile +++ b/arch/riscv/boot/Makefile @@ -18,7 +18,6 @@ OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S OBJCOPYFLAGS_loader.bin :=-O binary OBJCOPYFLAGS_xipImage :=-O binary -R .note -R .note.gnu.build-id -R .comment -S -targets := Image Image.* loader loader.o loader.lds loader.bin targets := Image Image.* loader loader.o loader.lds loader.bin xipImage ifeq ($(CONFIG_XIP_KERNEL),y) -- cgit From f8b632e89a101dae349a7b212c1771d7925f441b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:16 +0100 Subject: io_uring: tighten task exit cancellations io_uring_cancel_generic() should retry if any state changes like a request is completed, however in case of a task exit it only goes for another loop and avoids schedule() if any tracked (i.e. REQ_F_INFLIGHT) request got completed. Let's assume we have a non-tracked request executing in iowq and a tracked request linked to it. Let's also assume io_uring_cancel_generic() fails to find and cancel the request, i.e. via io_run_local_work(), which may happen as io-wq has gaps. Next, the request logically completes, io-wq still hold a ref but queues it for completion via tw, which happens in io_uring_try_cancel_requests(). After, right before prepare_to_wait() io-wq puts the request, grabs the linked one and tries executes it, e.g. arms polling. Finally the cancellation loop calls prepare_to_wait(), there are no tw to run, no tracked request was completed, so the tctx_inflight() check passes and the task is put to indefinite sleep. 
Cc: stable@vger.kernel.org Fixes: 3f48cf18f886c ("io_uring: unify files and task cancel") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/acac7311f4e02ce3c43293f8f1fda9c705d158f1.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8e6faa942a6f..10c409e56241 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3031,8 +3031,11 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) bool loop = false; io_uring_drop_tctx_refs(current); + if (!tctx_inflight(tctx, !cancel_all)) + break; + /* read completions before cancelations */ - inflight = tctx_inflight(tctx, !cancel_all); + inflight = tctx_inflight(tctx, false); if (!inflight) break; -- cgit From bd44d7e902c2b34c217d3b48874b079760ca7b6e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:17 +0100 Subject: io_uring: don't allow netpolling with SETUP_IOPOLL IORING_SETUP_IOPOLL rings don't have any netpoll handling, let's fail attempts to register netpolling in this case, there might be people who will mix up IOPOLL and netpoll. Cc: stable@vger.kernel.org Fixes: ef1186c1a875b ("io_uring: add register/unregister napi function") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1e7553aee0a8ae4edec6742cd6dd0c1e6914fba8.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/napi.c b/io_uring/napi.c index 762254a7ff3f..327e5f3a8abe 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -222,6 +222,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) }; struct io_uring_napi napi; + if (ctx->flags & IORING_SETUP_IOPOLL) + return -EINVAL; if (copy_from_user(&napi, arg, sizeof(napi))) return -EFAULT; if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv) -- cgit From e142e9cd8891b0c6f277ac2c2c254199a6aa56e3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:18 +0100 Subject: io_uring: fix io_match_task must_hold The __must_hold annotation in io_match_task() uses a non existing parameter "req", fix it. Fixes: 6af3f48bf6156 ("io_uring: fix link traversal locking") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3e65ee7709e96507cef3d93291746f2c489f2307.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 1c9bf07499b1..9973876d91b0 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -639,7 +639,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) - __must_hold(&req->ctx->timeout_lock) + __must_hold(&head->ctx->timeout_lock) { struct io_kiocb *req; -- cgit From f1dcdfcadb0c8c13dddd931c1f4dc58e54fdc9c0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:19 +0100 Subject: io_uring: simplify io_uring_cmd return We don't have to return error code from an op handler back to core io_uring, so once io_uring_cmd() sets the results and handles errors we can juts return IOU_OK and simplify the code. Note, only valid with e0b23d9953b0c ("io_uring: optimise ltimeout for inline execution"), there was a problem with iopoll before. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8eae2be5b2a49236cd5f1dadbd1aa5730e9e2d4f.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index a54163a83968..8391c7c7c1ec 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -265,7 +265,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return ret < 0 ? ret : IOU_OK; + return IOU_OK; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, -- cgit From a2b72b81fb3ba18717fc000949ca9d45a3351130 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:20 +0100 Subject: io_uring: kill REQ_F_CANCEL_SEQ We removed the reliance on the flag by the cancellation path and now it's unused. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e57afe566bbe4fefeb44daffb08900f2a4756577.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3bb6198d1523..e62aa9f0629f 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -461,7 +461,6 @@ enum { REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, REQ_F_POLL_NO_LAZY_BIT, - REQ_F_CANCEL_SEQ_BIT, REQ_F_CAN_POLL_BIT, REQ_F_BL_EMPTY_BIT, REQ_F_BL_NO_RECYCLE_BIT, @@ -536,8 +535,6 @@ enum { REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), /* don't use lazy poll wake for this request */ REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), - /* cancel sequence is set and valid */ - REQ_F_CANCEL_SEQ = IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT), /* file is pollable */ REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT), /* buffer list was empty after selection of buffer */ -- cgit From 29d63b94036e561a016ec8878b44aad6650d23e2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:21 +0100 Subject: io_uring: align iowq and task request error handling There is a difference in how io_queue_sqe and io_wq_submit_work treat error codes they get from io_issue_sqe. The first one fails anything unknown but latter only fails when the code is negative. It doesn't make sense to have this discrepancy, align them to the io_queue_sqe behaviour. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c550e152bf4a290187f91a4322ddcb5d6d1f2c73.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 10c409e56241..2626424f5d73 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1849,7 +1849,7 @@ fail: } while (1); /* avoid locking problems by failing it from a clean context */ - if (ret < 0) + if (ret) io_req_task_queue_fail(req, ret); } -- cgit From 38738947db38520b58b7dae64bd0eec513e83139 Mon Sep 17 00:00:00 2001 From: Sia Jee Heng Date: Thu, 2 May 2024 00:37:51 -0700 Subject: RISC-V: ACPI: Enable SPCR table for console output on RISC-V The ACPI SPCR code has been used to enable console output for ARM64 and X86. The same code can be reused for RISC-V. Furthermore, SPCR table is mandated for headless system as outlined in the RISC-V BRS Specification, chapter 6. 
Signed-off-by: Sia Jee Heng Reviewed-by: Sunil V L Link: https://lore.kernel.org/r/20240502073751.102093-2-jeeheng.sia@starfivetech.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/kernel/acpi.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index b94176e25be1..287c17daa9d5 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -14,6 +14,7 @@ config RISCV def_bool y select ACPI_GENERIC_GSI if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI + select ACPI_SPCR_TABLE if ACPI select ARCH_DMA_DEFAULT_COHERENT select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c index e619edc8b0cc..43a12c00ae8b 100644 --- a/arch/riscv/kernel/acpi.c +++ b/arch/riscv/kernel/acpi.c @@ -17,7 +17,9 @@ #include #include #include +#include #include +#include int acpi_noirq = 1; /* skip ACPI IRQ initialization */ int acpi_disabled = 1; @@ -131,7 +133,7 @@ void __init acpi_boot_table_init(void) if (param_acpi_off || (!param_acpi_on && !param_acpi_force && efi.acpi20 == EFI_INVALID_TABLE_ADDR)) - return; + goto done; /* * ACPI is disabled at this point. Enable it in order to parse @@ -151,6 +153,14 @@ void __init acpi_boot_table_init(void) if (!param_acpi_force) disable_acpi(); } + +done: + if (acpi_disabled) { + if (earlycon_acpi_spcr_enable) + early_init_dt_scan_chosen_stdout(); + } else { + acpi_parse_spcr(earlycon_acpi_spcr_enable, true); + } } static int acpi_parse_madt_rintc(union acpi_subtable_headers *header, const unsigned long end) -- cgit From ee3fab10cb1566562aa683f319066eaeecccf918 Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Mon, 17 Jun 2024 21:14:23 +0800 Subject: riscv: cacheinfo: remove the useless input parameter (node) of ci_leaf_init() ci_leaf_init() is a declared static function. The implementation of the function body and the caller do not use the parameter (struct device_node *node) input parameter, so remove it. 
Fixes: 6a24915145c9 ("Revert "riscv: Set more data to cacheinfo"") Signed-off-by: Yunhui Cui Reviewed-by: Jeremy Linton Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20240617131425.7526-1-cuiyunhui@bytedance.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cacheinfo.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index 09e9b88110d1..30a6878287ad 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -64,7 +64,6 @@ uintptr_t get_cache_geometry(u32 level, enum cache_type type) } static void ci_leaf_init(struct cacheinfo *this_leaf, - struct device_node *node, enum cache_type type, unsigned int level) { this_leaf->level = level; @@ -80,11 +79,11 @@ int populate_cache_leaves(unsigned int cpu) int levels = 1, level = 1; if (of_property_read_bool(np, "cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_UNIFIED, level); + ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); if (of_property_read_bool(np, "i-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_INST, level); + ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); if (of_property_read_bool(np, "d-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_DATA, level); + ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); prev = np; while ((np = of_find_next_cache_node(np))) { @@ -97,11 +96,11 @@ int populate_cache_leaves(unsigned int cpu) if (level <= levels) break; if (of_property_read_bool(np, "cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_UNIFIED, level); + ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); if (of_property_read_bool(np, "i-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_INST, level); + ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); if (of_property_read_bool(np, "d-cache-size")) - ci_leaf_init(this_leaf++, np, CACHE_TYPE_DATA, level); + ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); levels = level; } of_node_put(np); -- cgit From 604f32ea6909b0ebb8ab0bf1ab7dc66ee3dc8955 Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Mon, 17 Jun 2024 21:14:24 +0800 Subject: riscv: cacheinfo: initialize cacheinfo's level and type from ACPI PPTT Before cacheinfo can be built correctly, we need to initialize level and type. Since RISC-V currently does not have a register group that describes cache-related attributes like ARM64, we cannot obtain them directly, so now we obtain cache leaves from the ACPI PPTT table (acpi_get_cache_info()) and set the cache type through split_levels. 
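A rough standalone sketch of how the two values are interpreted (illustration only; the real logic is in populate_cache_leaves() in the diff below): levels up to split_levels get separate data and instruction leaves, higher levels get a single unified leaf.

  #include <stdio.h>

  /* e.g. levels = 3, split_levels = 1:
   *   L1 -> data + instruction leaves, L2 and L3 -> unified leaves */
  static void describe_leaves(int levels, int split_levels)
  {
          for (int level = 1; level <= levels; level++) {
                  if (level <= split_levels)
                          printf("L%d: data + instruction\n", level);
                  else
                          printf("L%d: unified\n", level);
          }
  }

  int main(void)
  {
          describe_leaves(3, 1);
          return 0;
  }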
Suggested-by: Jeremy Linton Suggested-by: Sudeep Holla Reviewed-by: Conor Dooley Reviewed-by: Sunil V L Reviewed-by: Jeremy Linton Reviewed-by: Sudeep Holla Signed-off-by: Yunhui Cui Link: https://lore.kernel.org/r/20240617131425.7526-2-cuiyunhui@bytedance.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cacheinfo.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index 30a6878287ad..d6c108c50cba 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -3,6 +3,7 @@ * Copyright (C) 2017 SiFive */ +#include #include #include #include @@ -78,6 +79,27 @@ int populate_cache_leaves(unsigned int cpu) struct device_node *prev = NULL; int levels = 1, level = 1; + if (!acpi_disabled) { + int ret, fw_levels, split_levels; + + ret = acpi_get_cache_info(cpu, &fw_levels, &split_levels); + if (ret) + return ret; + + BUG_ON((split_levels > fw_levels) || + (split_levels + fw_levels > this_cpu_ci->num_leaves)); + + for (; level <= this_cpu_ci->num_levels; level++) { + if (level <= split_levels) { + ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); + ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + } else { + ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); + } + } + return 0; + } + if (of_property_read_bool(np, "cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); if (of_property_read_bool(np, "i-cache-size")) -- cgit From 66381d36771e4ffedbea0dd23da17cf3d38e9aae Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Mon, 17 Jun 2024 21:14:25 +0800 Subject: RISC-V: Select ACPI PPTT drivers After adding ACPI support to populate_cache_leaves(), RISC-V can build cacheinfo through the ACPI PPTT table, thus enabling the ACPI_PPTT configuration. Signed-off-by: Yunhui Cui Reviewed-by: Jeremy Linton Reviewed-by: Sudeep Holla Reviewed-by: Sunil V L Link: https://lore.kernel.org/r/20240617131425.7526-3-cuiyunhui@bytedance.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index b94176e25be1..15f062e1d5ee 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -13,6 +13,7 @@ config 32BIT config RISCV def_bool y select ACPI_GENERIC_GSI if ACPI + select ACPI_PPTT if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI select ARCH_DMA_DEFAULT_COHERENT select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION -- cgit From 39705a6c29f8a2b93cf5b99528a55366c50014d1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 24 Jul 2024 14:49:01 +0200 Subject: landlock: Don't lose track of restrictions on cred_transfer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a process' cred struct is replaced, this _almost_ always invokes the cred_prepare LSM hook; but in one special case (when KEYCTL_SESSION_TO_PARENT updates the parent's credentials), the cred_transfer LSM hook is used instead. Landlock only implements the cred_prepare hook, not cred_transfer, so KEYCTL_SESSION_TO_PARENT causes all information on Landlock restrictions to be lost. This basically means that a process with the ability to use the fork() and keyctl() syscalls can get rid of all Landlock restrictions on itself. Fix it by adding a cred_transfer hook that does the same thing as the existing cred_prepare hook. (Implemented by having hook_cred_prepare() call hook_cred_transfer() so that the two functions are less likely to accidentally diverge in the future.) 
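For illustration only, a simplified userspace sketch of the escape described above (not part of the patch; the selftest added in the follow-up commit exercises the same path with full checks). Before this fix, by the time waitpid() returned in the parent, the parent's Landlock domain had been dropped.

  #define _GNU_SOURCE
  #include <linux/keyctl.h>
  #include <sys/syscall.h>
  #include <sys/wait.h>
  #include <unistd.h>

  int main(void)
  {
          /* give the parent its own session keyring so the child's
           * KEYCTL_SESSION_TO_PARENT has something to replace */
          syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL, 0, 0, 0);

          pid_t child = fork();
          if (child == 0) {
                  /* new session keyring, then push the child's credentials
                   * onto the parent; this installs creds via cred_transfer,
                   * bypassing the cred_prepare hook */
                  syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL, 0, 0, 0);
                  syscall(__NR_keyctl, KEYCTL_SESSION_TO_PARENT, 0, 0, 0, 0);
                  _exit(0);
          }
          waitpid(child, NULL, 0);
          /* parent continues here; before the fix, its Landlock
           * restrictions were gone at this point */
          return 0;
  }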
Cc: stable@kernel.org Fixes: 385975dca53e ("landlock: Set up the security framework and manage credentials") Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20240724-landlock-houdini-fix-v1-1-df89a4560ca3@google.com Signed-off-by: Mickaël Salaün --- security/landlock/cred.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/security/landlock/cred.c b/security/landlock/cred.c index 786af18c4a1c..db9fe7d906ba 100644 --- a/security/landlock/cred.c +++ b/security/landlock/cred.c @@ -14,8 +14,8 @@ #include "ruleset.h" #include "setup.h" -static int hook_cred_prepare(struct cred *const new, - const struct cred *const old, const gfp_t gfp) +static void hook_cred_transfer(struct cred *const new, + const struct cred *const old) { struct landlock_ruleset *const old_dom = landlock_cred(old)->domain; @@ -23,6 +23,12 @@ static int hook_cred_prepare(struct cred *const new, landlock_get_ruleset(old_dom); landlock_cred(new)->domain = old_dom; } +} + +static int hook_cred_prepare(struct cred *const new, + const struct cred *const old, const gfp_t gfp) +{ + hook_cred_transfer(new, old); return 0; } @@ -36,6 +42,7 @@ static void hook_cred_free(struct cred *const cred) static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, hook_cred_prepare), + LSM_HOOK_INIT(cred_transfer, hook_cred_transfer), LSM_HOOK_INIT(cred_free, hook_cred_free), }; -- cgit From cc374782b6ca0fd634482391da977542443d3368 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Wed, 24 Jul 2024 16:54:26 +0200 Subject: selftests/landlock: Add cred_transfer test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check that keyctl(KEYCTL_SESSION_TO_PARENT) preserves the parent's restrictions. Fixes: e1199815b47b ("selftests/landlock: Add user space tests") Co-developed-by: Jann Horn Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20240724.Ood5aige9she@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/base_test.c | 74 ++++++++++++++++++++++++++++ tools/testing/selftests/landlock/config | 1 + 2 files changed, 75 insertions(+) diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index 3c1e9f35b531..3b26bf3cf5b9 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -9,6 +9,7 @@ #define _GNU_SOURCE #include #include +#include #include #include #include @@ -326,4 +327,77 @@ TEST(ruleset_fd_transfer) ASSERT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); } +TEST(cred_transfer) +{ + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR, + }; + int ruleset_fd, dir_fd; + pid_t child; + int status; + + drop_caps(_metadata); + + dir_fd = open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC); + EXPECT_LE(0, dir_fd); + EXPECT_EQ(0, close(dir_fd)); + + /* Denies opening directories. */ + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0)); + EXPECT_EQ(0, close(ruleset_fd)); + + /* Checks ruleset enforcement. 
*/ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); + + /* Needed for KEYCTL_SESSION_TO_PARENT permission checks */ + EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL, 0, + 0, 0)) + { + TH_LOG("Failed to join session keyring: %s", strerror(errno)); + } + + child = fork(); + ASSERT_LE(0, child); + if (child == 0) { + /* Checks ruleset enforcement. */ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); + + /* + * KEYCTL_SESSION_TO_PARENT is a no-op unless we have a + * different session keyring in the child, so make that happen. + */ + EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, + NULL, 0, 0, 0)); + + /* + * KEYCTL_SESSION_TO_PARENT installs credentials on the parent + * that never go through the cred_prepare hook, this path uses + * cred_transfer instead. + */ + EXPECT_EQ(0, syscall(__NR_keyctl, KEYCTL_SESSION_TO_PARENT, 0, + 0, 0, 0)); + + /* Re-checks ruleset enforcement. */ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); + + _exit(_metadata->exit_code); + return; + } + + EXPECT_EQ(child, waitpid(child, &status, 0)); + EXPECT_EQ(1, WIFEXITED(status)); + EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); + + /* Re-checks ruleset enforcement. */ + EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + EXPECT_EQ(EACCES, errno); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config index 0086efaa7b68..29af19c4e9f9 100644 --- a/tools/testing/selftests/landlock/config +++ b/tools/testing/selftests/landlock/config @@ -2,6 +2,7 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_SCHED=y CONFIG_INET=y CONFIG_IPV6=y +CONFIG_KEYS=y CONFIG_NET=y CONFIG_NET_NS=y CONFIG_OVERLAY_FS=y -- cgit From 7e04da2dc7013af50ed3a2beb698d5168d1e594b Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 24 Jul 2024 15:04:12 +0800 Subject: block: fix deadlock between sd_remove & sd_release Our test report the following hung task: [ 2538.459400] INFO: task "kworker/0:0":7 blocked for more than 188 seconds. [ 2538.459427] Call trace: [ 2538.459430] __switch_to+0x174/0x338 [ 2538.459436] __schedule+0x628/0x9c4 [ 2538.459442] schedule+0x7c/0xe8 [ 2538.459447] schedule_preempt_disabled+0x24/0x40 [ 2538.459453] __mutex_lock+0x3ec/0xf04 [ 2538.459456] __mutex_lock_slowpath+0x14/0x24 [ 2538.459459] mutex_lock+0x30/0xd8 [ 2538.459462] del_gendisk+0xdc/0x350 [ 2538.459466] sd_remove+0x30/0x60 [ 2538.459470] device_release_driver_internal+0x1c4/0x2c4 [ 2538.459474] device_release_driver+0x18/0x28 [ 2538.459478] bus_remove_device+0x15c/0x174 [ 2538.459483] device_del+0x1d0/0x358 [ 2538.459488] __scsi_remove_device+0xa8/0x198 [ 2538.459493] scsi_forget_host+0x50/0x70 [ 2538.459497] scsi_remove_host+0x80/0x180 [ 2538.459502] usb_stor_disconnect+0x68/0xf4 [ 2538.459506] usb_unbind_interface+0xd4/0x280 [ 2538.459510] device_release_driver_internal+0x1c4/0x2c4 [ 2538.459514] device_release_driver+0x18/0x28 [ 2538.459518] bus_remove_device+0x15c/0x174 [ 2538.459523] device_del+0x1d0/0x358 [ 2538.459528] usb_disable_device+0x84/0x194 [ 2538.459532] usb_disconnect+0xec/0x300 [ 2538.459537] hub_event+0xb80/0x1870 [ 2538.459541] process_scheduled_works+0x248/0x4dc [ 2538.459545] worker_thread+0x244/0x334 [ 2538.459549] kthread+0x114/0x1bc [ 2538.461001] INFO: task "fsck.":15415 blocked for more than 188 seconds. 
[ 2538.461014] Call trace: [ 2538.461016] __switch_to+0x174/0x338 [ 2538.461021] __schedule+0x628/0x9c4 [ 2538.461025] schedule+0x7c/0xe8 [ 2538.461030] blk_queue_enter+0xc4/0x160 [ 2538.461034] blk_mq_alloc_request+0x120/0x1d4 [ 2538.461037] scsi_execute_cmd+0x7c/0x23c [ 2538.461040] ioctl_internal_command+0x5c/0x164 [ 2538.461046] scsi_set_medium_removal+0x5c/0xb0 [ 2538.461051] sd_release+0x50/0x94 [ 2538.461054] blkdev_put+0x190/0x28c [ 2538.461058] blkdev_release+0x28/0x40 [ 2538.461063] __fput+0xf8/0x2a8 [ 2538.461066] __fput_sync+0x28/0x5c [ 2538.461070] __arm64_sys_close+0x84/0xe8 [ 2538.461073] invoke_syscall+0x58/0x114 [ 2538.461078] el0_svc_common+0xac/0xe0 [ 2538.461082] do_el0_svc+0x1c/0x28 [ 2538.461087] el0_svc+0x38/0x68 [ 2538.461090] el0t_64_sync_handler+0x68/0xbc [ 2538.461093] el0t_64_sync+0x1a8/0x1ac T1: T2: sd_remove del_gendisk __blk_mark_disk_dead blk_freeze_queue_start ++q->mq_freeze_depth bdev_release mutex_lock(&disk->open_mutex) sd_release scsi_execute_cmd blk_queue_enter wait_event(!q->mq_freeze_depth) mutex_lock(&disk->open_mutex) SCSI does not set GD_OWNS_QUEUE, so QUEUE_FLAG_DYING is not set in this scenario. This is a classic ABBA deadlock. To fix the deadlock, make sure we don't try to acquire disk->open_mutex after freezing the queue. Cc: stable@vger.kernel.org Fixes: eec1be4c30df ("block: delete partitions later in del_gendisk") Signed-off-by: Yang Yang Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Fixes: and Cc: stable tags are missing. Otherwise this patch looks fine Link: https://lore.kernel.org/r/20240724070412.22521-1-yang.yang@vivo.com Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 4dc95a463505..1c05dd4c6980 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -663,12 +663,12 @@ void del_gendisk(struct gendisk *disk) */ if (!test_bit(GD_DEAD, &disk->state)) blk_report_disk_dead(disk, false); - __blk_mark_disk_dead(disk); /* * Drop all partitions now that the disk is marked dead. */ mutex_lock(&disk->open_mutex); + __blk_mark_disk_dead(disk); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); -- cgit From 55fbb9a5d64e0e590cad5eacc16c99f2482a008f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 24 Jul 2024 22:33:11 +0800 Subject: ublk: fix UBLK_CMD_DEL_DEV_ASYNC handling In ublk_ctrl_uring_cmd(), ioctl command NR should be used for matching _IOC_NR(cmd_op). Fix it by adding one private macro, and this way is clean. 
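For illustration, a minimal user-space sketch (hypothetical command number, not taken from the ublk UAPI) of why a full _IOWR()-encoded value can never compare equal to a bare command number: _IOC_NR() strips the direction, type and size fields and keeps only the low 8 "number" bits, which is what the switch in ublk_ctrl_uring_cmd() operates on.

    #include <stdio.h>
    #include <linux/ioctl.h>

    /* Hypothetical encoded command, modelled on the UBLK_U_CMD_* style. */
    #define EXAMPLE_U_CMD   _IOWR('u', 0x0e, int)

    int main(void)
    {
            /* The encoded value carries direction/type/size bits... */
            printf("full cmd_op:  0x%08x\n", (unsigned int)EXAMPLE_U_CMD);
            /* ...while _IOC_NR() reduces it to the plain command number 0x0e. */
            printf("_IOC_NR(cmd): 0x%02x\n", (unsigned int)_IOC_NR(EXAMPLE_U_CMD));
            return 0;
    }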
Fixes: 13fe8e6825e4 ("ublk: add UBLK_CMD_DEL_DEV_ASYNC") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240724143311.2646330-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6fb799ec0d88..890c08792ba8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -48,6 +48,9 @@ #define UBLK_MINORS (1U << MINORBITS) +/* private ioctl command mirror */ +#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) + /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ @@ -2903,7 +2906,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_DEL_DEV: ret = ublk_ctrl_del_dev(&ub, true); break; - case UBLK_U_CMD_DEL_DEV_ASYNC: + case UBLK_CMD_DEL_DEV_ASYNC: ret = ublk_ctrl_del_dev(&ub, false); break; case UBLK_CMD_GET_QUEUE_AFFINITY: -- cgit From e6e18021ddd0dc5af487fb86b6d7c964e062d692 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Tue, 23 Jul 2024 13:12:24 +1200 Subject: ALSA: hda/realtek: cs35l41: Fixup remaining asus strix models Adjust quirks for 0x3a20, 0x3a30, 0x3a50 to match the 0x3a60. This set has now been confirmed to work with this patch. Signed-off-by: Luke D. Jones Fixes: 811dd426a9b1 ("ALSA: hda/realtek: Add quirks for Asus ROG 2024 laptops using CS35L41") Cc: Link: https://patch.msgid.link/20240723011224.115579-1-luke@ljones.dev Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c3a86a99f8c6..462194bf6954 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10359,10 +10359,10 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1f62, "ASUS UX7602ZM", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1f92, "ASUS ROG Flow X16", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x3030, "ASUS ZN270IE", ALC256_FIXUP_ASUS_AIO_GPIO2), - SND_PCI_QUIRK(0x1043, 0x3a20, "ASUS G614JZR", ALC245_FIXUP_CS35L41_SPI_2), - SND_PCI_QUIRK(0x1043, 0x3a30, "ASUS G814JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x3a20, "ASUS G614JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), + SND_PCI_QUIRK(0x1043, 0x3a30, "ASUS G814JVR/JIR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x3a40, "ASUS G814JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), - SND_PCI_QUIRK(0x1043, 0x3a50, "ASUS G834JYR/JZR", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x3a50, "ASUS G834JYR/JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x3a60, "ASUS G634JYR/JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x831a, "ASUS P901", ALC269_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x1043, 0x834a, "ASUS S101", ALC269_FIXUP_STEREO_DMIC), -- cgit From ff9fbcafbaf13346c742c0d672a22f5ac20b9d92 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 23 Jul 2024 18:21:51 +0200 Subject: selftests/hid: fix bpf_wq new API Since commit f56f4d541eab ("bpf: helpers: fix bpf_wq_set_callback_impl signature"), the API for bpf_wq changed a bit. We need to update the selftests/hid code to reflect that or the bpf program will not load. 
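As a quick illustration (a sketch only; the selftest diff below is the authoritative change), the callback prototype expected by bpf_wq_set_callback() now takes an opaque pointer in its last slot instead of struct bpf_wq *:

    /* old shape, rejected after commit f56f4d541eab: */
    /* static int wq_cb(void *map, int *key, struct bpf_wq *wq); */

    /* new shape the verifier expects: */
    static int wq_cb(void *map, int *key, void *value)
    {
            return 0;
    }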
Link: https://patch.msgid.link/20240723-fix-6-11-bpf-v1-1-b9d770346784@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/progs/hid.c | 2 +- tools/testing/selftests/hid/progs/hid_bpf_helpers.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c index ee9bbbcf751b..5ecc845ef792 100644 --- a/tools/testing/selftests/hid/progs/hid.c +++ b/tools/testing/selftests/hid/progs/hid.c @@ -455,7 +455,7 @@ struct { __type(value, struct elem); } hmap SEC(".maps"); -static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +static int wq_cb_sleepable(void *map, int *key, void *work) { __u8 buf[9] = {2, 3, 4, 5, 6, 7, 8, 9, 10}; struct hid_bpf_ctx *hid_ctx; diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h index cfe37f491906..e5db897586bb 100644 --- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h +++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h @@ -114,7 +114,7 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx, extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + int (callback_fn)(void *map, int *key, void *wq), unsigned int flags__k, void *aux__ign) __ksym; #define bpf_wq_set_callback(timer, cb, flags) \ bpf_wq_set_callback_impl(timer, cb, flags, NULL) -- cgit From f64c1a4593391c57accf32693a14ef45f8162b5c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 23 Jul 2024 18:21:52 +0200 Subject: selftests/hid: disable struct_ops auto-attach Since commit 08ac454e258e ("libbpf: Auto-attach struct_ops BPF maps in BPF skeleton"), libbpf automatically calls bpf_map__attach_struct_ops() on every struct_ops it sees in the bpf object. The problem is that our test bpf object has many of them but only one should be manually loaded at a time, or we end up locking the syscall. 
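The resulting libbpf usage pattern, sketched with hypothetical skeleton and map names (skel, example_skel__load() and tested_ops are placeholders; the real test change is in the diff below): turn auto-attach off on every map before loading the skeleton, then attach only the struct_ops under test.

    struct bpf_map *map;
    struct bpf_link *link;

    /* Keep libbpf from attaching every struct_ops map on its own. */
    bpf_object__for_each_map(map, skel->obj)
            bpf_map__set_autoattach(map, false);

    if (example_skel__load(skel))   /* hypothetical skeleton load helper */
            return -EINVAL;

    /* Attach only the struct_ops map this test exercises. */
    link = bpf_map__attach_struct_ops(skel->maps.tested_ops);
    if (!link)
            return -errno;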
Link: https://patch.msgid.link/20240723-fix-6-11-bpf-v1-2-b9d770346784@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/hid_bpf.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index dc0408a831d0..9c935fd0dffc 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -532,6 +532,7 @@ static void load_programs(const struct test_program programs[], FIXTURE_DATA(hid_bpf) * self, const FIXTURE_VARIANT(hid_bpf) * variant) { + struct bpf_map *iter_map; int err = -EINVAL; ASSERT_LE(progs_count, ARRAY_SIZE(self->hid_links)) @@ -564,6 +565,13 @@ static void load_programs(const struct test_program programs[], *ops_hid_id = self->hid_id; } + /* we disable the auto-attach feature of all maps because we + * only want the tested one to be manually attached in the next + * call to bpf_map__attach_struct_ops() + */ + bpf_object__for_each_map(iter_map, *self->skel->skeleton->obj) + bpf_map__set_autoattach(iter_map, false); + err = hid__load(self->skel); ASSERT_OK(err) TH_LOG("hid_skel_load failed: %d", err); -- cgit From acd34cfc48b3dd46e5e4c4bdc99cc0c15568bac0 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 23 Jul 2024 18:21:53 +0200 Subject: HID: bpf: prevent the same struct_ops to be attached more than once If the struct_ops is already attached, we should bail out or we will end up in various locks and pointer issues while unregistering. Link: https://patch.msgid.link/20240723-fix-6-11-bpf-v1-3-b9d770346784@kernel.org Signed-off-by: Benjamin Tissoires --- drivers/hid/bpf/hid_bpf_struct_ops.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hid/bpf/hid_bpf_struct_ops.c b/drivers/hid/bpf/hid_bpf_struct_ops.c index f59cce6e437f..cd696c59ba0f 100644 --- a/drivers/hid/bpf/hid_bpf_struct_ops.c +++ b/drivers/hid/bpf/hid_bpf_struct_ops.c @@ -183,6 +183,10 @@ static int hid_bpf_reg(void *kdata, struct bpf_link *link) struct hid_device *hdev; int count, err = 0; + /* prevent multiple attach of the same struct_ops */ + if (ops->hdev) + return -EINVAL; + hdev = hid_get_device(ops->hid_id); if (IS_ERR(hdev)) return PTR_ERR(hdev); @@ -248,6 +252,7 @@ static void hid_bpf_unreg(void *kdata, struct bpf_link *link) list_del_rcu(&ops->list); synchronize_srcu(&hdev->bpf.srcu); + ops->hdev = NULL; reconnect = hdev->bpf.rdesc_ops == ops; if (reconnect) -- cgit From facdbdfe0e6202d74758387ae9189c39f7b4b16c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 23 Jul 2024 18:21:54 +0200 Subject: selftests/hid: add test for attaching multiple time the same struct_ops Turns out that we would en up in a bad state if we attempt to attach twice the same HID-BPF struct_ops, so have a test for it. 
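With the kernel-side guard from the previous patch, the expected behaviour is that a second attach of the same struct_ops map is refused rather than wedging the syscall; a brief sketch with a hypothetical map name:

    struct bpf_link *first, *second;

    first = bpf_map__attach_struct_ops(skel->maps.tested_ops);
    /* first is a valid link: the struct_ops is now bound to the device */

    second = bpf_map__attach_struct_ops(skel->maps.tested_ops);
    /* second is NULL: the reg hook now bails out (-EINVAL) instead of locking up */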
Link: https://patch.msgid.link/20240723-fix-6-11-bpf-v1-4-b9d770346784@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/hid_bpf.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index 9c935fd0dffc..75b7b4ef6cfa 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -694,6 +694,24 @@ TEST_F(hid_bpf, subprog_raw_event) ASSERT_EQ(buf[2], 52); } +/* + * Attach hid_first_event to the given uhid device, + * attempt at re-attaching it, we should not lock and + * return an invalid struct bpf_link + */ +TEST_F(hid_bpf, multiple_attach) +{ + const struct test_program progs[] = { + { .name = "hid_first_event" }, + }; + struct bpf_link *link; + + LOAD_PROGRAMS(progs); + + link = bpf_map__attach_struct_ops(self->skel->maps.first_event); + ASSERT_NULL(link) TH_LOG("unexpected return value when re-attaching the struct_ops"); +} + /* * Ensures that we can attach/detach programs */ -- cgit From 33be0cfa5ba522ba88ba25cb95e582932843409b Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 28 Jun 2024 17:37:12 +0200 Subject: apparmor: take nosymfollow flag into account A "nosymfollow" flag was added in commit dab741e0e02b ("Add a "nosymfollow" mount option.") While we don't need to implement any special logic on the AppArmor kernel side to handle it, we should provide user with a correct list of mount flags in audit logs. Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Georgia Garcia Signed-off-by: John Johansen --- security/apparmor/mount.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index 49fe8da6fea4..bf8863253e07 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -44,6 +44,8 @@ static void audit_mnt_flags(struct audit_buffer *ab, unsigned long flags) audit_log_format(ab, ", mand"); if (flags & MS_DIRSYNC) audit_log_format(ab, ", dirsync"); + if (flags & MS_NOSYMFOLLOW) + audit_log_format(ab, ", nosymfollow"); if (flags & MS_NOATIME) audit_log_format(ab, ", noatime"); if (flags & MS_NODIRATIME) -- cgit From 4b954a025591a1c7d3a0c0111b6d4730596046b6 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Wed, 29 May 2024 18:21:39 -0700 Subject: apparmor: test: add MODULE_DESCRIPTION() Fix the 'make W=1' warning: WARNING: modpost: missing MODULE_DESCRIPTION() in security/apparmor/apparmor_policy_unpack_test.o Signed-off-by: Jeff Johnson Signed-off-by: John Johansen --- security/apparmor/policy_unpack_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/apparmor/policy_unpack_test.c b/security/apparmor/policy_unpack_test.c index 5c9bde25e56d..874fcf97794e 100644 --- a/security/apparmor/policy_unpack_test.c +++ b/security/apparmor/policy_unpack_test.c @@ -604,4 +604,5 @@ static struct kunit_suite apparmor_policy_unpack_test_module = { kunit_test_suite(apparmor_policy_unpack_test_module); +MODULE_DESCRIPTION("KUnit tests for AppArmor's policy unpack"); MODULE_LICENSE("GPL"); -- cgit From f4fee216df7d28b87d1c9cc60bcebfecb51c1a05 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Jun 2024 19:15:27 +0200 Subject: apparmor: try to avoid refing the label in apparmor_file_open If the label is not stale (which is the common case), the fact that the passed file object holds a reference can be leverged to avoid the ref/unref cycle. Doing so reduces performance impact of apparmor on parallel open() invocations. 
When benchmarking on a 24-core vm using will-it-scale's open1_process ("Separate file open"), the results are (ops/s): before: 6092196 after: 8309726 (+36%) Signed-off-by: Mateusz Guzik Signed-off-by: John Johansen --- security/apparmor/include/cred.h | 20 ++++++++++++++++++++ security/apparmor/lsm.c | 5 +++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/security/apparmor/include/cred.h b/security/apparmor/include/cred.h index 58fdc72af664..7265d2f81dd5 100644 --- a/security/apparmor/include/cred.h +++ b/security/apparmor/include/cred.h @@ -63,6 +63,26 @@ static inline struct aa_label *aa_get_newest_cred_label(const struct cred *cred) return aa_get_newest_label(aa_cred_raw_label(cred)); } +static inline struct aa_label *aa_get_newest_cred_label_condref(const struct cred *cred, + bool *needput) +{ + struct aa_label *l = aa_cred_raw_label(cred); + + if (unlikely(label_is_stale(l))) { + *needput = true; + return aa_get_newest_label(l); + } + + *needput = false; + return l; +} + +static inline void aa_put_label_condref(struct aa_label *l, bool needput) +{ + if (unlikely(needput)) + aa_put_label(l); +} + /** * aa_current_raw_label - find the current tasks confining label * diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 717204fba938..242d4cf857a7 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -461,6 +461,7 @@ static int apparmor_file_open(struct file *file) struct aa_file_ctx *fctx = file_ctx(file); struct aa_label *label; int error = 0; + bool needput; if (!path_mediated_fs(file->f_path.dentry)) return 0; @@ -477,7 +478,7 @@ static int apparmor_file_open(struct file *file) return 0; } - label = aa_get_newest_cred_label(file->f_cred); + label = aa_get_newest_cred_label_condref(file->f_cred, &needput); if (!unconfined(label)) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); @@ -494,7 +495,7 @@ static int apparmor_file_open(struct file *file) /* todo cache full allowed permissions set and state */ fctx->allow = aa_map_file_to_perms(file); } - aa_put_label(label); + aa_put_label_condref(label, needput); return error; } -- cgit From e0ff0cff1f6cdce0aa596aac04129893201c4162 Mon Sep 17 00:00:00 2001 From: Georgia Garcia Date: Mon, 10 Jun 2024 09:51:48 -0300 Subject: apparmor: unpack transition table if dfa is not present Due to a bug in earlier userspaces, a transition table may be present even when the dfa is not. Commit 7572fea31e3e ("apparmor: convert fperm lookup to use accept as an index") made the verification check more rigourous regressing old userspaces with the bug. For compatibility reasons allow the orphaned transition table during unpack and discard. 
Fixes: 7572fea31e3e ("apparmor: convert fperm lookup to use accept as an index") Signed-off-by: Georgia Garcia Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 42 +++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 75452acd0e35..5a570235427d 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -747,34 +747,42 @@ static int unpack_pdb(struct aa_ext *e, struct aa_policydb **policy, *info = "missing required dfa"; goto fail; } - goto out; + } else { + /* + * only unpack the following if a dfa is present + * + * sadly start was given different names for file and policydb + * but since it is optional we can try both + */ + if (!aa_unpack_u32(e, &pdb->start[0], "start")) + /* default start state */ + pdb->start[0] = DFA_START; + if (!aa_unpack_u32(e, &pdb->start[AA_CLASS_FILE], "dfa_start")) { + /* default start state for xmatch and file dfa */ + pdb->start[AA_CLASS_FILE] = DFA_START; + } /* setup class index */ + for (i = AA_CLASS_FILE + 1; i <= AA_CLASS_LAST; i++) { + pdb->start[i] = aa_dfa_next(pdb->dfa, pdb->start[0], + i); + } } /* - * only unpack the following if a dfa is present - * - * sadly start was given different names for file and policydb - * but since it is optional we can try both + * Unfortunately due to a bug in earlier userspaces, a + * transition table may be present even when the dfa is + * not. For compatibility reasons unpack and discard. */ - if (!aa_unpack_u32(e, &pdb->start[0], "start")) - /* default start state */ - pdb->start[0] = DFA_START; - if (!aa_unpack_u32(e, &pdb->start[AA_CLASS_FILE], "dfa_start")) { - /* default start state for xmatch and file dfa */ - pdb->start[AA_CLASS_FILE] = DFA_START; - } /* setup class index */ - for (i = AA_CLASS_FILE + 1; i <= AA_CLASS_LAST; i++) { - pdb->start[i] = aa_dfa_next(pdb->dfa, pdb->start[0], - i); - } if (!unpack_trans_table(e, &pdb->trans) && required_trans) { *info = "failed to unpack profile transition table"; goto fail; } + if (!pdb->dfa && pdb->trans.table) + aa_free_str_table(&pdb->trans); + /* TODO: move compat mapping here, requires dfa merging first */ /* TODO: move verify here, it has to be done after compat mappings */ -out: + *policy = pdb; return 0; -- cgit From a03ebf116303e5d13ba9a2b65726b106cb1e96f6 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 9 Jul 2024 17:54:11 -0400 Subject: drm/amdgpu/sdma5.2: Update wptr registers as well as doorbell We seem to have a case where SDMA will sometimes miss a doorbell if GFX is entering the powergating state when the doorbell comes in. To workaround this, we can update the wptr via MMIO, however, this is only safe because we disallow gfxoff in begin_ring() for SDMA 5.2 and then allow it again in end_ring(). Enable this workaround while we are root causing the issue with the HW team. 
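For clarity, the safety argument rests on the existing begin_use/end_use bracket keeping gfxoff disallowed while work is queued; a minimal sketch of that pairing using the real amdgpu_gfx_off_ctrl() helper (surrounding ring code elided):

    /* begin_use: block gfxoff so the MMIO wptr write cannot race PG entry */
    amdgpu_gfx_off_ctrl(adev, false);

    /* ... ring is filled; doorbell and, with this patch, the RB_WPTR
     *     registers are also written via MMIO ... */

    /* end_use: allow gfxoff again once the submission is done */
    amdgpu_gfx_off_ctrl(adev, true);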
Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/3440 Tested-by: Friedrich Vock Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org (cherry picked from commit f2ac52634963fc38e4935e11077b6f7854e5d700) --- drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index cc9e961f0078..af1e90159ce3 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -176,6 +176,14 @@ static void sdma_v5_2_ring_set_wptr(struct amdgpu_ring *ring) DRM_DEBUG("calling WDOORBELL64(0x%08x, 0x%016llx)\n", ring->doorbell_index, ring->wptr << 2); WDOORBELL64(ring->doorbell_index, ring->wptr << 2); + /* SDMA seems to miss doorbells sometimes when powergating kicks in. + * Updating the wptr directly will wake it. This is only safe because + * we disallow gfxoff in begin_use() and then allow it again in end_use(). + */ + WREG32(sdma_v5_2_get_reg_offset(adev, ring->me, mmSDMA0_GFX_RB_WPTR), + lower_32_bits(ring->wptr << 2)); + WREG32(sdma_v5_2_get_reg_offset(adev, ring->me, mmSDMA0_GFX_RB_WPTR_HI), + upper_32_bits(ring->wptr << 2)); } else { DRM_DEBUG("Not using doorbell -- " "mmSDMA%i_GFX_RB_WPTR == 0x%08x " @@ -1647,6 +1655,10 @@ static void sdma_v5_2_ring_begin_use(struct amdgpu_ring *ring) * but it shouldn't hurt for other parts since * this GFXOFF will be disallowed anyway when SDMA is * active, this just makes it explicit. + * sdma_v5_2_ring_set_wptr() takes advantage of this + * to update the wptr because sometimes SDMA seems to miss + * doorbells when entering PG. If you remove this, update + * sdma_v5_2_ring_set_wptr() as well! */ amdgpu_gfx_off_ctrl(adev, false); } -- cgit From 73048bda46c3085df5fd42840de09523386d3e54 Mon Sep 17 00:00:00 2001 From: David Belanger Date: Mon, 10 Jun 2024 16:38:55 -0400 Subject: drm/amdgpu: Fix atomics on GFX12 If PCIe supports atomics, configure register to prevent DF from breaking atomics in separate load/store operations. 
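For reference, the adev->have_atomics_support flag consumed by the new df_v4_15 code is normally derived from a PCIe capability probe earlier during device init; a hedged sketch of that kind of check (not part of this patch):

    #include <linux/pci.h>

    /* Ask for CAS32/CAS64 atomic routing to the root port; success means
     * the device can rely on native PCIe atomics end to end. */
    static bool sketch_pcie_atomics_supported(struct pci_dev *pdev)
    {
            return pci_enable_atomic_ops_to_root(pdev,
                            PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
                            PCI_EXP_DEVCAP2_ATOMIC_COMP64) == 0;
    }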
Signed-off-by: David Belanger Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 666f14cab21b17ccc1bdfe1e82458aa429b3b7e0) --- drivers/gpu/drm/amd/amdgpu/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_df.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 5 +++ drivers/gpu/drm/amd/amdgpu/df_v4_15.c | 45 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/df_v4_15.h | 30 +++++++++++++++ drivers/gpu/drm/amd/amdgpu/soc24.c | 4 ++ .../drm/amd/include/asic_reg/df/df_4_15_offset.h | 28 ++++++++++++++ .../drm/amd/include/asic_reg/df/df_4_15_sh_mask.h | 28 ++++++++++++++ 8 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_15.c create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_15.h create mode 100644 drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_offset.h create mode 100644 drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_sh_mask.h diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 9dd8294032ef..38408e4e158e 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -106,7 +106,8 @@ amdgpu-y += \ df_v1_7.o \ df_v3_6.o \ df_v4_3.o \ - df_v4_6_2.o + df_v4_6_2.o \ + df_v4_15.o # add GMC block amdgpu-y += \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h index 1538b2dbfff1..eb605e79ae0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h @@ -33,6 +33,7 @@ struct amdgpu_df_hash_status { struct amdgpu_df_funcs { void (*sw_init)(struct amdgpu_device *adev); void (*sw_fini)(struct amdgpu_device *adev); + void (*hw_init)(struct amdgpu_device *adev); void (*enable_broadcast_mode)(struct amdgpu_device *adev, bool enable); u32 (*get_fb_channel_number)(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index b241f61fe9c9..ac108fca64fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -37,6 +37,7 @@ #include "df_v3_6.h" #include "df_v4_3.h" #include "df_v4_6_2.h" +#include "df_v4_15.h" #include "nbio_v6_1.h" #include "nbio_v7_0.h" #include "nbio_v7_4.h" @@ -2803,6 +2804,10 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev) case IP_VERSION(4, 6, 2): adev->df.funcs = &df_v4_6_2_funcs; break; + case IP_VERSION(4, 15, 0): + case IP_VERSION(4, 15, 1): + adev->df.funcs = &df_v4_15_funcs; + break; default: break; } diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_15.c b/drivers/gpu/drm/amd/amdgpu/df_v4_15.c new file mode 100644 index 000000000000..2a573e33908b --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/df_v4_15.c @@ -0,0 +1,45 @@ +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "amdgpu.h" +#include "df_v4_15.h" + +#include "df/df_4_15_offset.h" +#include "df/df_4_15_sh_mask.h" + +static void df_v4_15_hw_init(struct amdgpu_device *adev) +{ + if (adev->have_atomics_support) { + uint32_t tmp; + uint32_t dis_lcl_proc = (1 << 1 | + 1 << 2 | + 1 << 13); + + tmp = RREG32_SOC15(DF, 0, regNCSConfigurationRegister1); + tmp |= (dis_lcl_proc << NCSConfigurationRegister1__DisIntAtomicsLclProcessing__SHIFT); + WREG32_SOC15(DF, 0, regNCSConfigurationRegister1, tmp); + } +} + +const struct amdgpu_df_funcs df_v4_15_funcs = { + .hw_init = df_v4_15_hw_init +}; diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_15.h b/drivers/gpu/drm/amd/amdgpu/df_v4_15.h new file mode 100644 index 000000000000..dddf2422112a --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/df_v4_15.h @@ -0,0 +1,30 @@ +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __DF_V4_15_H__ +#define __DF_V4_15_H__ + +extern const struct amdgpu_df_funcs df_v4_15_funcs; + +#endif /* __DF_V4_15_H__ */ + diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c index d27fb4ea6612..7d641d0dadba 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc24.c +++ b/drivers/gpu/drm/amd/amdgpu/soc24.c @@ -484,6 +484,10 @@ static int soc24_common_hw_init(void *handle) */ if (adev->nbio.funcs->remap_hdp_registers) adev->nbio.funcs->remap_hdp_registers(adev); + + if (adev->df.funcs->hw_init) + adev->df.funcs->hw_init(adev); + /* enable the doorbell aperture */ soc24_enable_doorbell_aperture(adev, true); diff --git a/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_offset.h b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_offset.h new file mode 100644 index 000000000000..c2b009752f60 --- /dev/null +++ b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_offset.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _df_4_15_OFFSET_HEADER +#define _df_4_15_OFFSET_HEADER + +#define regNCSConfigurationRegister1 0x0901 +#define regNCSConfigurationRegister1_BASE_IDX 4 + +#endif diff --git a/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_sh_mask.h new file mode 100644 index 000000000000..9868a9c32795 --- /dev/null +++ b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_sh_mask.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _df_4_15_SH_MASK_HEADER +#define _df_4_15_SH_MASK_HEADER + +#define NCSConfigurationRegister1__DisIntAtomicsLclProcessing__SHIFT 0x3 +#define NCSConfigurationRegister1__DisIntAtomicsLclProcessing_MASK 0x0003FFF8L + +#endif -- cgit From df65aabef3c0327c23b840ab5520150df4db6b5f Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Thu, 18 Jul 2024 22:17:35 +0800 Subject: drm/amd/amdgpu: Fix uninitialized variable warnings Return 0 to avoid returning an uninitialized variable r. 
Cc: stable@vger.kernel.org Fixes: 230dd6bb6117 ("drm/amd/amdgpu: implement mode2 reset on smu_v13_0_10") Signed-off-by: Ma Ke Signed-off-by: Alex Deucher (cherry picked from commit 6472de66c0aa18d50a4b5ca85f8272e88a737676) --- drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c index 04c797d54511..0af648931df5 100644 --- a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c +++ b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c @@ -91,7 +91,7 @@ static int smu_v13_0_10_mode2_suspend_ip(struct amdgpu_device *adev) adev->ip_blocks[i].status.hw = false; } - return r; + return 0; } static int -- cgit From 23df34997d386f1442d26f2d8edf44a5b5fee79f Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 11 Dec 2023 10:45:38 +0530 Subject: drm/amdgpu: Add empty HDP flush function to JPEG v4.0.3 JPEG v4.0.3 doesn't support HDP flush when RRMT is enabled. Instead, mmsch fw will do the flush. This change is necessary for JPEG v4.0.3, no need for backward compatibility Signed-off-by: Lijo Lazar Signed-off-by: Jane Jian Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 585e3fdb36f59c5cfed0ae06c852dc1df22b1d60) --- drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c index 04d8966423de..30a143ab592d 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c @@ -621,6 +621,13 @@ static uint64_t jpeg_v4_0_3_dec_ring_get_wptr(struct amdgpu_ring *ring) ring->pipe ? (0x40 * ring->pipe - 0xc80) : 0); } +static void jpeg_v4_0_3_ring_emit_hdp_flush(struct amdgpu_ring *ring) +{ + /* JPEG engine access for HDP flush doesn't work when RRMT is enabled. + * This is a workaround to avoid any HDP flush through JPEG ring. + */ +} + /** * jpeg_v4_0_3_dec_ring_set_wptr - set write pointer * @@ -1072,6 +1079,7 @@ static const struct amdgpu_ring_funcs jpeg_v4_0_3_dec_ring_vm_funcs = { .emit_ib = jpeg_v4_0_3_dec_ring_emit_ib, .emit_fence = jpeg_v4_0_3_dec_ring_emit_fence, .emit_vm_flush = jpeg_v4_0_3_dec_ring_emit_vm_flush, + .emit_hdp_flush = jpeg_v4_0_3_ring_emit_hdp_flush, .test_ring = amdgpu_jpeg_dec_ring_test_ring, .test_ib = amdgpu_jpeg_dec_ring_test_ib, .insert_nop = jpeg_v4_0_3_dec_ring_nop, -- cgit From 485432d090016aebf8fb407a44ce7a2f856d11eb Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 11 Dec 2023 11:18:42 +0530 Subject: drm/amdgpu: Add empty HDP flush function to VCN v4.0.3 VCN 4.0.3 does not HDP flush with RRMT enabled. Instead, mmsch will do the HDP flush. This change is necessary for VCN v4.0.3, no need for backward compatibility Signed-off-by: Lijo Lazar Signed-off-by: Jane Jian Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 49cfaebe48e97500a68d5322a8194736b0a2c3cf) --- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index f53054e39ebb..101b120f6fbd 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -1375,6 +1375,13 @@ static uint64_t vcn_v4_0_3_unified_ring_get_wptr(struct amdgpu_ring *ring) regUVD_RB_WPTR); } +static void vcn_v4_0_3_ring_emit_hdp_flush(struct amdgpu_ring *ring) +{ + /* VCN engine access for HDP flush doesn't work when RRMT is enabled. 
+ * This is a workaround to avoid any HDP flush through VCN ring. + */ +} + /** * vcn_v4_0_3_unified_ring_set_wptr - set enc write pointer * @@ -1415,6 +1422,7 @@ static const struct amdgpu_ring_funcs vcn_v4_0_3_unified_ring_vm_funcs = { .emit_ib = vcn_v2_0_enc_ring_emit_ib, .emit_fence = vcn_v2_0_enc_ring_emit_fence, .emit_vm_flush = vcn_v2_0_enc_ring_emit_vm_flush, + .emit_hdp_flush = vcn_v4_0_3_ring_emit_hdp_flush, .test_ring = amdgpu_vcn_enc_ring_test_ring, .test_ib = amdgpu_vcn_unified_ring_test_ib, .insert_nop = amdgpu_ring_insert_nop, -- cgit From 6728f55590a667c292cee10c009e16d90f48d63a Mon Sep 17 00:00:00 2001 From: Jane Jian Date: Mon, 15 Jul 2024 18:48:31 +0800 Subject: drm/amdgpu/vcn: Use offsets local to VCN/JPEG in VF For VCN/JPEG 4.0.3, use only the local addressing scheme. - Mask bit higher than AID0 range v2 remain the case for mmhub use master XCC Signed-off-by: Jane Jian Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher (cherry picked from commit caaf576292f8ccef5cdc0ac16e77b87dbf6e17ab) --- drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 19 +++++++++++-- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 46 +++++++++++++++++++++++++++++--- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c index 30a143ab592d..ad524ddc9760 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c @@ -32,6 +32,9 @@ #include "vcn/vcn_4_0_3_sh_mask.h" #include "ivsrcid/vcn/irqsrcs_vcn_4_0.h" +#define NORMALIZE_JPEG_REG_OFFSET(offset) \ + (offset & 0x1FFFF) + enum jpeg_engin_status { UVD_PGFSM_STATUS__UVDJ_PWR_ON = 0, UVD_PGFSM_STATUS__UVDJ_PWR_OFF = 2, @@ -824,7 +827,13 @@ void jpeg_v4_0_3_dec_ring_emit_ib(struct amdgpu_ring *ring, void jpeg_v4_0_3_dec_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg, uint32_t val, uint32_t mask) { - uint32_t reg_offset = (reg << 2); + uint32_t reg_offset; + + /* For VF, only local offsets should be used */ + if (amdgpu_sriov_vf(ring->adev)) + reg = NORMALIZE_JPEG_REG_OFFSET(reg); + + reg_offset = (reg << 2); amdgpu_ring_write(ring, PACKETJ(regUVD_JRBC_RB_COND_RD_TIMER_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); @@ -865,7 +874,13 @@ void jpeg_v4_0_3_dec_ring_emit_vm_flush(struct amdgpu_ring *ring, void jpeg_v4_0_3_dec_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint32_t val) { - uint32_t reg_offset = (reg << 2); + uint32_t reg_offset; + + /* For VF, only local offsets should be used */ + if (amdgpu_sriov_vf(ring->adev)) + reg = NORMALIZE_JPEG_REG_OFFSET(reg); + + reg_offset = (reg << 2); amdgpu_ring_write(ring, PACKETJ(regUVD_JRBC_EXTERNAL_REG_INTERNAL_OFFSET, 0, 0, PACKETJ_TYPE0)); diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index 101b120f6fbd..9bae95538b62 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -45,6 +45,9 @@ #define VCN_VID_SOC_ADDRESS_2_0 0x1fb00 #define VCN1_VID_SOC_ADDRESS_3_0 0x48300 +#define NORMALIZE_VCN_REG_OFFSET(offset) \ + (offset & 0x1FFFF) + static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev); static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device *adev); static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev); @@ -1375,6 +1378,43 @@ static uint64_t vcn_v4_0_3_unified_ring_get_wptr(struct amdgpu_ring *ring) regUVD_RB_WPTR); } +static void vcn_v4_0_3_enc_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg, + uint32_t val, uint32_t mask) +{ + /* For VF, 
only local offsets should be used */ + if (amdgpu_sriov_vf(ring->adev)) + reg = NORMALIZE_VCN_REG_OFFSET(reg); + + amdgpu_ring_write(ring, VCN_ENC_CMD_REG_WAIT); + amdgpu_ring_write(ring, reg << 2); + amdgpu_ring_write(ring, mask); + amdgpu_ring_write(ring, val); +} + +static void vcn_v4_0_3_enc_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint32_t val) +{ + /* For VF, only local offsets should be used */ + if (amdgpu_sriov_vf(ring->adev)) + reg = NORMALIZE_VCN_REG_OFFSET(reg); + + amdgpu_ring_write(ring, VCN_ENC_CMD_REG_WRITE); + amdgpu_ring_write(ring, reg << 2); + amdgpu_ring_write(ring, val); +} + +static void vcn_v4_0_3_enc_ring_emit_vm_flush(struct amdgpu_ring *ring, + unsigned int vmid, uint64_t pd_addr) +{ + struct amdgpu_vmhub *hub = &ring->adev->vmhub[ring->vm_hub]; + + pd_addr = amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr); + + /* wait for reg writes */ + vcn_v4_0_3_enc_ring_emit_reg_wait(ring, hub->ctx0_ptb_addr_lo32 + + vmid * hub->ctx_addr_distance, + lower_32_bits(pd_addr), 0xffffffff); +} + static void vcn_v4_0_3_ring_emit_hdp_flush(struct amdgpu_ring *ring) { /* VCN engine access for HDP flush doesn't work when RRMT is enabled. @@ -1421,7 +1461,7 @@ static const struct amdgpu_ring_funcs vcn_v4_0_3_unified_ring_vm_funcs = { .emit_ib_size = 5, /* vcn_v2_0_enc_ring_emit_ib */ .emit_ib = vcn_v2_0_enc_ring_emit_ib, .emit_fence = vcn_v2_0_enc_ring_emit_fence, - .emit_vm_flush = vcn_v2_0_enc_ring_emit_vm_flush, + .emit_vm_flush = vcn_v4_0_3_enc_ring_emit_vm_flush, .emit_hdp_flush = vcn_v4_0_3_ring_emit_hdp_flush, .test_ring = amdgpu_vcn_enc_ring_test_ring, .test_ib = amdgpu_vcn_unified_ring_test_ib, @@ -1430,8 +1470,8 @@ static const struct amdgpu_ring_funcs vcn_v4_0_3_unified_ring_vm_funcs = { .pad_ib = amdgpu_ring_generic_pad_ib, .begin_use = amdgpu_vcn_ring_begin_use, .end_use = amdgpu_vcn_ring_end_use, - .emit_wreg = vcn_v2_0_enc_ring_emit_wreg, - .emit_reg_wait = vcn_v2_0_enc_ring_emit_reg_wait, + .emit_wreg = vcn_v4_0_3_enc_ring_emit_wreg, + .emit_reg_wait = vcn_v4_0_3_enc_ring_emit_reg_wait, .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, }; -- cgit From 4ab68e168ae1695f7c04fae98930740aaf7c50fa Mon Sep 17 00:00:00 2001 From: Sung Joon Kim Date: Mon, 8 Jul 2024 19:29:49 -0400 Subject: drm/amd/display: Check for NULL pointer [why & how] Need to make sure plane_state is initialized before accessing its members. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Xi (Alex) Liu Signed-off-by: Sung Joon Kim Signed-off-by: Aurabindo Pillai Signed-off-by: Alex Deucher (cherry picked from commit 295d91cbc700651782a60572f83c24861607b648) --- drivers/gpu/drm/amd/display/dc/core/dc_surface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_surface.c b/drivers/gpu/drm/amd/display/dc/core/dc_surface.c index 067f6555cfdf..ccbb15f1638c 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_surface.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_surface.c @@ -143,7 +143,8 @@ const struct dc_plane_status *dc_plane_get_status( if (pipe_ctx->plane_state != plane_state) continue; - pipe_ctx->plane_state->status.is_flip_pending = false; + if (pipe_ctx->plane_state) + pipe_ctx->plane_state->status.is_flip_pending = false; break; } -- cgit From 5302d1a06a2cd9855378122a07c9e0942f0f04a9 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Thu, 4 Jul 2024 11:54:34 -0600 Subject: drm/amd/display: Remove ASSERT if significance is zero in math_ceil2 In the DML math_ceil2 function, there is one ASSERT if the significance is equal to zero. However, significance might be equal to zero sometimes, and this is not an issue for a ceil function, but the current ASSERT will trigger warnings in those cases. This commit removes the ASSERT if the significance is equal to zero to avoid unnecessary noise. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Chaitanya Dhere Signed-off-by: Rodrigo Siqueira Signed-off-by: Aurabindo Pillai Signed-off-by: Alex Deucher (cherry picked from commit 332315885d3ccc6d8fe99700f3c2e4c24aa65ab7) --- .../dc/dml2/dml21/src/dml2_standalone_libraries/lib_float_math.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_standalone_libraries/lib_float_math.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_standalone_libraries/lib_float_math.c index defe13436a2c..e73579f1a88e 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_standalone_libraries/lib_float_math.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_standalone_libraries/lib_float_math.c @@ -64,8 +64,6 @@ double math_ceil(const double arg) double math_ceil2(const double arg, const double significance) { - ASSERT(significance != 0); - return ((int)(arg / significance + 0.99999)) * significance; } -- cgit From afac8c6554ccee54bfd1743755e10af005be3bcf Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Fri, 19 Jul 2024 20:43:04 +0800 Subject: drm/amdgpu: fix ras UE error injection failure issue The ras command shared memory is allocated from VRAM and the response status of the command buffer will not be zero due to gpu being in fatal error state after ras UE error injection. 
Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 8284951a6e79c6806c675e5f68a4cd425dd56bc4) --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 7cdff355cedb..189574d53ebd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1630,9 +1630,7 @@ static int psp_ras_send_cmd(struct psp_context *psp, switch (cmd) { case TA_RAS_COMMAND__TRIGGER_ERROR: - if (ret || psp->cmd_buf_mem->resp.status) - ret = -EINVAL; - else if (out) + if (!ret && out) memcpy(out, &ras_cmd->ras_status, sizeof(ras_cmd->ras_status)); break; case TA_RAS_COMMAND__QUERY_ADDRESS: -- cgit From 1a8825259a9ccc53faddcdec24cf94e0a36b32cc Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Thu, 18 Jul 2024 10:58:04 +0800 Subject: drm/amdgpu: Fix eeprom max record count The eeprom table is empty before initializing, set eeprom table version first before initializing. Changed from V1: Reuse amdgpu_ras_set_eeprom_table_version function Signed-off-by: Stanley.Yang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 015b8a2fdf39a4c288ff24e7b715b8d9198e56dc) --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index eae0a555df3c..aab8077e5098 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -1011,6 +1011,9 @@ Out: uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control) { + /* get available eeprom table version first before eeprom table init */ + amdgpu_ras_set_eeprom_table_version(control); + if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) return RAS_MAX_RECORD_COUNT_V2_1; else -- cgit From fab1ead0ae3a4757afb92ff6909b37d63db17e55 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Tue, 23 Jul 2024 16:54:34 +0800 Subject: drm/amdgpu: add missed harvest check for VCN IP v4/v5 To prevent below probe failure, add a check for models with VCN IP v4.0.6 where VCN1 may be harvested. v2: Apply the same check to VCN IP v4.0 and v5.0. [ 54.070117] RIP: 0010:vcn_v4_0_5_start_dpg_mode+0x9be/0x36b0 [amdgpu] [ 54.071055] Code: 80 fb ff 8d 82 00 80 fe ff 81 fe 00 06 00 00 0f 43 c2 49 69 d5 38 0d 00 00 48 8d 71 04 c1 e8 02 4c 01 f2 48 89 b2 50 f6 02 00 <89> 01 48 8b 82 50 f6 02 00 48 8d 48 04 48 89 8a 50 f6 02 00 c7 00 [ 54.072408] RSP: 0018:ffffb17985f736f8 EFLAGS: 00010286 [ 54.072793] RAX: 00000000000000d6 RBX: ffff99a82f680000 RCX: 0000000000000000 [ 54.073315] RDX: ffff99a82f680000 RSI: 0000000000000004 RDI: ffff99a82f680000 [ 54.073835] RBP: ffffb17985f73730 R08: 0000000000000001 R09: 0000000000000000 [ 54.074353] R10: 0000000000000008 R11: ffffb17983c05000 R12: 0000000000000000 [ 54.074879] R13: 0000000000000000 R14: ffff99a82f680000 R15: 0000000000000001 [ 54.075400] FS: 00007f8d9c79a000(0000) GS:ffff99ab2f140000(0000) knlGS:0000000000000000 [ 54.075988] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 54.076408] CR2: 0000000000000000 CR3: 0000000140c3a000 CR4: 0000000000750ef0 [ 54.076927] PKRU: 55555554 [ 54.077132] Call Trace: [ 54.077319] [ 54.077484] ? show_regs+0x69/0x80 [ 54.077747] ? __die+0x28/0x70 [ 54.077979] ? page_fault_oops+0x180/0x4b0 [ 54.078286] ? do_user_addr_fault+0x2d2/0x680 [ 54.078610] ? 
exc_page_fault+0x84/0x190 [ 54.078910] ? asm_exc_page_fault+0x2b/0x30 [ 54.079224] ? vcn_v4_0_5_start_dpg_mode+0x9be/0x36b0 [amdgpu] [ 54.079941] ? vcn_v4_0_5_start_dpg_mode+0xe6/0x36b0 [amdgpu] [ 54.080617] vcn_v4_0_5_set_powergating_state+0x82/0x19b0 [amdgpu] [ 54.081316] amdgpu_device_ip_set_powergating_state+0x64/0xc0 [amdgpu] [ 54.082057] amdgpu_vcn_ring_begin_use+0x6f/0x1d0 [amdgpu] [ 54.082727] amdgpu_ring_alloc+0x44/0x70 [amdgpu] [ 54.083351] amdgpu_vcn_dec_sw_ring_test_ring+0x40/0x110 [amdgpu] [ 54.084054] amdgpu_ring_test_helper+0x22/0x90 [amdgpu] [ 54.084698] vcn_v4_0_5_hw_init+0x87/0xc0 [amdgpu] [ 54.085307] amdgpu_device_init+0x1f96/0x2780 [amdgpu] [ 54.085951] amdgpu_driver_load_kms+0x1e/0xc0 [amdgpu] [ 54.086591] amdgpu_pci_probe+0x19f/0x550 [amdgpu] [ 54.087215] local_pci_probe+0x48/0xa0 [ 54.087509] pci_device_probe+0xc9/0x250 [ 54.087812] really_probe+0x1a4/0x3f0 [ 54.088101] __driver_probe_device+0x7d/0x170 [ 54.088443] driver_probe_device+0x24/0xa0 [ 54.088765] __driver_attach+0xdd/0x1d0 [ 54.089068] ? __pfx___driver_attach+0x10/0x10 [ 54.089417] bus_for_each_dev+0x8e/0xe0 [ 54.089718] driver_attach+0x22/0x30 [ 54.090000] bus_add_driver+0x120/0x220 [ 54.090303] driver_register+0x62/0x120 [ 54.090606] ? __pfx_amdgpu_init+0x10/0x10 [amdgpu] [ 54.091255] __pci_register_driver+0x62/0x70 [ 54.091593] amdgpu_init+0x67/0xff0 [amdgpu] [ 54.092190] do_one_initcall+0x5f/0x330 [ 54.092495] do_init_module+0x68/0x240 [ 54.092794] load_module+0x201c/0x2110 [ 54.093093] init_module_from_file+0x97/0xd0 [ 54.093428] ? init_module_from_file+0x97/0xd0 [ 54.093777] idempotent_init_module+0x11c/0x2a0 [ 54.094134] __x64_sys_finit_module+0x64/0xc0 [ 54.094476] do_syscall_64+0x58/0x120 [ 54.094767] entry_SYSCALL_64_after_hwframe+0x6e/0x76 Signed-off-by: Tim Huang Reviewed-by: Saleemkhan Jamadar Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org (cherry picked from commit 0b071245ddd98539d4f7493bdd188417fcf2d629) --- drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index f6d96a44d75f..776c539bfdda 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -1045,6 +1045,9 @@ static int vcn_v4_0_start(struct amdgpu_device *adev) amdgpu_dpm_enable_uvd(adev, true); for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { + if (adev->vcn.harvest_config & (1 << i)) + continue; + fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) { @@ -1498,6 +1501,9 @@ static int vcn_v4_0_stop(struct amdgpu_device *adev) int i, r = 0; for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { + if (adev->vcn.harvest_config & (1 << i)) + continue; + fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; fw_shared->sq.queue_mode |= FW_QUEUE_DPG_HOLD_OFF; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c index f45495de6875..8d75061f9f38 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c @@ -958,6 +958,9 @@ static int vcn_v4_0_5_start(struct amdgpu_device *adev) amdgpu_dpm_enable_uvd(adev, true); for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { + if (adev->vcn.harvest_config & (1 << i)) + continue; + fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) { @@ -1162,6 +1165,9 @@ static int vcn_v4_0_5_stop(struct 
amdgpu_device *adev) int i, r = 0; for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { + if (adev->vcn.harvest_config & (1 << i)) + continue; + fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; fw_shared->sq.queue_mode |= FW_QUEUE_DPG_HOLD_OFF; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c index 070b56610c7d..68c97fcd539b 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c @@ -721,6 +721,9 @@ static int vcn_v5_0_0_start(struct amdgpu_device *adev) amdgpu_dpm_enable_uvd(adev, true); for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { + if (adev->vcn.harvest_config & (1 << i)) + continue; + fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) { @@ -898,6 +901,9 @@ static int vcn_v5_0_0_stop(struct amdgpu_device *adev) int i, r = 0; for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { + if (adev->vcn.harvest_config & (1 << i)) + continue; + fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; fw_shared->sq.queue_mode |= FW_QUEUE_DPG_HOLD_OFF; -- cgit From 5659b0c93a1ea02c662a030b322093203f299185 Mon Sep 17 00:00:00 2001 From: ZhenGuo Yin Date: Fri, 19 Jul 2024 16:10:40 +0800 Subject: drm/amdgpu: reset vm state machine after gpu reset(vram lost) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] Page table of compute VM in the VRAM will lost after gpu reset. VRAM won't be restored since compute VM has no shadows. [How] Use higher 32-bit of vm->generation to record a vram_lost_counter. Reset the VM state machine when vm->genertaion is not equal to the new generation token. v2: Check vm->generation instead of calling drm_sched_entity_error in amdgpu_vm_validate. v3: Use new generation token instead of vram_lost_counter for check. 
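A simplified illustration of the generation-token layout described above (an assumption-level sketch, not the driver helper itself, which also folds in the scheduler-entity error state): the upper 32 bits change whenever VRAM contents were lost, the lower 32 bits count per-VM page-table regenerations, so any VRAM loss makes the token differ from the cached vm->generation and forces the state-machine reset in amdgpu_vm_validate().

    static inline u64 vm_generation_sketch(u32 vram_lost_counter, u32 vm_counter)
    {
            /* upper half: global "VRAM was lost" epoch; lower half: per-VM count */
            return ((u64)vram_lost_counter << 32) | vm_counter;
    }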
Signed-off-by: ZhenGuo Yin Reviewed-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org (cherry picked from commit 47c0388b0589cb481c294dcb857d25a214c46eb3) --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 3abfa66d72a2..a060c28f0877 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -434,7 +434,7 @@ uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm) if (!vm) return result; - result += vm->generation; + result += lower_32_bits(vm->generation); /* Add one if the page tables will be re-generated on next CS */ if (drm_sched_entity_error(&vm->delayed)) ++result; @@ -463,13 +463,14 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, int (*validate)(void *p, struct amdgpu_bo *bo), void *param) { + uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm); struct amdgpu_vm_bo_base *bo_base; struct amdgpu_bo *shadow; struct amdgpu_bo *bo; int r; - if (drm_sched_entity_error(&vm->delayed)) { - ++vm->generation; + if (vm->generation != new_vm_generation) { + vm->generation = new_vm_generation; amdgpu_vm_bo_reset_state_machine(vm); amdgpu_vm_fini_entities(vm); r = amdgpu_vm_init_entities(adev, vm); @@ -2439,7 +2440,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->last_update = dma_fence_get_stub(); vm->last_unlocked = dma_fence_get_stub(); vm->last_tlb_flush = dma_fence_get_stub(); - vm->generation = 0; + vm->generation = amdgpu_vm_generation(adev, NULL); mutex_init(&vm->eviction_lock); vm->evicting = false; -- cgit From dcfed708742c7ab5e0c9cfcc673cb4e5fafb18ff Mon Sep 17 00:00:00 2001 From: Nick Weihs Date: Wed, 24 Jul 2024 22:47:22 -0700 Subject: ALSA: hda/realtek: Implement sound init sequence for Samsung Galaxy Book3 Pro 360 Samsung Galaxy Book3 Pro 360 sends a large amount of data to the codec through hda processing coefficients. This data was captured using a modified version of QEMU, but the actual content of the data remains opaque to me. Elliding any part of the data seems to cause sound to not work. 
Signed-off-by: Nick Weihs Link: https://patch.msgid.link/20240725054722.42597-1-nick.weihs@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 9 ++ sound/pci/hda/samsung_helper.c | 310 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 319 insertions(+) create mode 100644 sound/pci/hda/samsung_helper.c diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 462194bf6954..ba0ce8750ca4 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4800,6 +4800,8 @@ static void alc298_fixup_samsung_amp(struct hda_codec *codec, } } +#include "samsung_helper.c" + #if IS_REACHABLE(CONFIG_INPUT) static void gpio2_mic_hotkey_event(struct hda_codec *codec, struct hda_jack_callback *event) @@ -7429,6 +7431,7 @@ enum { ALC236_FIXUP_HP_MUTE_LED, ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF, ALC298_FIXUP_SAMSUNG_AMP, + ALC298_FIXUP_SAMSUNG_AMP2, ALC298_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET, ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET, ALC295_FIXUP_ASUS_MIC_NO_PRESENCE, @@ -9055,6 +9058,10 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC298_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET }, + [ALC298_FIXUP_SAMSUNG_AMP2] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc298_fixup_samsung_amp2 + }, [ALC298_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET] = { .type = HDA_FIXUP_VERBS, .v.verbs = (const struct hda_verb[]) { @@ -10406,6 +10413,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x144d, 0xc832, "Samsung Galaxy Book Flex Alpha (NP730QCJ)", ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET), SND_PCI_QUIRK(0x144d, 0xca03, "Samsung Galaxy Book2 Pro 360 (NP930QED)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc868, "Samsung Galaxy Book2 Pro (NP930XED)", ALC298_FIXUP_SAMSUNG_AMP), + SND_PCI_QUIRK(0x144d, 0xc1ca, "Samsung Galaxy Book3 Pro 360 (NP960QFG-KB1US)", ALC298_FIXUP_SAMSUNG_AMP2), SND_PCI_QUIRK(0x1458, 0xfa53, "Gigabyte BXBT-2807", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC), @@ -10843,6 +10851,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC298_FIXUP_HUAWEI_MBX_STEREO, .name = "huawei-mbx-stereo"}, {.id = ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, .name = "alc256-medion-headset"}, {.id = ALC298_FIXUP_SAMSUNG_AMP, .name = "alc298-samsung-amp"}, + {.id = ALC298_FIXUP_SAMSUNG_AMP2, .name = "alc298-samsung-amp2"}, {.id = ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET, .name = "alc256-samsung-headphone"}, {.id = ALC255_FIXUP_XIAOMI_HEADSET_MIC, .name = "alc255-xiaomi-headset"}, {.id = ALC274_FIXUP_HP_MIC, .name = "alc274-hp-mic-detect"}, diff --git a/sound/pci/hda/samsung_helper.c b/sound/pci/hda/samsung_helper.c new file mode 100644 index 000000000000..a40175b69015 --- /dev/null +++ b/sound/pci/hda/samsung_helper.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Helper functions for Samsung Galaxy Book3 audio initialization */ + +struct alc298_samsung_coeff_fixup_desc { + unsigned char coeff_idx; + unsigned short coeff_value; +}; + +struct alc298_samsung_coeff_seq_desc { + unsigned short coeff_0x23; + unsigned short coeff_0x24; + unsigned short coeff_0x25; + unsigned short coeff_0x26; +}; + + +static inline void alc298_samsung_write_coef_pack2(struct hda_codec *codec, + const struct alc298_samsung_coeff_seq_desc *seq) +{ + int i; + + for (i = 0; i < 100; i++) { + if ((alc_read_coef_idx(codec, 0x26) & 0x0010) == 0) + break; + + 
usleep_range(500, 1000); + } + + alc_write_coef_idx(codec, 0x23, seq->coeff_0x23); + alc_write_coef_idx(codec, 0x24, seq->coeff_0x24); + alc_write_coef_idx(codec, 0x25, seq->coeff_0x25); + alc_write_coef_idx(codec, 0x26, seq->coeff_0x26); +} + +static inline void alc298_samsung_write_coef_pack_seq( + struct hda_codec *codec, + unsigned char target, + const struct alc298_samsung_coeff_seq_desc seq[], + int count) +{ + alc_write_coef_idx(codec, 0x22, target); + for (int i = 0; i < count; i++) + alc298_samsung_write_coef_pack2(codec, &seq[i]); +} + +static void alc298_fixup_samsung_amp2(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + int i; + static const struct alc298_samsung_coeff_fixup_desc fixups1[] = { + { 0x99, 0x8000 }, { 0x82, 0x4408 }, { 0x32, 0x3f00 }, { 0x0e, 0x6f80 }, + { 0x10, 0x0e21 }, { 0x55, 0x8000 }, { 0x08, 0x2fcf }, { 0x08, 0x2fcf }, + { 0x2d, 0xc020 }, { 0x19, 0x0017 }, { 0x50, 0x1000 }, { 0x0e, 0x6f80 }, + { 0x08, 0x2fcf }, { 0x80, 0x0011 }, { 0x2b, 0x0c10 }, { 0x2d, 0xc020 }, + { 0x03, 0x0042 }, { 0x0f, 0x0062 }, { 0x08, 0x2fcf }, + }; + + static const struct alc298_samsung_coeff_seq_desc amp_0x38[] = { + { 0x2000, 0x0000, 0x0001, 0xb011 }, { 0x23ff, 0x0000, 0x0000, 0xb011 }, + { 0x203a, 0x0000, 0x0080, 0xb011 }, { 0x23e1, 0x0000, 0x0000, 0xb011 }, + { 0x2012, 0x0000, 0x006f, 0xb011 }, { 0x2014, 0x0000, 0x0000, 0xb011 }, + { 0x201b, 0x0000, 0x0001, 0xb011 }, { 0x201d, 0x0000, 0x0001, 0xb011 }, + { 0x201f, 0x0000, 0x00fe, 0xb011 }, { 0x2021, 0x0000, 0x0000, 0xb011 }, + { 0x2022, 0x0000, 0x0010, 0xb011 }, { 0x203d, 0x0000, 0x0005, 0xb011 }, + { 0x203f, 0x0000, 0x0003, 0xb011 }, { 0x2050, 0x0000, 0x002c, 0xb011 }, + { 0x2076, 0x0000, 0x000e, 0xb011 }, { 0x207c, 0x0000, 0x004a, 0xb011 }, + { 0x2081, 0x0000, 0x0003, 0xb011 }, { 0x2399, 0x0000, 0x0003, 0xb011 }, + { 0x23a4, 0x0000, 0x00b5, 0xb011 }, { 0x23a5, 0x0000, 0x0001, 0xb011 }, + { 0x23ba, 0x0000, 0x0094, 0xb011 }, { 0x2100, 0x00d0, 0x950e, 0xb017 }, + { 0x2104, 0x0061, 0xd4e2, 0xb017 }, { 0x2108, 0x00d0, 0x950e, 0xb017 }, + { 0x210c, 0x0075, 0xf4e2, 0xb017 }, { 0x2110, 0x00b4, 0x4b0d, 0xb017 }, + { 0x2114, 0x000a, 0x1000, 0xb017 }, { 0x2118, 0x0015, 0x2000, 0xb017 }, + { 0x211c, 0x000a, 0x1000, 0xb017 }, { 0x2120, 0x0075, 0xf4e2, 0xb017 }, + { 0x2124, 0x00b4, 0x4b0d, 0xb017 }, { 0x2128, 0x0000, 0x0010, 0xb017 }, + { 0x212c, 0x0000, 0x0000, 0xb017 }, { 0x2130, 0x0000, 0x0000, 0xb017 }, + { 0x2134, 0x0000, 0x0000, 0xb017 }, { 0x2138, 0x0000, 0x0000, 0xb017 }, + { 0x213c, 0x0000, 0x0010, 0xb017 }, { 0x2140, 0x0000, 0x0000, 0xb017 }, + { 0x2144, 0x0000, 0x0000, 0xb017 }, { 0x2148, 0x0000, 0x0000, 0xb017 }, + { 0x214c, 0x0000, 0x0000, 0xb017 }, { 0x2150, 0x0000, 0x0010, 0xb017 }, + { 0x2154, 0x0000, 0x0000, 0xb017 }, { 0x2158, 0x0000, 0x0000, 0xb017 }, + { 0x215c, 0x0000, 0x0000, 0xb017 }, { 0x2160, 0x0000, 0x0000, 0xb017 }, + { 0x2164, 0x0000, 0x0010, 0xb017 }, { 0x2168, 0x0000, 0x0000, 0xb017 }, + { 0x216c, 0x0000, 0x0000, 0xb017 }, { 0x2170, 0x0000, 0x0000, 0xb017 }, + { 0x2174, 0x0000, 0x0000, 0xb017 }, { 0x2178, 0x0000, 0x0010, 0xb017 }, + { 0x217c, 0x0000, 0x0000, 0xb017 }, { 0x2180, 0x0000, 0x0000, 0xb017 }, + { 0x2184, 0x0000, 0x0000, 0xb017 }, { 0x2188, 0x0000, 0x0000, 0xb017 }, + { 0x218c, 0x0064, 0x5800, 0xb017 }, { 0x2190, 0x00c8, 0xb000, 0xb017 }, + { 0x2194, 0x0064, 0x5800, 0xb017 }, { 0x2198, 0x003d, 0x5be7, 0xb017 }, + { 0x219c, 0x0054, 0x060a, 0xb017 }, { 0x21a0, 0x00c8, 0xa310, 0xb017 }, + { 0x21a4, 0x0029, 0x4de5, 0xb017 }, { 0x21a8, 0x0032, 0x420c, 0xb017 }, + { 0x21ac, 0x0029, 
0x4de5, 0xb017 }, { 0x21b0, 0x00fa, 0xe50c, 0xb017 }, + { 0x21b4, 0x0000, 0x0010, 0xb017 }, { 0x21b8, 0x0000, 0x0000, 0xb017 }, + { 0x21bc, 0x0000, 0x0000, 0xb017 }, { 0x21c0, 0x0000, 0x0000, 0xb017 }, + { 0x21c4, 0x0000, 0x0000, 0xb017 }, { 0x21c8, 0x0056, 0xc50f, 0xb017 }, + { 0x21cc, 0x007b, 0xd7e1, 0xb017 }, { 0x21d0, 0x0077, 0xa70e, 0xb017 }, + { 0x21d4, 0x00e0, 0xbde1, 0xb017 }, { 0x21d8, 0x0032, 0x530e, 0xb017 }, + { 0x2204, 0x00fb, 0x7e0f, 0xb017 }, { 0x2208, 0x000b, 0x02e1, 0xb017 }, + { 0x220c, 0x00fb, 0x7e0f, 0xb017 }, { 0x2210, 0x00d5, 0x17e1, 0xb017 }, + { 0x2214, 0x00c0, 0x130f, 0xb017 }, { 0x2218, 0x00e5, 0x0a00, 0xb017 }, + { 0x221c, 0x00cb, 0x1500, 0xb017 }, { 0x2220, 0x00e5, 0x0a00, 0xb017 }, + { 0x2224, 0x00d5, 0x17e1, 0xb017 }, { 0x2228, 0x00c0, 0x130f, 0xb017 }, + { 0x222c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2230, 0x0017, 0x48e2, 0xb017 }, + { 0x2234, 0x00f5, 0xdb0e, 0xb017 }, { 0x2238, 0x00ef, 0x5ce2, 0xb017 }, + { 0x223c, 0x00c1, 0xcc0d, 0xb017 }, { 0x2240, 0x00f5, 0xdb0e, 0xb017 }, + { 0x2244, 0x0017, 0x48e2, 0xb017 }, { 0x2248, 0x00f5, 0xdb0e, 0xb017 }, + { 0x224c, 0x00ef, 0x5ce2, 0xb017 }, { 0x2250, 0x00c1, 0xcc0d, 0xb017 }, + { 0x2254, 0x00f5, 0xdb0e, 0xb017 }, { 0x2258, 0x0017, 0x48e2, 0xb017 }, + { 0x225c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2260, 0x00ef, 0x5ce2, 0xb017 }, + { 0x2264, 0x00c1, 0xcc0d, 0xb017 }, { 0x2268, 0x00f5, 0xdb0e, 0xb017 }, + { 0x226c, 0x0017, 0x48e2, 0xb017 }, { 0x2270, 0x00f5, 0xdb0e, 0xb017 }, + { 0x2274, 0x00ef, 0x5ce2, 0xb017 }, { 0x2278, 0x00c1, 0xcc0d, 0xb017 }, + { 0x227c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2280, 0x0017, 0x48e2, 0xb017 }, + { 0x2284, 0x00f5, 0xdb0e, 0xb017 }, { 0x2288, 0x00ef, 0x5ce2, 0xb017 }, + { 0x228c, 0x00c1, 0xcc0d, 0xb017 }, { 0x22cc, 0x00e8, 0x8d00, 0xb017 }, + { 0x22d0, 0x0000, 0x0000, 0xb017 }, { 0x22d4, 0x0018, 0x72ff, 0xb017 }, + { 0x22d8, 0x00ce, 0x25e1, 0xb017 }, { 0x22dc, 0x002f, 0xe40e, 0xb017 }, + { 0x238e, 0x0000, 0x0099, 0xb011 }, { 0x238f, 0x0000, 0x0011, 0xb011 }, + { 0x2390, 0x0000, 0x0056, 0xb011 }, { 0x2391, 0x0000, 0x0004, 0xb011 }, + { 0x2392, 0x0000, 0x00bb, 0xb011 }, { 0x2393, 0x0000, 0x006d, 0xb011 }, + { 0x2394, 0x0000, 0x0010, 0xb011 }, { 0x2395, 0x0000, 0x0064, 0xb011 }, + { 0x2396, 0x0000, 0x00b6, 0xb011 }, { 0x2397, 0x0000, 0x0028, 0xb011 }, + { 0x2398, 0x0000, 0x000b, 0xb011 }, { 0x239a, 0x0000, 0x0099, 0xb011 }, + { 0x239b, 0x0000, 0x000d, 0xb011 }, { 0x23a6, 0x0000, 0x0064, 0xb011 }, + { 0x23a7, 0x0000, 0x0078, 0xb011 }, { 0x23b9, 0x0000, 0x0000, 0xb011 }, + { 0x23e0, 0x0000, 0x0021, 0xb011 }, { 0x23e1, 0x0000, 0x0001, 0xb011 }, + }; + + static const struct alc298_samsung_coeff_seq_desc amp_0x39[] = { + { 0x2000, 0x0000, 0x0001, 0xb011 }, { 0x23ff, 0x0000, 0x0000, 0xb011 }, + { 0x203a, 0x0000, 0x0080, 0xb011 }, { 0x23e1, 0x0000, 0x0000, 0xb011 }, + { 0x2012, 0x0000, 0x006f, 0xb011 }, { 0x2014, 0x0000, 0x0000, 0xb011 }, + { 0x201b, 0x0000, 0x0002, 0xb011 }, { 0x201d, 0x0000, 0x0002, 0xb011 }, + { 0x201f, 0x0000, 0x00fd, 0xb011 }, { 0x2021, 0x0000, 0x0001, 0xb011 }, + { 0x2022, 0x0000, 0x0010, 0xb011 }, { 0x203d, 0x0000, 0x0005, 0xb011 }, + { 0x203f, 0x0000, 0x0003, 0xb011 }, { 0x2050, 0x0000, 0x002c, 0xb011 }, + { 0x2076, 0x0000, 0x000e, 0xb011 }, { 0x207c, 0x0000, 0x004a, 0xb011 }, + { 0x2081, 0x0000, 0x0003, 0xb011 }, { 0x2399, 0x0000, 0x0003, 0xb011 }, + { 0x23a4, 0x0000, 0x00b5, 0xb011 }, { 0x23a5, 0x0000, 0x0001, 0xb011 }, + { 0x23ba, 0x0000, 0x0094, 0xb011 }, { 0x2100, 0x00d0, 0x950e, 0xb017 }, + { 0x2104, 0x0061, 0xd4e2, 0xb017 }, { 0x2108, 0x00d0, 0x950e, 0xb017 }, + { 0x210c, 0x0075, 
0xf4e2, 0xb017 }, { 0x2110, 0x00b4, 0x4b0d, 0xb017 }, + { 0x2114, 0x000a, 0x1000, 0xb017 }, { 0x2118, 0x0015, 0x2000, 0xb017 }, + { 0x211c, 0x000a, 0x1000, 0xb017 }, { 0x2120, 0x0075, 0xf4e2, 0xb017 }, + { 0x2124, 0x00b4, 0x4b0d, 0xb017 }, { 0x2128, 0x0000, 0x0010, 0xb017 }, + { 0x212c, 0x0000, 0x0000, 0xb017 }, { 0x2130, 0x0000, 0x0000, 0xb017 }, + { 0x2134, 0x0000, 0x0000, 0xb017 }, { 0x2138, 0x0000, 0x0000, 0xb017 }, + { 0x213c, 0x0000, 0x0010, 0xb017 }, { 0x2140, 0x0000, 0x0000, 0xb017 }, + { 0x2144, 0x0000, 0x0000, 0xb017 }, { 0x2148, 0x0000, 0x0000, 0xb017 }, + { 0x214c, 0x0000, 0x0000, 0xb017 }, { 0x2150, 0x0000, 0x0010, 0xb017 }, + { 0x2154, 0x0000, 0x0000, 0xb017 }, { 0x2158, 0x0000, 0x0000, 0xb017 }, + { 0x215c, 0x0000, 0x0000, 0xb017 }, { 0x2160, 0x0000, 0x0000, 0xb017 }, + { 0x2164, 0x0000, 0x0010, 0xb017 }, { 0x2168, 0x0000, 0x0000, 0xb017 }, + { 0x216c, 0x0000, 0x0000, 0xb017 }, { 0x2170, 0x0000, 0x0000, 0xb017 }, + { 0x2174, 0x0000, 0x0000, 0xb017 }, { 0x2178, 0x0000, 0x0010, 0xb017 }, + { 0x217c, 0x0000, 0x0000, 0xb017 }, { 0x2180, 0x0000, 0x0000, 0xb017 }, + { 0x2184, 0x0000, 0x0000, 0xb017 }, { 0x2188, 0x0000, 0x0000, 0xb017 }, + { 0x218c, 0x0064, 0x5800, 0xb017 }, { 0x2190, 0x00c8, 0xb000, 0xb017 }, + { 0x2194, 0x0064, 0x5800, 0xb017 }, { 0x2198, 0x003d, 0x5be7, 0xb017 }, + { 0x219c, 0x0054, 0x060a, 0xb017 }, { 0x21a0, 0x00c8, 0xa310, 0xb017 }, + { 0x21a4, 0x0029, 0x4de5, 0xb017 }, { 0x21a8, 0x0032, 0x420c, 0xb017 }, + { 0x21ac, 0x0029, 0x4de5, 0xb017 }, { 0x21b0, 0x00fa, 0xe50c, 0xb017 }, + { 0x21b4, 0x0000, 0x0010, 0xb017 }, { 0x21b8, 0x0000, 0x0000, 0xb017 }, + { 0x21bc, 0x0000, 0x0000, 0xb017 }, { 0x21c0, 0x0000, 0x0000, 0xb017 }, + { 0x21c4, 0x0000, 0x0000, 0xb017 }, { 0x21c8, 0x0056, 0xc50f, 0xb017 }, + { 0x21cc, 0x007b, 0xd7e1, 0xb017 }, { 0x21d0, 0x0077, 0xa70e, 0xb017 }, + { 0x21d4, 0x00e0, 0xbde1, 0xb017 }, { 0x21d8, 0x0032, 0x530e, 0xb017 }, + { 0x2204, 0x00fb, 0x7e0f, 0xb017 }, { 0x2208, 0x000b, 0x02e1, 0xb017 }, + { 0x220c, 0x00fb, 0x7e0f, 0xb017 }, { 0x2210, 0x00d5, 0x17e1, 0xb017 }, + { 0x2214, 0x00c0, 0x130f, 0xb017 }, { 0x2218, 0x00e5, 0x0a00, 0xb017 }, + { 0x221c, 0x00cb, 0x1500, 0xb017 }, { 0x2220, 0x00e5, 0x0a00, 0xb017 }, + { 0x2224, 0x00d5, 0x17e1, 0xb017 }, { 0x2228, 0x00c0, 0x130f, 0xb017 }, + { 0x222c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2230, 0x0017, 0x48e2, 0xb017 }, + { 0x2234, 0x00f5, 0xdb0e, 0xb017 }, { 0x2238, 0x00ef, 0x5ce2, 0xb017 }, + { 0x223c, 0x00c1, 0xcc0d, 0xb017 }, { 0x2240, 0x00f5, 0xdb0e, 0xb017 }, + { 0x2244, 0x0017, 0x48e2, 0xb017 }, { 0x2248, 0x00f5, 0xdb0e, 0xb017 }, + { 0x224c, 0x00ef, 0x5ce2, 0xb017 }, { 0x2250, 0x00c1, 0xcc0d, 0xb017 }, + { 0x2254, 0x00f5, 0xdb0e, 0xb017 }, { 0x2258, 0x0017, 0x48e2, 0xb017 }, + { 0x225c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2260, 0x00ef, 0x5ce2, 0xb017 }, + { 0x2264, 0x00c1, 0xcc0d, 0xb017 }, { 0x2268, 0x00f5, 0xdb0e, 0xb017 }, + { 0x226c, 0x0017, 0x48e2, 0xb017 }, { 0x2270, 0x00f5, 0xdb0e, 0xb017 }, + { 0x2274, 0x00ef, 0x5ce2, 0xb017 }, { 0x2278, 0x00c1, 0xcc0d, 0xb017 }, + { 0x227c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2280, 0x0017, 0x48e2, 0xb017 }, + { 0x2284, 0x00f5, 0xdb0e, 0xb017 }, { 0x2288, 0x00ef, 0x5ce2, 0xb017 }, + { 0x228c, 0x00c1, 0xcc0d, 0xb017 }, { 0x22cc, 0x00e8, 0x8d00, 0xb017 }, + { 0x22d0, 0x0000, 0x0000, 0xb017 }, { 0x22d4, 0x0018, 0x72ff, 0xb017 }, + { 0x22d8, 0x00ce, 0x25e1, 0xb017 }, { 0x22dc, 0x002f, 0xe40e, 0xb017 }, + { 0x238e, 0x0000, 0x0099, 0xb011 }, { 0x238f, 0x0000, 0x0011, 0xb011 }, + { 0x2390, 0x0000, 0x0056, 0xb011 }, { 0x2391, 0x0000, 0x0004, 0xb011 }, + { 0x2392, 0x0000, 
0x00bb, 0xb011 }, { 0x2393, 0x0000, 0x006d, 0xb011 }, + { 0x2394, 0x0000, 0x0010, 0xb011 }, { 0x2395, 0x0000, 0x0064, 0xb011 }, + { 0x2396, 0x0000, 0x00b6, 0xb011 }, { 0x2397, 0x0000, 0x0028, 0xb011 }, + { 0x2398, 0x0000, 0x000b, 0xb011 }, { 0x239a, 0x0000, 0x0099, 0xb011 }, + { 0x239b, 0x0000, 0x000d, 0xb011 }, { 0x23a6, 0x0000, 0x0064, 0xb011 }, + { 0x23a7, 0x0000, 0x0078, 0xb011 }, { 0x23b9, 0x0000, 0x0000, 0xb011 }, + { 0x23e0, 0x0000, 0x0021, 0xb011 }, { 0x23e1, 0x0000, 0x0001, 0xb011 }, + }; + + static const struct alc298_samsung_coeff_seq_desc amp_0x3c[] = { + { 0x2000, 0x0000, 0x0001, 0xb011 }, { 0x23ff, 0x0000, 0x0000, 0xb011 }, + { 0x203a, 0x0000, 0x0080, 0xb011 }, { 0x23e1, 0x0000, 0x0000, 0xb011 }, + { 0x2012, 0x0000, 0x006f, 0xb011 }, { 0x2014, 0x0000, 0x0000, 0xb011 }, + { 0x201b, 0x0000, 0x0001, 0xb011 }, { 0x201d, 0x0000, 0x0001, 0xb011 }, + { 0x201f, 0x0000, 0x00fe, 0xb011 }, { 0x2021, 0x0000, 0x0000, 0xb011 }, + { 0x2022, 0x0000, 0x0010, 0xb011 }, { 0x203d, 0x0000, 0x0005, 0xb011 }, + { 0x203f, 0x0000, 0x0003, 0xb011 }, { 0x2050, 0x0000, 0x002c, 0xb011 }, + { 0x2076, 0x0000, 0x000e, 0xb011 }, { 0x207c, 0x0000, 0x004a, 0xb011 }, + { 0x2081, 0x0000, 0x0003, 0xb011 }, { 0x23ba, 0x0000, 0x008d, 0xb011 }, + { 0x2128, 0x0005, 0x460d, 0xb017 }, { 0x212c, 0x00f6, 0x73e5, 0xb017 }, + { 0x2130, 0x0005, 0x460d, 0xb017 }, { 0x2134, 0x00c0, 0xe9e5, 0xb017 }, + { 0x2138, 0x00d5, 0x010b, 0xb017 }, { 0x213c, 0x009d, 0x7809, 0xb017 }, + { 0x2140, 0x00c5, 0x0eed, 0xb017 }, { 0x2144, 0x009d, 0x7809, 0xb017 }, + { 0x2148, 0x00c4, 0x4ef0, 0xb017 }, { 0x214c, 0x003a, 0x3106, 0xb017 }, + { 0x2150, 0x00af, 0x750e, 0xb017 }, { 0x2154, 0x008c, 0x1ff1, 0xb017 }, + { 0x2158, 0x009e, 0x360c, 0xb017 }, { 0x215c, 0x008c, 0x1ff1, 0xb017 }, + { 0x2160, 0x004d, 0xac0a, 0xb017 }, { 0x2164, 0x007d, 0xa00f, 0xb017 }, + { 0x2168, 0x00e1, 0x9ce3, 0xb017 }, { 0x216c, 0x00e8, 0x590e, 0xb017 }, + { 0x2170, 0x00e1, 0x9ce3, 0xb017 }, { 0x2174, 0x0066, 0xfa0d, 0xb017 }, + { 0x2178, 0x0000, 0x0010, 0xb017 }, { 0x217c, 0x0000, 0x0000, 0xb017 }, + { 0x2180, 0x0000, 0x0000, 0xb017 }, { 0x2184, 0x0000, 0x0000, 0xb017 }, + { 0x2188, 0x0000, 0x0000, 0xb017 }, { 0x218c, 0x0000, 0x0010, 0xb017 }, + { 0x2190, 0x0000, 0x0000, 0xb017 }, { 0x2194, 0x0000, 0x0000, 0xb017 }, + { 0x2198, 0x0000, 0x0000, 0xb017 }, { 0x219c, 0x0000, 0x0000, 0xb017 }, + { 0x21a0, 0x0000, 0x0010, 0xb017 }, { 0x21a4, 0x0000, 0x0000, 0xb017 }, + { 0x21a8, 0x0000, 0x0000, 0xb017 }, { 0x21ac, 0x0000, 0x0000, 0xb017 }, + { 0x21b0, 0x0000, 0x0000, 0xb017 }, { 0x21b4, 0x0000, 0x0010, 0xb017 }, + { 0x21b8, 0x0000, 0x0000, 0xb017 }, { 0x21bc, 0x0000, 0x0000, 0xb017 }, + { 0x21c0, 0x0000, 0x0000, 0xb017 }, { 0x21c4, 0x0000, 0x0000, 0xb017 }, + { 0x23b9, 0x0000, 0x0000, 0xb011 }, { 0x23e0, 0x0000, 0x0020, 0xb011 }, + { 0x23e1, 0x0000, 0x0001, 0xb011 }, + }; + + static const struct alc298_samsung_coeff_seq_desc amp_0x3d[] = { + { 0x2000, 0x0000, 0x0001, 0xb011 }, { 0x23ff, 0x0000, 0x0000, 0xb011 }, + { 0x203a, 0x0000, 0x0080, 0xb011 }, { 0x23e1, 0x0000, 0x0000, 0xb011 }, + { 0x2012, 0x0000, 0x006f, 0xb011 }, { 0x2014, 0x0000, 0x0000, 0xb011 }, + { 0x201b, 0x0000, 0x0002, 0xb011 }, { 0x201d, 0x0000, 0x0002, 0xb011 }, + { 0x201f, 0x0000, 0x00fd, 0xb011 }, { 0x2021, 0x0000, 0x0001, 0xb011 }, + { 0x2022, 0x0000, 0x0010, 0xb011 }, { 0x203d, 0x0000, 0x0005, 0xb011 }, + { 0x203f, 0x0000, 0x0003, 0xb011 }, { 0x2050, 0x0000, 0x002c, 0xb011 }, + { 0x2076, 0x0000, 0x000e, 0xb011 }, { 0x207c, 0x0000, 0x004a, 0xb011 }, + { 0x2081, 0x0000, 0x0003, 0xb011 }, { 0x23ba, 0x0000, 
0x008d, 0xb011 }, + { 0x2128, 0x0005, 0x460d, 0xb017 }, { 0x212c, 0x00f6, 0x73e5, 0xb017 }, + { 0x2130, 0x0005, 0x460d, 0xb017 }, { 0x2134, 0x00c0, 0xe9e5, 0xb017 }, + { 0x2138, 0x00d5, 0x010b, 0xb017 }, { 0x213c, 0x009d, 0x7809, 0xb017 }, + { 0x2140, 0x00c5, 0x0eed, 0xb017 }, { 0x2144, 0x009d, 0x7809, 0xb017 }, + { 0x2148, 0x00c4, 0x4ef0, 0xb017 }, { 0x214c, 0x003a, 0x3106, 0xb017 }, + { 0x2150, 0x00af, 0x750e, 0xb017 }, { 0x2154, 0x008c, 0x1ff1, 0xb017 }, + { 0x2158, 0x009e, 0x360c, 0xb017 }, { 0x215c, 0x008c, 0x1ff1, 0xb017 }, + { 0x2160, 0x004d, 0xac0a, 0xb017 }, { 0x2164, 0x007d, 0xa00f, 0xb017 }, + { 0x2168, 0x00e1, 0x9ce3, 0xb017 }, { 0x216c, 0x00e8, 0x590e, 0xb017 }, + { 0x2170, 0x00e1, 0x9ce3, 0xb017 }, { 0x2174, 0x0066, 0xfa0d, 0xb017 }, + { 0x2178, 0x0000, 0x0010, 0xb017 }, { 0x217c, 0x0000, 0x0000, 0xb017 }, + { 0x2180, 0x0000, 0x0000, 0xb017 }, { 0x2184, 0x0000, 0x0000, 0xb017 }, + { 0x2188, 0x0000, 0x0000, 0xb017 }, { 0x218c, 0x0000, 0x0010, 0xb017 }, + { 0x2190, 0x0000, 0x0000, 0xb017 }, { 0x2194, 0x0000, 0x0000, 0xb017 }, + { 0x2198, 0x0000, 0x0000, 0xb017 }, { 0x219c, 0x0000, 0x0000, 0xb017 }, + { 0x21a0, 0x0000, 0x0010, 0xb017 }, { 0x21a4, 0x0000, 0x0000, 0xb017 }, + { 0x21a8, 0x0000, 0x0000, 0xb017 }, { 0x21ac, 0x0000, 0x0000, 0xb017 }, + { 0x21b0, 0x0000, 0x0000, 0xb017 }, { 0x21b4, 0x0000, 0x0010, 0xb017 }, + { 0x21b8, 0x0000, 0x0000, 0xb017 }, { 0x21bc, 0x0000, 0x0000, 0xb017 }, + { 0x21c0, 0x0000, 0x0000, 0xb017 }, { 0x21c4, 0x0000, 0x0000, 0xb017 }, + { 0x23b9, 0x0000, 0x0000, 0xb011 }, { 0x23e0, 0x0000, 0x0020, 0xb011 }, + { 0x23e1, 0x0000, 0x0001, 0xb011 }, + }; + + static const struct alc298_samsung_coeff_seq_desc amp_seq1[] = { + { 0x23ff, 0x0000, 0x0000, 0xb011 }, { 0x203a, 0x0000, 0x0080, 0xb011 }, + }; + + static const struct alc298_samsung_coeff_fixup_desc fixups2[] = { + { 0x4f, 0xb029 }, { 0x05, 0x2be0 }, { 0x30, 0x2421 }, + }; + + + static const struct alc298_samsung_coeff_seq_desc amp_seq2[] = { + { 0x203a, 0x0000, 0x0081, 0xb011 }, { 0x23ff, 0x0000, 0x0001, 0xb011 }, + }; + + if (action != HDA_FIXUP_ACT_INIT) + return; + + // First set of fixups + for (i = 0; i < ARRAY_SIZE(fixups1); i++) + alc_write_coef_idx(codec, fixups1[i].coeff_idx, fixups1[i].coeff_value); + + // First set of writes + alc298_samsung_write_coef_pack_seq(codec, 0x38, amp_0x38, ARRAY_SIZE(amp_0x38)); + alc298_samsung_write_coef_pack_seq(codec, 0x39, amp_0x39, ARRAY_SIZE(amp_0x39)); + alc298_samsung_write_coef_pack_seq(codec, 0x3c, amp_0x3c, ARRAY_SIZE(amp_0x3c)); + alc298_samsung_write_coef_pack_seq(codec, 0x3d, amp_0x3d, ARRAY_SIZE(amp_0x3d)); + + // Second set of writes + alc298_samsung_write_coef_pack_seq(codec, 0x38, amp_seq1, ARRAY_SIZE(amp_seq1)); + alc298_samsung_write_coef_pack_seq(codec, 0x39, amp_seq1, ARRAY_SIZE(amp_seq1)); + alc298_samsung_write_coef_pack_seq(codec, 0x3c, amp_seq1, ARRAY_SIZE(amp_seq1)); + alc298_samsung_write_coef_pack_seq(codec, 0x3d, amp_seq1, ARRAY_SIZE(amp_seq1)); + + // Second set of fixups + for (i = 0; i < ARRAY_SIZE(fixups2); i++) + alc_write_coef_idx(codec, fixups2[i].coeff_idx, fixups2[i].coeff_value); + + // Third set of writes + alc298_samsung_write_coef_pack_seq(codec, 0x38, amp_seq2, ARRAY_SIZE(amp_seq2)); + alc298_samsung_write_coef_pack_seq(codec, 0x39, amp_seq2, ARRAY_SIZE(amp_seq2)); + alc298_samsung_write_coef_pack_seq(codec, 0x3c, amp_seq2, ARRAY_SIZE(amp_seq2)); + alc298_samsung_write_coef_pack_seq(codec, 0x3d, amp_seq2, ARRAY_SIZE(amp_seq2)); + + // Final fixup + alc_write_coef_idx(codec, 0x10, 0x0F21); +} -- cgit From 
f5c466a0fdb2d9f3650d2e3911b0735f17ba00cf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 23 Jul 2024 17:54:39 +0200 Subject: rbd: rename RBD_LOCK_STATE_RELEASING and releasing_wait ... to RBD_LOCK_STATE_QUIESCING and quiescing_wait to recognize that this state and the associated completion are backing rbd_quiesce_lock(), which isn't specific to releasing the lock. While exclusive lock does get quiesced before it's released, it also gets quiesced before an attempt to update the cookie is made and there the lock is not released as long as ceph_cls_set_cookie() succeeds. Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang --- drivers/block/rbd.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 26ff5cd2bf0a..c30d227753d7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -362,7 +362,7 @@ enum rbd_watch_state { enum rbd_lock_state { RBD_LOCK_STATE_UNLOCKED, RBD_LOCK_STATE_LOCKED, - RBD_LOCK_STATE_RELEASING, + RBD_LOCK_STATE_QUIESCING, }; /* WatchNotify::ClientId */ @@ -422,7 +422,7 @@ struct rbd_device { struct list_head running_list; struct completion acquire_wait; int acquire_err; - struct completion releasing_wait; + struct completion quiescing_wait; spinlock_t object_map_lock; u8 *object_map; @@ -525,7 +525,7 @@ static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) lockdep_assert_held(&rbd_dev->lock_rwsem); return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || - rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; + rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING; } static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) @@ -3458,12 +3458,12 @@ static void rbd_lock_del_request(struct rbd_img_request *img_req) spin_lock(&rbd_dev->lock_lists_lock); if (!list_empty(&img_req->lock_item)) { list_del_init(&img_req->lock_item); - need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING && + need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING && list_empty(&rbd_dev->running_list)); } spin_unlock(&rbd_dev->lock_lists_lock); if (need_wakeup) - complete(&rbd_dev->releasing_wait); + complete(&rbd_dev->quiescing_wait); } static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) @@ -4181,16 +4181,16 @@ static bool rbd_quiesce_lock(struct rbd_device *rbd_dev) /* * Ensure that all in-flight IO is flushed. 
*/ - rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; - rbd_assert(!completion_done(&rbd_dev->releasing_wait)); + rbd_dev->lock_state = RBD_LOCK_STATE_QUIESCING; + rbd_assert(!completion_done(&rbd_dev->quiescing_wait)); if (list_empty(&rbd_dev->running_list)) return true; up_write(&rbd_dev->lock_rwsem); - wait_for_completion(&rbd_dev->releasing_wait); + wait_for_completion(&rbd_dev->quiescing_wait); down_write(&rbd_dev->lock_rwsem); - if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) + if (rbd_dev->lock_state != RBD_LOCK_STATE_QUIESCING) return false; rbd_assert(list_empty(&rbd_dev->running_list)); @@ -5383,7 +5383,7 @@ static struct rbd_device *__rbd_dev_create(struct rbd_spec *spec) INIT_LIST_HEAD(&rbd_dev->acquiring_list); INIT_LIST_HEAD(&rbd_dev->running_list); init_completion(&rbd_dev->acquire_wait); - init_completion(&rbd_dev->releasing_wait); + init_completion(&rbd_dev->quiescing_wait); spin_lock_init(&rbd_dev->object_map_lock); -- cgit From 2237ceb71f89837ac47c5dce2aaa2c2b3a337a3c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 23 Jul 2024 18:07:59 +0200 Subject: rbd: don't assume RBD_LOCK_STATE_LOCKED for exclusive mappings Every time a watch is reestablished after getting lost, we need to update the cookie which involves quiescing exclusive lock. For this, we transition from RBD_LOCK_STATE_LOCKED to RBD_LOCK_STATE_QUIESCING roughly for the duration of rbd_reacquire_lock() call. If the mapping is exclusive and I/O happens to arrive in this time window, it's failed with EROFS (later translated to EIO) based on the wrong assumption in rbd_img_exclusive_lock() -- "lock got released?" check there stopped making sense with commit a2b1da09793d ("rbd: lock should be quiesced on reacquire"). To make it worse, any such I/O is added to the acquiring list before EROFS is returned and this sets up for violating rbd_lock_del_request() precondition that the request is either on the running list or not on any list at all -- see commit ded080c86b3f ("rbd: don't move requests to the running list on errors"). rbd_lock_del_request() ends up processing these requests as if they were on the running list which screws up quiescing_wait completion counter and ultimately leads to rbd_assert(!completion_done(&rbd_dev->quiescing_wait)); being triggered on the next watch error. Cc: stable@vger.kernel.org # 06ef84c4e9c4: rbd: rename RBD_LOCK_STATE_RELEASING and releasing_wait Cc: stable@vger.kernel.org Fixes: 637cd060537d ("rbd: new exclusive lock wait/wake code") Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang --- drivers/block/rbd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c30d227753d7..ea6c592e015c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3457,6 +3457,7 @@ static void rbd_lock_del_request(struct rbd_img_request *img_req) lockdep_assert_held(&rbd_dev->lock_rwsem); spin_lock(&rbd_dev->lock_lists_lock); if (!list_empty(&img_req->lock_item)) { + rbd_assert(!list_empty(&rbd_dev->running_list)); list_del_init(&img_req->lock_item); need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING && list_empty(&rbd_dev->running_list)); @@ -3476,11 +3477,6 @@ static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) if (rbd_lock_add_request(img_req)) return 1; - if (rbd_dev->opts->exclusive) { - WARN_ON(1); /* lock got released? */ - return -EROFS; - } - /* * Note the use of mod_delayed_work() in rbd_acquire_lock() * and cancel_delayed_work() in wake_lock_waiters(). 
@@ -4601,6 +4597,10 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev) rbd_warn(rbd_dev, "failed to update lock cookie: %d", ret); + if (rbd_dev->opts->exclusive) + rbd_warn(rbd_dev, + "temporarily releasing lock on exclusive mapping"); + /* * Lock cookie cannot be updated on older OSDs, so do * a manual release and queue an acquire. -- cgit From 3ceccb14f5576e02b81cc8b105ab81f224bd87f6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 23 Jul 2024 18:08:08 +0200 Subject: rbd: don't assume rbd_is_lock_owner() for exclusive mappings Expanding on the previous commit, assuming that rbd_is_lock_owner() always returns true (i.e. that we are either in RBD_LOCK_STATE_LOCKED or RBD_LOCK_STATE_QUIESCING) if the mapping is exclusive is wrong too. In case ceph_cls_set_cookie() fails, the lock would be temporarily released even if the mapping is exclusive, meaning that we can end up even in RBD_LOCK_STATE_UNLOCKED. IOW, exclusive mappings are really "just" about disabling automatic lock transitions (as documented in the man page), not about grabbing the lock and holding on to it whatever it takes. Cc: stable@vger.kernel.org Fixes: 637cd060537d ("rbd: new exclusive lock wait/wake code") Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang --- drivers/block/rbd.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ea6c592e015c..da22ce38c039 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -6589,11 +6589,6 @@ static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) if (ret) return ret; - /* - * The lock may have been released by now, unless automatic lock - * transitions are disabled. - */ - rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev)); return 0; } -- cgit From fc05ea89c9ab45e70cb73e70bc0b9cdd403e0ee1 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Thu, 25 Jul 2024 09:31:14 +0200 Subject: x86/xen: move xen_reserve_extra_memory() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for making the function static. No functional change. 
Signed-off-by: Roger Pau Monné Reviewed-by: Juergen Gross Message-ID: <20240725073116.14626-2-roger.pau@citrix.com> Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_pvh.c | 96 ++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 27a2a02ef8fb..91c6db4ec054 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -27,54 +27,6 @@ bool __ro_after_init xen_pvh; EXPORT_SYMBOL_GPL(xen_pvh); -void __init xen_pvh_init(struct boot_params *boot_params) -{ - u32 msr; - u64 pfn; - - xen_pvh = 1; - xen_domain_type = XEN_HVM_DOMAIN; - xen_start_flags = pvh_start_info.flags; - - msr = cpuid_ebx(xen_cpuid_base() + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - - if (xen_initial_domain()) - x86_init.oem.arch_setup = xen_add_preferred_consoles; - x86_init.oem.banner = xen_banner; - - xen_efi_init(boot_params); - - if (xen_initial_domain()) { - struct xen_platform_op op = { - .cmd = XENPF_get_dom0_console, - }; - int ret = HYPERVISOR_platform_op(&op); - - if (ret > 0) - xen_init_vga(&op.u.dom0_console, - min(ret * sizeof(char), - sizeof(op.u.dom0_console)), - &boot_params->screen_info); - } -} - -void __init mem_map_via_hcall(struct boot_params *boot_params_p) -{ - struct xen_memory_map memmap; - int rc; - - memmap.nr_entries = ARRAY_SIZE(boot_params_p->e820_table); - set_xen_guest_handle(memmap.buffer, boot_params_p->e820_table); - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); - if (rc) { - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); - BUG(); - } - boot_params_p->e820_entries = memmap.nr_entries; -} - /* * Reserve e820 UNUSABLE regions to inflate the memory balloon. 
* @@ -141,3 +93,51 @@ void __init xen_reserve_extra_memory(struct boot_params *bootp) xen_add_extra_mem(PFN_UP(e->addr), pages); } } + +void __init xen_pvh_init(struct boot_params *boot_params) +{ + u32 msr; + u64 pfn; + + xen_pvh = 1; + xen_domain_type = XEN_HVM_DOMAIN; + xen_start_flags = pvh_start_info.flags; + + msr = cpuid_ebx(xen_cpuid_base() + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + if (xen_initial_domain()) + x86_init.oem.arch_setup = xen_add_preferred_consoles; + x86_init.oem.banner = xen_banner; + + xen_efi_init(boot_params); + + if (xen_initial_domain()) { + struct xen_platform_op op = { + .cmd = XENPF_get_dom0_console, + }; + int ret = HYPERVISOR_platform_op(&op); + + if (ret > 0) + xen_init_vga(&op.u.dom0_console, + min(ret * sizeof(char), + sizeof(op.u.dom0_console)), + &boot_params->screen_info); + } +} + +void __init mem_map_via_hcall(struct boot_params *boot_params_p) +{ + struct xen_memory_map memmap; + int rc; + + memmap.nr_entries = ARRAY_SIZE(boot_params_p->e820_table); + set_xen_guest_handle(memmap.buffer, boot_params_p->e820_table); + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc) { + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); + BUG(); + } + boot_params_p->e820_entries = memmap.nr_entries; +} -- cgit From 4c006734898a113a64a528027274a571b04af95a Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Thu, 25 Jul 2024 09:31:15 +0200 Subject: x86/xen: fix memblock_reserve() usage on PVH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current usage of memblock_reserve() in init_pvh_bootparams() is done before the .bss is zeroed, and that used to be fine when memblock_reserved_init_regions implicitly ended up in the .meminit.data section. However after commit 73db3abdca58c memblock_reserved_init_regions ends up in the .bss section, thus breaking it's usage before the .bss is cleared. Move and rename the call to xen_reserve_extra_memory() so it's done in the x86_init.oem.arch_setup hook, which gets executed after the .bss has been zeroed, but before calling e820__memory_setup(). 
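As a rough user-space analogue (illustrative only, not code from this patch): state recorded before its backing storage is zeroed is simply lost, which is what happened once memblock's reserved-regions array moved into .bss.

    #include <stdio.h>
    #include <string.h>

    static unsigned long reserved[4];   /* stand-in for the .bss-resident array */
    static unsigned int nr_reserved;

    static void reserve(unsigned long addr)
    {
            reserved[nr_reserved++] = addr;
    }

    int main(void)
    {
            reserve(0x100000);                      /* "too early" reservation */
            memset(reserved, 0, sizeof(reserved));  /* stand-in for .bss clearing */
            nr_reserved = 0;
            reserve(0x200000);                      /* a call deferred until after
                                                       the clearing survives */
            printf("%u reservation(s) kept, first at 0x%lx\n",
                   nr_reserved, reserved[0]);
            return 0;
    }

The patch below gets the same effect by postponing the reservation until the x86_init.oem.arch_setup hook runs.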
Fixes: 73db3abdca58c ("init/modpost: conditionally check section mismatch to __meminit*") Signed-off-by: Roger Pau Monné Reviewed-by: Juergen Gross Message-ID: <20240725073116.14626-3-roger.pau@citrix.com> Signed-off-by: Juergen Gross --- arch/x86/include/asm/xen/hypervisor.h | 5 ----- arch/x86/platform/pvh/enlighten.c | 3 --- arch/x86/xen/enlighten_pvh.c | 15 ++++++++++++--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 64fbd2dbc5b7..a9088250770f 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -62,11 +62,6 @@ void xen_arch_unregister_cpu(int num); #ifdef CONFIG_PVH void __init xen_pvh_init(struct boot_params *boot_params); void __init mem_map_via_hcall(struct boot_params *boot_params_p); -#ifdef CONFIG_XEN_PVH -void __init xen_reserve_extra_memory(struct boot_params *bootp); -#else -static inline void xen_reserve_extra_memory(struct boot_params *bootp) { } -#endif #endif /* Lazy mode for batching updates / context switch */ diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index 8c2d4b8de25d..944e0290f2c0 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -75,9 +75,6 @@ static void __init init_pvh_bootparams(bool xen_guest) } else xen_raw_printk("Warning: Can fit ISA range into e820\n"); - if (xen_guest) - xen_reserve_extra_memory(&pvh_bootparams); - pvh_bootparams.hdr.cmd_line_ptr = pvh_start_info.cmdline_paddr; diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 91c6db4ec054..728a4366ca85 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -41,8 +42,9 @@ EXPORT_SYMBOL_GPL(xen_pvh); * hypervisor should notify us which memory ranges are suitable for creating * foreign mappings, but that's not yet implemented. */ -void __init xen_reserve_extra_memory(struct boot_params *bootp) +static void __init pvh_reserve_extra_memory(void) { + struct boot_params *bootp = &boot_params; unsigned int i, ram_pages = 0, extra_pages; for (i = 0; i < bootp->e820_entries; i++) { @@ -94,6 +96,14 @@ void __init xen_reserve_extra_memory(struct boot_params *bootp) } } +static void __init pvh_arch_setup(void) +{ + pvh_reserve_extra_memory(); + + if (xen_initial_domain()) + xen_add_preferred_consoles(); +} + void __init xen_pvh_init(struct boot_params *boot_params) { u32 msr; @@ -107,8 +117,7 @@ void __init xen_pvh_init(struct boot_params *boot_params) pfn = __pa(hypercall_page); wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - if (xen_initial_domain()) - x86_init.oem.arch_setup = xen_add_preferred_consoles; + x86_init.oem.arch_setup = pvh_arch_setup; x86_init.oem.banner = xen_banner; xen_efi_init(boot_params); -- cgit From 1d9ce4440414c92acb17eece3218fe5c92b141e3 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 25 Jul 2024 08:54:28 +0200 Subject: ASoC: amd: yc: Support mic on Lenovo Thinkpad E16 Gen 2 Lenovo Thinkpad E16 Gen 2 AMD model (model 21M5) needs a corresponding quirk entry for making the internal mic working. 
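For anyone mapping a machine onto such a quirk, the strings the DMI table matches on are exported through sysfs; a minimal check (illustrative only, not part of this patch) could look like:

    #include <stdio.h>

    int main(void)
    {
            char name[64] = "";
            FILE *f = fopen("/sys/class/dmi/id/product_name", "r");

            if (f) {
                    if (fgets(name, sizeof(name), f))
                            /* "21M5" is what the new quirk entry keys on */
                            printf("DMI product name: %s", name);
                    fclose(f);
            }
            return 0;
    }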
Link: https://bugzilla.suse.com/show_bug.cgi?id=1228269 Cc: stable@vger.kernel.org Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20240725065442.9293-1-tiwai@suse.de Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index f54466ed8e3e..1769e07e83dc 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -220,6 +220,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "21J6"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "21M5"), + } + }, { .driver_data = &acp6x_card, .matches = { -- cgit From 7b52a9d9ee6c5505a51ebe03703a56a72c75f9de Mon Sep 17 00:00:00 2001 From: Huqiang Qin Date: Tue, 9 Jul 2024 16:48:25 +0800 Subject: dt-bindings: watchdog: add support for Amlogic A4 SoCs Update dt-binding document for watchdog of Amlogic A4 SoCs. Signed-off-by: Huqiang Qin Signed-off-by: Xianwei Zhao Acked-by: Krzysztof Kozlowski Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20240709-a4-a5_watchdog-v1-1-2ae852e05ec2@amlogic.com Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/watchdog/amlogic,meson-gxbb-wdt.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/watchdog/amlogic,meson-gxbb-wdt.yaml b/Documentation/devicetree/bindings/watchdog/amlogic,meson-gxbb-wdt.yaml index 69845ec32e81..d0eff1ea52b4 100644 --- a/Documentation/devicetree/bindings/watchdog/amlogic,meson-gxbb-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/amlogic,meson-gxbb-wdt.yaml @@ -21,6 +21,7 @@ properties: - amlogic,t7-wdt - items: - enum: + - amlogic,a4-wdt - amlogic,c3-wdt - amlogic,s4-wdt - const: amlogic,t7-wdt -- cgit From 9722c3b66e21ff08aec570d02a97d331087fd70f Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Wed, 24 Jul 2024 18:33:06 +0200 Subject: of: remove internal arguments from of_property_for_each_u32() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The of_property_for_each_u32() macro needs five parameters, two of which are primarily meant as internal variables for the macro itself (in the for() clause). Yet these two parameters are used by a few drivers, and this can be considered misuse or at least bad practice. Now that the kernel uses C11 to build, these two parameters can be avoided by declaring them internally, thus changing this pattern: struct property *prop; const __be32 *p; u32 val; of_property_for_each_u32(np, "xyz", prop, p, val) { ... } to this: u32 val; of_property_for_each_u32(np, "xyz", val) { ... } However two variables cannot be declared in the for clause even with C11, so declare one struct that contain the two variables we actually need. As the variables inside this struct are not meant to be used by users of this macro, give the struct instance the noticeable name "_it" so it is visible during code reviews, helping to avoid new code to use it directly. Most usages are trivially converted as they do not use those two parameters, as expected. 
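As an aside, the C11 construct the new macro leans on -- a single anonymous-struct instance declared in the for-init clause so that both internal cursors stay private to the loop -- can be sketched in plain user-space C (illustrative only, names assumed):

    #include <stdio.h>

    int main(void)
    {
            static const int vals[] = { 10, 20, 30 };

            /* one declaration carries both loop-internal cursors,
             * mirroring the prop/item pair hidden inside the macro */
            for (struct { const int *p; unsigned int i; } _it = { vals, 0 };
                 _it.i < sizeof(vals) / sizeof(vals[0]);
                 _it.i++)
                    printf("index %u -> value %d\n", _it.i, _it.p[_it.i]);

            return 0;
    }

The kernel macro does the same thing with a struct holding the struct property pointer and the __be32 item cursor, as seen in the include/linux/of.h hunk further down.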
The non-trivial cases are: - drivers/clk/clk.c, of_clk_get_parent_name(): easily doable anyway - drivers/clk/clk-si5351.c, si5351_dt_parse(): this is more complex as the checks had to be replicated in a different way, making code more verbose and somewhat uglier, but I refrained from a full rework to keep as much of the original code untouched having no hardware to test my changes All the changes have been build tested. The few for which I have the hardware have been runtime-tested too. Reviewed-by: Andre Przywara # drivers/clk/sunxi/clk-simple-gates.c, drivers/clk/sunxi/clk-sun8i-bus-gates.c Acked-by: Bartosz Golaszewski # drivers/gpio/gpio-brcmstb.c Acked-by: Nicolas Ferre # drivers/irqchip/irq-atmel-aic-common.c Acked-by: Jonathan Cameron # drivers/iio/adc/ti_am335x_adc.c Acked-by: Uwe Kleine-König # drivers/pwm/pwm-samsung.c Acked-by: Richard Leitner # drivers/usb/misc/usb251xb.c Acked-by: Mark Brown # sound/soc/codecs/arizona.c Reviewed-by: Richard Fitzgerald # sound/soc/codecs/arizona.c Acked-by: Michael Ellerman # arch/powerpc/sysdev/xive/spapr.c Acked-by: Stephen Boyd # clk Signed-off-by: Luca Ceresoli Acked-by: Lee Jones Link: https://lore.kernel.org/r/20240724-of_property_for_each_u32-v3-1-bea82ce429e2@bootlin.com Signed-off-by: Rob Herring (Arm) --- arch/powerpc/sysdev/xive/native.c | 4 +-- arch/powerpc/sysdev/xive/spapr.c | 3 +-- drivers/bus/ti-sysc.c | 4 +-- drivers/clk/clk-conf.c | 4 +-- drivers/clk/clk-si5351.c | 43 ++++++++++++++++++++------------- drivers/clk/clk.c | 12 ++++----- drivers/clk/qcom/common.c | 4 +-- drivers/clk/sunxi/clk-simple-gates.c | 4 +-- drivers/clk/sunxi/clk-sun8i-bus-gates.c | 4 +-- drivers/clocksource/samsung_pwm_timer.c | 4 +-- drivers/gpio/gpio-brcmstb.c | 5 +--- drivers/iio/adc/ti_am335x_adc.c | 4 +-- drivers/irqchip/irq-atmel-aic-common.c | 4 +-- drivers/irqchip/irq-pic32-evic.c | 4 +-- drivers/mfd/ti_am335x_tscadc.c | 4 +-- drivers/pinctrl/nxp/pinctrl-s32cc.c | 4 +-- drivers/pinctrl/pinctrl-k210.c | 4 +-- drivers/pwm/pwm-samsung.c | 4 +-- drivers/tty/sysrq.c | 4 +-- drivers/usb/misc/usb251xb.c | 4 +-- include/linux/of.h | 15 ++++++------ sound/soc/codecs/arizona.c | 12 ++++----- 22 files changed, 61 insertions(+), 93 deletions(-) diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 517b963e3e6a..a0934b516933 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -559,9 +559,7 @@ bool __init xive_native_init(void) struct device_node *np; struct resource r; void __iomem *tima; - struct property *prop; u8 max_prio = 7; - const __be32 *p; u32 val, cpu; s64 rc; @@ -592,7 +590,7 @@ bool __init xive_native_init(void) max_prio = val - 1; /* Iterate the EQ sizes and pick one */ - of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) { + of_property_for_each_u32(np, "ibm,xive-eq-sizes", val) { xive_queue_shift = val; if (val == PAGE_SHIFT) break; diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index e45419264391..f2fa985a2c77 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -814,7 +814,6 @@ bool __init xive_spapr_init(void) struct device_node *np; struct resource r; void __iomem *tima; - struct property *prop; u8 max_prio; u32 val; u32 len; @@ -866,7 +865,7 @@ bool __init xive_spapr_init(void) } /* Iterate the EQ sizes and pick one */ - of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, reg, val) { + of_property_for_each_u32(np, "ibm,xive-eq-sizes", val) { xive_queue_shift = val; if (val == 
PAGE_SHIFT) break; diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 8767e04d6c89..2b59ef61dda2 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -2291,11 +2291,9 @@ static int sysc_init_idlemode(struct sysc *ddata, u8 *idlemodes, const char *name) { struct device_node *np = ddata->dev->of_node; - struct property *prop; - const __be32 *p; u32 val; - of_property_for_each_u32(np, name, prop, p, val) { + of_property_for_each_u32(np, name, val) { if (val >= SYSC_NR_IDLEMODES) { dev_err(ddata->dev, "invalid idlemode: %i\n", val); return -EINVAL; diff --git a/drivers/clk/clk-conf.c b/drivers/clk/clk-conf.c index 1a4e6340f95c..058420562020 100644 --- a/drivers/clk/clk-conf.c +++ b/drivers/clk/clk-conf.c @@ -81,13 +81,11 @@ err: static int __set_clk_rates(struct device_node *node, bool clk_supplier) { struct of_phandle_args clkspec; - struct property *prop; - const __be32 *cur; int rc, index = 0; struct clk *clk; u32 rate; - of_property_for_each_u32(node, "assigned-clock-rates", prop, cur, rate) { + of_property_for_each_u32(node, "assigned-clock-rates", rate) { if (rate) { rc = of_parse_phandle_with_args(node, "assigned-clocks", "#clock-cells", index, &clkspec); diff --git a/drivers/clk/clk-si5351.c b/drivers/clk/clk-si5351.c index 4ce83c5265b8..a4c92c5ef3ff 100644 --- a/drivers/clk/clk-si5351.c +++ b/drivers/clk/clk-si5351.c @@ -1175,8 +1175,8 @@ static int si5351_dt_parse(struct i2c_client *client, { struct device_node *child, *np = client->dev.of_node; struct si5351_platform_data *pdata; - struct property *prop; - const __be32 *p; + u32 array[4]; + int sz, i; int num = 0; u32 val; @@ -1191,20 +1191,24 @@ static int si5351_dt_parse(struct i2c_client *client, * property silabs,pll-source : , [<..>] * allow to selectively set pll source */ - of_property_for_each_u32(np, "silabs,pll-source", prop, p, num) { + sz = of_property_read_variable_u32_array(np, "silabs,pll-source", array, 2, 4); + sz = (sz == -EINVAL) ? 0 : sz; /* Missing property is OK */ + if (sz < 0) + return dev_err_probe(&client->dev, sz, "invalid pll-source\n"); + if (sz % 2) + return dev_err_probe(&client->dev, -EINVAL, + "missing pll-source for pll %d\n", array[sz - 1]); + + for (i = 0; i < sz; i += 2) { + num = array[i]; + val = array[i + 1]; + if (num >= 2) { dev_err(&client->dev, "invalid pll %d on pll-source prop\n", num); return -EINVAL; } - p = of_prop_next_u32(prop, p, &val); - if (!p) { - dev_err(&client->dev, - "missing pll-source for pll %d\n", num); - return -EINVAL; - } - switch (val) { case 0: pdata->pll_src[num] = SI5351_PLL_SRC_XTAL; @@ -1232,19 +1236,24 @@ static int si5351_dt_parse(struct i2c_client *client, pdata->pll_reset[0] = true; pdata->pll_reset[1] = true; - of_property_for_each_u32(np, "silabs,pll-reset-mode", prop, p, num) { + sz = of_property_read_variable_u32_array(np, "silabs,pll-reset-mode", array, 2, 4); + sz = (sz == -EINVAL) ? 
0 : sz; /* Missing property is OK */ + if (sz < 0) + return dev_err_probe(&client->dev, sz, "invalid pll-reset-mode\n"); + if (sz % 2) + return dev_err_probe(&client->dev, -EINVAL, + "missing pll-reset-mode for pll %d\n", array[sz - 1]); + + for (i = 0; i < sz; i += 2) { + num = array[i]; + val = array[i + 1]; + if (num >= 2) { dev_err(&client->dev, "invalid pll %d on pll-reset-mode prop\n", num); return -EINVAL; } - p = of_prop_next_u32(prop, p, &val); - if (!p) { - dev_err(&client->dev, - "missing pll-reset-mode for pll %d\n", num); - return -EINVAL; - } switch (val) { case 0: diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 8cca52be993f..285ed1ad8a37 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -5364,9 +5364,8 @@ EXPORT_SYMBOL_GPL(of_clk_get_parent_count); const char *of_clk_get_parent_name(const struct device_node *np, int index) { struct of_phandle_args clkspec; - struct property *prop; const char *clk_name; - const __be32 *vp; + bool found = false; u32 pv; int rc; int count; @@ -5383,15 +5382,16 @@ const char *of_clk_get_parent_name(const struct device_node *np, int index) /* if there is an indices property, use it to transfer the index * specified into an array offset for the clock-output-names property. */ - of_property_for_each_u32(clkspec.np, "clock-indices", prop, vp, pv) { + of_property_for_each_u32(clkspec.np, "clock-indices", pv) { if (index == pv) { index = count; + found = true; break; } count++; } /* We went off the end of 'clock-indices' without finding it */ - if (prop && !vp) + if (of_property_present(clkspec.np, "clock-indices") && !found) return NULL; if (of_property_read_string_index(clkspec.np, "clock-output-names", @@ -5504,14 +5504,12 @@ static int parent_ready(struct device_node *np) int of_clk_detect_critical(struct device_node *np, int index, unsigned long *flags) { - struct property *prop; - const __be32 *cur; uint32_t idx; if (!np || !flags) return -EINVAL; - of_property_for_each_u32(np, "clock-critical", prop, cur, idx) + of_property_for_each_u32(np, "clock-critical", idx) if (index == idx) *flags |= CLK_IS_CRITICAL; diff --git a/drivers/clk/qcom/common.c b/drivers/clk/qcom/common.c index ea3788ba46f7..33cc1f73c69d 100644 --- a/drivers/clk/qcom/common.c +++ b/drivers/clk/qcom/common.c @@ -227,11 +227,9 @@ EXPORT_SYMBOL_GPL(qcom_cc_register_sleep_clk); static void qcom_cc_drop_protected(struct device *dev, struct qcom_cc *cc) { struct device_node *np = dev->of_node; - struct property *prop; - const __be32 *p; u32 i; - of_property_for_each_u32(np, "protected-clocks", prop, p, i) { + of_property_for_each_u32(np, "protected-clocks", i) { if (i >= cc->num_rclks) continue; diff --git a/drivers/clk/sunxi/clk-simple-gates.c b/drivers/clk/sunxi/clk-simple-gates.c index 0399627c226a..845efc1ec800 100644 --- a/drivers/clk/sunxi/clk-simple-gates.c +++ b/drivers/clk/sunxi/clk-simple-gates.c @@ -21,11 +21,9 @@ static void __init sunxi_simple_gates_setup(struct device_node *node, { struct clk_onecell_data *clk_data; const char *clk_parent, *clk_name; - struct property *prop; struct resource res; void __iomem *clk_reg; void __iomem *reg; - const __be32 *p; int number, i = 0, j; u8 clk_bit; u32 index; @@ -47,7 +45,7 @@ static void __init sunxi_simple_gates_setup(struct device_node *node, if (!clk_data->clks) goto err_free_data; - of_property_for_each_u32(node, "clock-indices", prop, p, index) { + of_property_for_each_u32(node, "clock-indices", index) { of_property_read_string_index(node, "clock-output-names", i, &clk_name); diff --git 
a/drivers/clk/sunxi/clk-sun8i-bus-gates.c b/drivers/clk/sunxi/clk-sun8i-bus-gates.c index b87f331f63c9..8482ac8e5898 100644 --- a/drivers/clk/sunxi/clk-sun8i-bus-gates.c +++ b/drivers/clk/sunxi/clk-sun8i-bus-gates.c @@ -24,11 +24,9 @@ static void __init sun8i_h3_bus_gates_init(struct device_node *node) const char *parents[PARENT_MAX]; struct clk_onecell_data *clk_data; const char *clk_name; - struct property *prop; struct resource res; void __iomem *clk_reg; void __iomem *reg; - const __be32 *p; int number, i; u8 clk_bit; int index; @@ -58,7 +56,7 @@ static void __init sun8i_h3_bus_gates_init(struct device_node *node) goto err_free_data; i = 0; - of_property_for_each_u32(node, "clock-indices", prop, p, index) { + of_property_for_each_u32(node, "clock-indices", index) { of_property_read_string_index(node, "clock-output-names", i, &clk_name); diff --git a/drivers/clocksource/samsung_pwm_timer.c b/drivers/clocksource/samsung_pwm_timer.c index 6e46781bc9ac..b9561e3f196c 100644 --- a/drivers/clocksource/samsung_pwm_timer.c +++ b/drivers/clocksource/samsung_pwm_timer.c @@ -418,8 +418,6 @@ void __init samsung_pwm_clocksource_init(void __iomem *base, static int __init samsung_pwm_alloc(struct device_node *np, const struct samsung_pwm_variant *variant) { - struct property *prop; - const __be32 *cur; u32 val; int i, ret; @@ -427,7 +425,7 @@ static int __init samsung_pwm_alloc(struct device_node *np, for (i = 0; i < SAMSUNG_PWM_NUM; ++i) pwm.irq[i] = irq_of_parse_and_map(np, i); - of_property_for_each_u32(np, "samsung,pwm-outputs", prop, cur, val) { + of_property_for_each_u32(np, "samsung,pwm-outputs", val) { if (val >= SAMSUNG_PWM_NUM) { pr_warn("%s: invalid channel index in samsung,pwm-outputs property\n", __func__); continue; diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c index 8dce78ea7139..5762e517338e 100644 --- a/drivers/gpio/gpio-brcmstb.c +++ b/drivers/gpio/gpio-brcmstb.c @@ -591,8 +591,6 @@ static int brcmstb_gpio_probe(struct platform_device *pdev) void __iomem *reg_base; struct brcmstb_gpio_priv *priv; struct resource *res; - struct property *prop; - const __be32 *p; u32 bank_width; int num_banks = 0; int num_gpios = 0; @@ -636,8 +634,7 @@ static int brcmstb_gpio_probe(struct platform_device *pdev) flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; #endif - of_property_for_each_u32(np, "brcm,gpio-bank-widths", prop, p, - bank_width) { + of_property_for_each_u32(np, "brcm,gpio-bank-widths", bank_width) { struct brcmstb_gpio_bank *bank; struct gpio_chip *gc; diff --git a/drivers/iio/adc/ti_am335x_adc.c b/drivers/iio/adc/ti_am335x_adc.c index 95fa857e8aad..426e3c9f88a1 100644 --- a/drivers/iio/adc/ti_am335x_adc.c +++ b/drivers/iio/adc/ti_am335x_adc.c @@ -564,13 +564,11 @@ static int tiadc_parse_dt(struct platform_device *pdev, struct tiadc_device *adc_dev) { struct device_node *node = pdev->dev.of_node; - struct property *prop; - const __be32 *cur; int channels = 0; u32 val; int i; - of_property_for_each_u32(node, "ti,adc-channels", prop, cur, val) { + of_property_for_each_u32(node, "ti,adc-channels", val) { adc_dev->channel_line[channels] = val; /* Set Default values for optional DT parameters */ diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 072bd227b6c6..4525366d16d6 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -111,8 +111,6 @@ static void __init aic_common_ext_irq_of_init(struct irq_domain *domain) struct device_node *node = irq_domain_get_of_node(domain); struct 
irq_chip_generic *gc; struct aic_chip_data *aic; - struct property *prop; - const __be32 *p; u32 hwirq; gc = irq_get_domain_generic_chip(domain, 0); @@ -120,7 +118,7 @@ static void __init aic_common_ext_irq_of_init(struct irq_domain *domain) aic = gc->private; aic->ext_irqs |= 1; - of_property_for_each_u32(node, "atmel,external-irqs", prop, p, hwirq) { + of_property_for_each_u32(node, "atmel,external-irqs", hwirq) { gc = irq_get_domain_generic_chip(domain, hwirq); if (!gc) { pr_warn("AIC: external irq %d >= %d skip it\n", diff --git a/drivers/irqchip/irq-pic32-evic.c b/drivers/irqchip/irq-pic32-evic.c index 1d9bb28d13e5..5d6b8e025bb8 100644 --- a/drivers/irqchip/irq-pic32-evic.c +++ b/drivers/irqchip/irq-pic32-evic.c @@ -190,13 +190,11 @@ static void __init pic32_ext_irq_of_init(struct irq_domain *domain) { struct device_node *node = irq_domain_get_of_node(domain); struct evic_chip_data *priv = domain->host_data; - struct property *prop; - const __le32 *p; u32 hwirq; int i = 0; const char *pname = "microchip,external-irqs"; - of_property_for_each_u32(node, pname, prop, p, hwirq) { + of_property_for_each_u32(node, pname, hwirq) { if (i >= ARRAY_SIZE(priv->ext_irqs)) { pr_warn("More than %d external irq, skip rest\n", ARRAY_SIZE(priv->ext_irqs)); diff --git a/drivers/mfd/ti_am335x_tscadc.c b/drivers/mfd/ti_am335x_tscadc.c index 4bbd542d753e..0c1364d88469 100644 --- a/drivers/mfd/ti_am335x_tscadc.c +++ b/drivers/mfd/ti_am335x_tscadc.c @@ -119,8 +119,6 @@ static int ti_tscadc_probe(struct platform_device *pdev) struct clk *clk; struct device_node *node; struct mfd_cell *cell; - struct property *prop; - const __be32 *cur; bool use_tsc = false, use_mag = false; u32 val; int err; @@ -167,7 +165,7 @@ static int ti_tscadc_probe(struct platform_device *pdev) } node = of_get_child_by_name(pdev->dev.of_node, "adc"); - of_property_for_each_u32(node, "ti,adc-channels", prop, cur, val) { + of_property_for_each_u32(node, "ti,adc-channels", val) { adc_channels++; if (val > 7) { dev_err(&pdev->dev, " PIN numbers are 0..7 (not %d)\n", diff --git a/drivers/pinctrl/nxp/pinctrl-s32cc.c b/drivers/pinctrl/nxp/pinctrl-s32cc.c index df3e5d82da4b..f2609a35c312 100644 --- a/drivers/pinctrl/nxp/pinctrl-s32cc.c +++ b/drivers/pinctrl/nxp/pinctrl-s32cc.c @@ -730,9 +730,7 @@ static int s32_pinctrl_parse_groups(struct device_node *np, struct s32_pin_group *grp, struct s32_pinctrl_soc_info *info) { - const __be32 *p; struct device *dev; - struct property *prop; unsigned int *pins, *sss; int i, npins; u32 pinmux; @@ -763,7 +761,7 @@ static int s32_pinctrl_parse_groups(struct device_node *np, return -ENOMEM; i = 0; - of_property_for_each_u32(np, "pinmux", prop, p, pinmux) { + of_property_for_each_u32(np, "pinmux", pinmux) { pins[i] = get_pin_no(pinmux); sss[i] = get_pin_func(pinmux); diff --git a/drivers/pinctrl/pinctrl-k210.c b/drivers/pinctrl/pinctrl-k210.c index 2753e14c3e38..a898e40451fe 100644 --- a/drivers/pinctrl/pinctrl-k210.c +++ b/drivers/pinctrl/pinctrl-k210.c @@ -763,8 +763,6 @@ static int k210_pinctrl_dt_subnode_to_map(struct pinctrl_dev *pctldev, unsigned int *reserved_maps, unsigned int *num_maps) { - struct property *prop; - const __be32 *p; int ret, pinmux_groups; u32 pinmux_group; unsigned long *configs = NULL; @@ -797,7 +795,7 @@ static int k210_pinctrl_dt_subnode_to_map(struct pinctrl_dev *pctldev, if (ret < 0) goto exit; - of_property_for_each_u32(np, "pinmux", prop, p, pinmux_group) { + of_property_for_each_u32(np, "pinmux", pinmux_group) { const char *group_name, *func_name; u32 pin = 
FIELD_GET(K210_PG_PIN, pinmux_group); u32 func = FIELD_GET(K210_PG_FUNC, pinmux_group); diff --git a/drivers/pwm/pwm-samsung.c b/drivers/pwm/pwm-samsung.c index 7adf4f2b1049..951b38ff5f8e 100644 --- a/drivers/pwm/pwm-samsung.c +++ b/drivers/pwm/pwm-samsung.c @@ -510,8 +510,6 @@ static int pwm_samsung_parse_dt(struct pwm_chip *chip) struct samsung_pwm_chip *our_chip = to_samsung_pwm_chip(chip); struct device_node *np = pwmchip_parent(chip)->of_node; const struct of_device_id *match; - struct property *prop; - const __be32 *cur; u32 val; match = of_match_node(samsung_pwm_matches, np); @@ -520,7 +518,7 @@ static int pwm_samsung_parse_dt(struct pwm_chip *chip) memcpy(&our_chip->variant, match->data, sizeof(our_chip->variant)); - of_property_for_each_u32(np, "samsung,pwm-outputs", prop, cur, val) { + of_property_for_each_u32(np, "samsung,pwm-outputs", val) { if (val >= SAMSUNG_PWM_NUM) { dev_err(pwmchip_parent(chip), "%s: invalid channel index in samsung,pwm-outputs property\n", diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index e5974b8239c9..dab332681036 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -770,8 +770,6 @@ static void sysrq_of_get_keyreset_config(void) { u32 key; struct device_node *np; - struct property *prop; - const __be32 *p; np = of_find_node_by_path("/chosen/linux,sysrq-reset-seq"); if (!np) { @@ -782,7 +780,7 @@ static void sysrq_of_get_keyreset_config(void) /* Reset in case a __weak definition was present */ sysrq_reset_seq_len = 0; - of_property_for_each_u32(np, "keyset", prop, p, key) { + of_property_for_each_u32(np, "keyset", key) { if (key == KEY_RESERVED || key > KEY_MAX || sysrq_reset_seq_len == SYSRQ_KEY_RESET_MAX) break; diff --git a/drivers/usb/misc/usb251xb.c b/drivers/usb/misc/usb251xb.c index b98cda1cef73..e24cdb667307 100644 --- a/drivers/usb/misc/usb251xb.c +++ b/drivers/usb/misc/usb251xb.c @@ -382,11 +382,9 @@ static void usb251xb_get_ports_field(struct usb251xb *hub, bool ds_only, u8 *fld) { struct device *dev = hub->dev; - struct property *prop; - const __be32 *p; u32 port; - of_property_for_each_u32(dev->of_node, prop_name, prop, p, port) { + of_property_for_each_u32(dev->of_node, prop_name, port) { if ((port >= ds_only ? 
1 : 0) && (port <= port_cnt)) *fld |= BIT(port); else diff --git a/include/linux/of.h b/include/linux/of.h index 13cf7a43b473..85b60ac9eec5 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -430,11 +430,9 @@ extern int of_detach_node(struct device_node *); #define of_match_ptr(_ptr) (_ptr) /* - * struct property *prop; - * const __be32 *p; * u32 u; * - * of_property_for_each_u32(np, "propname", prop, p, u) + * of_property_for_each_u32(np, "propname", u) * printk("U32 value: %x\n", u); */ const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur, @@ -1431,11 +1429,12 @@ static inline int of_property_read_s32(const struct device_node *np, err == 0; \ err = of_phandle_iterator_next(it)) -#define of_property_for_each_u32(np, propname, prop, p, u) \ - for (prop = of_find_property(np, propname, NULL), \ - p = of_prop_next_u32(prop, NULL, &u); \ - p; \ - p = of_prop_next_u32(prop, p, &u)) +#define of_property_for_each_u32(np, propname, u) \ + for (struct {struct property *prop; const __be32 *item; } _it = \ + {of_find_property(np, propname, NULL), \ + of_prop_next_u32(_it.prop, NULL, &u)}; \ + _it.item; \ + _it.item = of_prop_next_u32(_it.prop, _it.item, &u)) #define of_property_for_each_string(np, propname, prop, s) \ for (prop = of_find_property(np, propname, NULL), \ diff --git a/sound/soc/codecs/arizona.c b/sound/soc/codecs/arizona.c index 7434aeeda292..402b9a2ff024 100644 --- a/sound/soc/codecs/arizona.c +++ b/sound/soc/codecs/arizona.c @@ -2786,15 +2786,13 @@ int arizona_of_get_audio_pdata(struct arizona *arizona) { struct arizona_pdata *pdata = &arizona->pdata; struct device_node *np = arizona->dev->of_node; - struct property *prop; - const __be32 *cur; u32 val; u32 pdm_val[ARIZONA_MAX_PDM_SPK]; int ret; int count = 0; count = 0; - of_property_for_each_u32(np, "wlf,inmode", prop, cur, val) { + of_property_for_each_u32(np, "wlf,inmode", val) { if (count == ARRAY_SIZE(pdata->inmode)) break; @@ -2803,7 +2801,7 @@ int arizona_of_get_audio_pdata(struct arizona *arizona) } count = 0; - of_property_for_each_u32(np, "wlf,dmic-ref", prop, cur, val) { + of_property_for_each_u32(np, "wlf,dmic-ref", val) { if (count == ARRAY_SIZE(pdata->dmic_ref)) break; @@ -2812,7 +2810,7 @@ int arizona_of_get_audio_pdata(struct arizona *arizona) } count = 0; - of_property_for_each_u32(np, "wlf,out-mono", prop, cur, val) { + of_property_for_each_u32(np, "wlf,out-mono", val) { if (count == ARRAY_SIZE(pdata->out_mono)) break; @@ -2821,7 +2819,7 @@ int arizona_of_get_audio_pdata(struct arizona *arizona) } count = 0; - of_property_for_each_u32(np, "wlf,max-channels-clocked", prop, cur, val) { + of_property_for_each_u32(np, "wlf,max-channels-clocked", val) { if (count == ARRAY_SIZE(pdata->max_channels_clocked)) break; @@ -2830,7 +2828,7 @@ int arizona_of_get_audio_pdata(struct arizona *arizona) } count = 0; - of_property_for_each_u32(np, "wlf,out-volume-limit", prop, cur, val) { + of_property_for_each_u32(np, "wlf,out-volume-limit", val) { if (count == ARRAY_SIZE(pdata->out_vol_limit)) break; -- cgit From 726d4f528dbc98a84d9ce3c749dfdada3dcdd5ca Mon Sep 17 00:00:00 2001 From: Ashish Mhetre Date: Wed, 24 Jul 2024 17:31:32 +0000 Subject: iommu: arm-smmu: Fix Tegra workaround for PAGE_SIZE mappings PAGE_SIZE can be 16KB for Tegra which is not supported by MMU-500 on both Tegra194 and Tegra234. Retain only valid granularities from pgsize_bitmap which would either be 4KB or 64KB. 
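To illustrate the masking done in the hunk below (the bitmap value is assumed for a typical MMU-500 setup, not quoted from this patch):

    #include <stdio.h>

    /* simplified stand-in for the kernel's GENMASK() */
    #define GENMASK(h, l) \
            (((~0UL) << (l)) & (~0UL >> (sizeof(unsigned long) * 8 - 1 - (h))))

    int main(void)
    {
            /* assumed SMMU bitmap: 4K, 64K, 2M and 1G granules/blocks */
            unsigned long pgsize_bitmap = 0x1000UL | 0x10000UL |
                                          0x200000UL | 0x40000000UL;
            unsigned int page_shift = 14;   /* 16K pages on Tegra */

            /* keep only granules no larger than PAGE_SIZE */
            printf("0x%lx\n", pgsize_bitmap & GENMASK(page_shift, 0));
            /* prints 0x1000: only the 4K granule survives, so the SMMU
             * falls back to 4K mappings instead of the unsupported 16K */
            return 0;
    }

With 64K pages (page_shift = 16) both the 4K and 64K bits survive, matching the granularities named above.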
Signed-off-by: Ashish Mhetre Link: https://lore.kernel.org/r/20240724173132.219978-1-amhetre@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c index 4b2994b6126d..2fce4f6d4e1b 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c @@ -277,7 +277,7 @@ static int nvidia_smmu_init_context(struct arm_smmu_domain *smmu_domain, */ if (of_device_is_compatible(np, "nvidia,tegra234-smmu") || of_device_is_compatible(np, "nvidia,tegra194-smmu")) { - smmu->pgsize_bitmap = PAGE_SIZE; + smmu->pgsize_bitmap &= GENMASK(PAGE_SHIFT, 0); pgtbl_cfg->pgsize_bitmap = smmu->pgsize_bitmap; } -- cgit From 36639013b3462c06ff8e3400a427f775b4fc97f5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 25 Jul 2024 10:03:45 +0100 Subject: arm64: mm: Fix lockless walks with static and dynamic page-table folding Lina reports random oopsen originating from the fast GUP code when 16K pages are used with 4-level page-tables, the fourth level being folded at runtime due to lack of LPA2. In this configuration, the generic implementation of p4d_offset_lockless() will return a 'p4d_t *' corresponding to the 'pgd_t' allocated on the stack of the caller, gup_fast_pgd_range(). This is normally fine, but when the fourth level of page-table is folded at runtime, pud_offset_lockless() will offset from the address of the 'p4d_t' to calculate the address of the PUD in the same page-table page. This results in a stray stack read when the 'p4d_t' has been allocated on the stack and can send the walker into the weeds. Fix the problem by providing our own definition of p4d_offset_lockless() when CONFIG_PGTABLE_LEVELS <= 4 which returns the real page-table pointer rather than the address of the local stack variable. Cc: Catalin Marinas Cc: Ard Biesheuvel Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/50360968-13fb-4e6f-8f52-1725b3177215@asahilina.net Fixes: 0dd4f60a2c76 ("arm64: mm: Add support for folding PUDs at runtime") Reported-by: Asahi Lina Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240725090345.28461-1-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f8efbc128446..7a4f5604be3f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1065,6 +1065,28 @@ static inline bool pgtable_l5_enabled(void) { return false; } #define p4d_offset_kimg(dir,addr) ((p4d_t *)dir) +static inline +p4d_t *p4d_offset_lockless_folded(pgd_t *pgdp, pgd_t pgd, unsigned long addr) +{ + /* + * With runtime folding of the pud, pud_offset_lockless() passes + * the 'pgd_t *' we return here to p4d_to_folded_pud(), which + * will offset the pointer assuming that it points into + * a page-table page. However, the fast GUP path passes us a + * pgd_t allocated on the stack and so we must use the original + * pointer in 'pgdp' to construct the p4d pointer instead of + * using the generic p4d_offset_lockless() implementation. + * + * Note: reusing the original pointer means that we may + * dereference the same (live) page-table entry multiple times. 
+ * This is safe because it is still only loaded once in the + * context of each level and the CPU guarantees same-address + * read-after-read ordering. + */ + return p4d_offset(pgdp, addr); +} +#define p4d_offset_lockless p4d_offset_lockless_folded + #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #define pgd_ERROR(e) \ -- cgit From ab53dfdcdd1ec8df8729890aefa5b0e3c900afbb Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Thu, 25 Jul 2024 11:22:53 +0800 Subject: ASoC: fsl-asoc-card: Dynamically allocate memory for snd_soc_dai_link_components The static snd_soc_dai_link_components cause conflict for multiple instances of this generic driver. For example, when there is wm8962 and SPDIF case enabled together, the contaminated snd_soc_dai_link_components will cause another device probe fail. Fixes: 6d174cc4f224 ("ASoC: fsl-asoc-card: merge spdif support from imx-spdif.c") Signed-off-by: Shengjiu Wang Link: https://patch.msgid.link/1721877773-5229-1-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl-asoc-card.c | 46 +++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/sound/soc/fsl/fsl-asoc-card.c b/sound/soc/fsl/fsl-asoc-card.c index 82df887b3af5..f6c3aeff0d8e 100644 --- a/sound/soc/fsl/fsl-asoc-card.c +++ b/sound/soc/fsl/fsl-asoc-card.c @@ -306,27 +306,12 @@ static int be_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, return 0; } -SND_SOC_DAILINK_DEFS(hifi, - DAILINK_COMP_ARRAY(COMP_EMPTY()), - DAILINK_COMP_ARRAY(COMP_EMPTY(), COMP_EMPTY()), - DAILINK_COMP_ARRAY(COMP_EMPTY())); - -SND_SOC_DAILINK_DEFS(hifi_fe, - DAILINK_COMP_ARRAY(COMP_EMPTY()), - DAILINK_COMP_ARRAY(COMP_DUMMY()), - DAILINK_COMP_ARRAY(COMP_EMPTY())); - -SND_SOC_DAILINK_DEFS(hifi_be, - DAILINK_COMP_ARRAY(COMP_EMPTY()), - DAILINK_COMP_ARRAY(COMP_EMPTY(), COMP_EMPTY())); - static const struct snd_soc_dai_link fsl_asoc_card_dai[] = { /* Default ASoC DAI Link*/ { .name = "HiFi", .stream_name = "HiFi", .ops = &fsl_asoc_card_ops, - SND_SOC_DAILINK_REG(hifi), }, /* DPCM Link between Front-End and Back-End (Optional) */ { @@ -335,7 +320,6 @@ static const struct snd_soc_dai_link fsl_asoc_card_dai[] = { .dpcm_playback = 1, .dpcm_capture = 1, .dynamic = 1, - SND_SOC_DAILINK_REG(hifi_fe), }, { .name = "HiFi-ASRC-BE", @@ -345,7 +329,6 @@ static const struct snd_soc_dai_link fsl_asoc_card_dai[] = { .dpcm_playback = 1, .dpcm_capture = 1, .no_pcm = 1, - SND_SOC_DAILINK_REG(hifi_be), }, }; @@ -637,6 +620,7 @@ static int fsl_asoc_card_probe(struct platform_device *pdev) struct platform_device *cpu_pdev; struct fsl_asoc_card_priv *priv; struct device *codec_dev[2] = { NULL, NULL }; + struct snd_soc_dai_link_component *dlc; const char *codec_dai_name[2]; const char *codec_dev_name[2]; u32 asrc_fmt = 0; @@ -717,7 +701,35 @@ static int fsl_asoc_card_probe(struct platform_device *pdev) memcpy(priv->dai_link, fsl_asoc_card_dai, sizeof(struct snd_soc_dai_link) * ARRAY_SIZE(priv->dai_link)); + /* + * "Default ASoC DAI Link": 1 cpus, 2 codecs, 1 platforms + * "DPCM Link Front-End": 1 cpus, 1 codecs (dummy), 1 platforms + * "DPCM Link Back-End": 1 cpus, 2 codecs + * totally 10 components + */ + dlc = devm_kcalloc(&pdev->dev, 10, sizeof(*dlc), GFP_KERNEL); + if (!dlc) { + ret = -ENOMEM; + goto asrc_fail; + } + + priv->dai_link[0].cpus = &dlc[0]; + priv->dai_link[0].num_cpus = 1; + priv->dai_link[0].codecs = &dlc[1]; priv->dai_link[0].num_codecs = 1; + priv->dai_link[0].platforms = &dlc[3]; + priv->dai_link[0].num_platforms = 1; + + priv->dai_link[1].cpus = &dlc[4]; + 
priv->dai_link[1].num_cpus = 1; + priv->dai_link[1].codecs = &dlc[5]; + priv->dai_link[1].num_codecs = 0; /* dummy */ + priv->dai_link[1].platforms = &dlc[6]; + priv->dai_link[1].num_platforms = 1; + + priv->dai_link[2].cpus = &dlc[7]; + priv->dai_link[2].num_cpus = 1; + priv->dai_link[2].codecs = &dlc[8]; priv->dai_link[2].num_codecs = 1; priv->card.dapm_routes = audio_map; -- cgit From c31fad1470389666ac7169fe43aa65bf5b7e2cfd Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 24 Jul 2024 13:31:14 +0300 Subject: nvme-pci: add missing condition check for existence of mapped data nvme_map_data() is called when request has physical segments, hence the nvme_unmap_data() should have same condition to avoid dereference. Fixes: 4aedb705437f ("nvme-pci: split metadata handling from nvme_map_data / nvme_unmap_data") Signed-off-by: Leon Romanovsky Reviewed-by: Christoph Hellwig Reviewed-by: Nitesh Shetty Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8087b054da66..6cd9395ba9ec 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -863,7 +863,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) nvme_start_request(req); return BLK_STS_OK; out_unmap_data: - nvme_unmap_data(dev, req); + if (blk_rq_nr_phys_segments(req)) + nvme_unmap_data(dev, req); out_free_cmd: nvme_cleanup_cmd(req); return ret; -- cgit From 0db4618e8fabfcc404af4dda23799bba726785a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Jul 2024 08:41:35 -0600 Subject: io_uring/msg_ring: fix uninitialized use of target_req->flags syzbot reports that KMSAN complains that 'nr_tw' is an uninit-value with the following report: BUG: KMSAN: uninit-value in io_req_local_work_add io_uring/io_uring.c:1192 [inline] BUG: KMSAN: uninit-value in io_req_task_work_add_remote+0x588/0x5d0 io_uring/io_uring.c:1240 io_req_local_work_add io_uring/io_uring.c:1192 [inline] io_req_task_work_add_remote+0x588/0x5d0 io_uring/io_uring.c:1240 io_msg_remote_post io_uring/msg_ring.c:102 [inline] io_msg_data_remote io_uring/msg_ring.c:133 [inline] io_msg_ring_data io_uring/msg_ring.c:152 [inline] io_msg_ring+0x1c38/0x1ef0 io_uring/msg_ring.c:305 io_issue_sqe+0x383/0x22c0 io_uring/io_uring.c:1710 io_queue_sqe io_uring/io_uring.c:1924 [inline] io_submit_sqe io_uring/io_uring.c:2180 [inline] io_submit_sqes+0x1259/0x2f20 io_uring/io_uring.c:2295 __do_sys_io_uring_enter io_uring/io_uring.c:3205 [inline] __se_sys_io_uring_enter+0x40c/0x3ca0 io_uring/io_uring.c:3142 __x64_sys_io_uring_enter+0x11f/0x1a0 io_uring/io_uring.c:3142 x64_sys_call+0x2d82/0x3c10 arch/x86/include/generated/asm/syscalls_64.h:427 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f which is the following check: if (nr_tw < nr_wait) return; in io_req_local_work_add(). While nr_tw itself cannot be uninitialized, it does depend on req->flags, which off the msg ring issue path can indeed be uninitialized. Fix this by always clearing the allocated 'req' fully if we can't grab one from the cache itself. 
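A tiny userspace sketch of the same pattern (toy names, not io_uring code): objects recycled from a cache were fully initialized by their previous user, while a fresh allocation must be zeroed before fields such as 'flags' are read:
```
#include <stdio.h>
#include <stdlib.h>

struct toy_req {
	unsigned int flags;	/* the field that was read uninitialized */
};

static struct toy_req *toy_cache;	/* stand-in for ctx->msg_cache */

static struct toy_req *toy_get_req(void)
{
	struct toy_req *req = toy_cache;

	if (req) {		/* cache hit: initialized by its last user */
		toy_cache = NULL;
		return req;
	}
	/* cache miss: calloc() stands in for adding __GFP_ZERO */
	return calloc(1, sizeof(*req));
}

int main(void)
{
	struct toy_req *req = toy_get_req();

	if (req)
		printf("flags = %u\n", req->flags);	/* defined even on a miss */
	free(req);
	return 0;
}
```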
Fixes: 50cf5f3842af ("io_uring/msg_ring: add an alloc cache for io_kiocb entries") Reported-by: syzbot+82609b8937a4458106ca@syzkaller.appspotmail.com Link: https://lore.kernel.org/io-uring/000000000000fd3d8d061dfc0e4a@google.com/ Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 29fa9285a33d..7fd9badcfaf8 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -110,10 +110,10 @@ static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) if (spin_trylock(&ctx->msg_lock)) { req = io_alloc_cache_get(&ctx->msg_cache); spin_unlock(&ctx->msg_lock); + if (req) + return req; } - if (req) - return req; - return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN); + return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); } static int io_msg_data_remote(struct io_kiocb *req) -- cgit From f333a3c7e8323499aa65038e77fe8f3199d4e283 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 15 Jul 2024 16:07:07 +0930 Subject: btrfs: tree-checker: validate dref root and objectid [CORRUPTION] There is a bug report that btrfs flips RO due to a corruption in the extent tree, the involved dumps looks like this: item 188 key (402811572224 168 4096) itemoff 14598 itemsize 79 extent refs 3 gen 3678544 flags 1 ref#0: extent data backref root 13835058055282163977 objectid 281473384125923 offset 81432576 count 1 ref#1: shared data backref parent 1947073626112 count 1 ref#2: shared data backref parent 1156030103552 count 1 BTRFS critical (device vdc1: state EA): unable to find ref byte nr 402811572224 parent 0 root 265 owner 28703026 offset 81432576 slot 189 BTRFS error (device vdc1: state EA): failed to run delayed ref for logical 402811572224 num_bytes 4096 type 178 action 2 ref_mod 1: -2 [CAUSE] The corrupted entry is ref#0 of item 188. The root number 13835058055282163977 is beyond the upper limit for root items (the current limit is 1 << 48), and the objectid also looks suspicious. Only the offset and count is correct. [ENHANCEMENT] Although it's still unknown why we have such many bytes corrupted randomly, we can still enhance the tree-checker for data backrefs by: - Validate the root value For now there should only be 3 types of roots can have data backref: * subvolume trees * data reloc trees * root tree Only for v1 space cache - validate the objectid value The objectid should be a valid inode number. Hopefully we can catch such problem in the future with the new checkers. 
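A quick arithmetic check (standalone, not btrfs code) of why the root value in the dump above is rejected against the 1 << 48 limit mentioned in the analysis:
```
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t bad_root = 13835058055282163977ULL;	/* ref#0 from the dump */
	const uint64_t root_limit = 1ULL << 48;			/* limit quoted above */

	printf("root id = %#llx\n", (unsigned long long)bad_root);
	printf("within limit: %s\n", bad_root < root_limit ? "yes" : "no");	/* no */
	return 0;
}
```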
Reported-by: Kai Krakow Link: https://lore.kernel.org/linux-btrfs/CAMthOuPjg5RDT-G_LXeBBUUtzt3cq=JywF+D1_h+JYxe=WKp-Q@mail.gmail.com/#t Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 6388786fd8b5..18bb0c930db4 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1289,6 +1289,19 @@ static void extent_err(const struct extent_buffer *eb, int slot, va_end(args); } +static bool is_valid_dref_root(u64 rootid) +{ + /* + * The following tree root objectids are allowed to have a data backref: + * - subvolume trees + * - data reloc tree + * - tree root + * For v1 space cache + */ + return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID || + rootid == BTRFS_ROOT_TREE_OBJECTID; +} + static int check_extent_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot, struct btrfs_key *prev_key) @@ -1441,6 +1454,8 @@ static int check_extent_item(struct extent_buffer *leaf, struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u64 seq; + u64 dref_root; + u64 dref_objectid; u64 dref_offset; u64 inline_offset; u8 inline_type; @@ -1484,11 +1499,26 @@ static int check_extent_item(struct extent_buffer *leaf, */ case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); + dref_root = btrfs_extent_data_ref_root(leaf, dref); + dref_objectid = btrfs_extent_data_ref_objectid(leaf, dref); dref_offset = btrfs_extent_data_ref_offset(leaf, dref); seq = hash_extent_data_ref( btrfs_extent_data_ref_root(leaf, dref), btrfs_extent_data_ref_objectid(leaf, dref), btrfs_extent_data_ref_offset(leaf, dref)); + if (unlikely(!is_valid_dref_root(dref_root))) { + extent_err(leaf, slot, + "invalid data ref root value %llu", + dref_root); + return -EUCLEAN; + } + if (unlikely(dref_objectid < BTRFS_FIRST_FREE_OBJECTID || + dref_objectid > BTRFS_LAST_FREE_OBJECTID)) { + extent_err(leaf, slot, + "invalid data ref objectid value %llu", + dref_root); + return -EUCLEAN; + } if (unlikely(!IS_ALIGNED(dref_offset, fs_info->sectorsize))) { extent_err(leaf, slot, @@ -1627,6 +1657,8 @@ static int check_extent_data_ref(struct extent_buffer *leaf, return -EUCLEAN; } for (; ptr < end; ptr += sizeof(*dref)) { + u64 root; + u64 objectid; u64 offset; /* @@ -1634,7 +1666,22 @@ static int check_extent_data_ref(struct extent_buffer *leaf, * overflow from the leaf due to hash collisions. 
*/ dref = (struct btrfs_extent_data_ref *)ptr; + root = btrfs_extent_data_ref_root(leaf, dref); + objectid = btrfs_extent_data_ref_objectid(leaf, dref); offset = btrfs_extent_data_ref_offset(leaf, dref); + if (unlikely(!is_valid_dref_root(root))) { + extent_err(leaf, slot, + "invalid extent data backref root value %llu", + root); + return -EUCLEAN; + } + if (unlikely(objectid < BTRFS_FIRST_FREE_OBJECTID || + objectid > BTRFS_LAST_FREE_OBJECTID)) { + extent_err(leaf, slot, + "invalid extent data backref objectid value %llu", + root); + return -EUCLEAN; + } if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) { extent_err(leaf, slot, "invalid extent data backref offset, have %llu expect aligned to %u", -- cgit From de9f46cb0044a9b9f825d7695ae235863461dc00 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 11 Jul 2024 13:33:23 +0100 Subject: btrfs: fix corrupt read due to bad offset of a compressed extent map If we attempt to insert a compressed extent map that has a range that overlaps another extent map we have in the inode's extent map tree, we can end up with an incorrect offset after adjusting the new extent map at merge_extent_mapping() because we don't update the extent map's offset. For example consider the following scenario: 1) We have a file extent item for a compressed extent covering the file range [108K, 144K) and currently there's no corresponding extent map in the inode's extent map tree; 2) The inode's size is 141K; 3) We have an encoded write (compressed) into the file range [120K, 128K), which overlaps the existing file extent item. The encoded write creates a matching extent map, adds it to the inode's extent map tree and creates an ordered extent for it. Note that the corresponding file extent item is added to the subvolume tree only when the ordered extent completes (when executing btrfs_finish_one_ordered()); 4) We have a write into the file range [160K, 164K). This writes increases the i_size of the file, and there's a hole between the current i_size (141K) and the start offset of this write, and since the old i_size is in the middle of the block [140K, 144K), we have to write zeroes to the range [141K, 144K) (3072 bytes) and therefore dirty that page. We then call btrfs_set_extent_delalloc() with a start offset of 140K. We then end up at btrfs_find_new_delalloc_bytes() which will call btrfs_get_extent() for the range [140K, 144K); 5) The btrfs_get_extent() doesn't find any extent map in the inode's extent map tree covering the range [140K, 144K), so it searches the subvolume tree for any file extent items covering that range. There it finds the file extent item for the range [108K, 144K), creates a compressed extent map for that range and then calls btrfs_add_extent_mapping() with that extent map and passes the range [140K, 144K) via the "start" and "len" parameters; 6) The call to add_extent_mapping() done by btrfs_add_extent_mapping() fails with -EEXIST because there's an extent map, created at step 2 for the [120K, 128K) range, that covers that overlaps with the range of the given extent map ([108K, 144K)). Then it does a lookup for extent map from step 2 add calls merge_extent_mapping() to adjust the input extent map ([108K, 144K)). That adjust the extent map to a start offset of 128K and a length of 16K (starting just after the extent map from step 2), but it does not update the offset field of the extent map, leaving it with a value of zero instead of updating to a value of 20K (128K - 108K = 20K). 
As a result any read for the range [128K, 144K) can return incorrect data since we read from a wrong section of the extent (unless both the correct and incorrect ranges happen to have the same data). So fix this by changing merge_extent_mapping() to update the extent map's offset even if it's compressed. Also add a test case to the self tests. This didn't happen before the patchset that does big changes in the extent map structure (which includes the commit in the Fixes tag below) because we kept track of the original start offset in the extent map (member "orig_start") so we could always calculate the correct offset by subtracting that offset from the start offset. A test case for fstests that triggered this problem using send/receive with compressed writes will be added soon. Fixes: 3d2ac9922465 ("btrfs: introduce new members for extent_map") Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 2 +- fs/btrfs/tests/extent-map-tests.c | 99 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6961cc73fe3f..b180d631545b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -664,7 +664,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, start_diff = start - em->start; em->start = start; em->len = end - start; - if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE && !extent_map_is_compressed(em)) + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) em->offset += start_diff; return add_extent_mapping(inode, em, 0); } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index ebec4ab361b8..56e61ac1cc64 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -900,6 +900,102 @@ out: return ret; } +/* + * Test a regression for compressed extent map adjustment when we attempt to + * add an extent map that is partially overlapped by another existing extent + * map. The resulting extent map offset was left unchanged despite having + * incremented its start offset. + */ +static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) +{ + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + int ret; + int ret2; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + /* Compressed extent for the file range [120K, 128K). */ + em->start = SZ_1K * 120; + em->len = SZ_8K; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_8K; + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("couldn't add extent map for range [120K, 128K)"); + goto out; + } + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* + * Compressed extent for the file range [108K, 144K), which overlaps + * with the [120K, 128K) we previously inserted. + */ + em->start = SZ_1K * 108; + em->len = SZ_1K * 36; + em->disk_num_bytes = SZ_4K; + em->ram_bytes = SZ_1K * 36; + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + + /* + * Try to add the extent map but with a search range of [140K, 144K), + * this should succeed and adjust the extent map to the range + * [128K, 144K), with a length of 16K and an offset of 20K. 
+ * + * This simulates a scenario where in the subvolume tree of an inode we + * have a compressed file extent item for the range [108K, 144K) and we + * have an overlapping compressed extent map for the range [120K, 128K), + * which was created by an encoded write, but its ordered extent was not + * yet completed, so the subvolume tree doesn't have yet the file extent + * item for that range - we only have the extent map in the inode's + * extent map tree. + */ + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("couldn't add extent map for range [108K, 144K)"); + goto out; + } + + if (em->start != SZ_128K) { + test_err("unexpected extent map start %llu (should be 128K)", em->start); + ret = -EINVAL; + goto out; + } + if (em->len != SZ_16K) { + test_err("unexpected extent map length %llu (should be 16K)", em->len); + ret = -EINVAL; + goto out; + } + if (em->offset != SZ_1K * 20) { + test_err("unexpected extent map offset %llu (should be 20K)", em->offset); + ret = -EINVAL; + goto out; + } +out: + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + + return ret; +} + struct rmap_test_vector { u64 raid_type; u64 physical_start; @@ -1076,6 +1172,9 @@ int btrfs_test_extent_map(void) if (ret) goto out; ret = test_case_7(fs_info, BTRFS_I(inode)); + if (ret) + goto out; + ret = test_case_8(fs_info, BTRFS_I(inode)); if (ret) goto out; -- cgit From 98ba1d931f611e8f8f519c0405fa0a1a76554bfa Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Wed, 24 Jul 2024 15:21:06 -0700 Subject: bnxt_en: Fix RSS logic in __bnxt_reserve_rings() In __bnxt_reserve_rings(), the existing code unconditionally sets the default RSS indirection table to default if netif_is_rxfh_configured() returns false. This used to be correct before we added RSS contexts support. For example, if the user is changing the number of ethtool channels, we will enter this path to reserve the new number of rings. We will then set the RSS indirection table to default to cover the new number of rings if netif_is_rxfh_configured() is false. Now, with RSS contexts support, if the user has added or deleted RSS contexts, we may now enter this path to reserve the new number of VNICs. However, netif_is_rxfh_configured() will not return the correct state if we are still in the middle of set_rxfh(). So the existing code may set the indirection table of the default RSS context to default by mistake. Fix it to check if the reservation of the RX rings is changing. Only check netif_is_rxfh_configured() if it is changing. RX rings will not change in the middle of set_rxfh() and this will fix the issue. 
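A minimal userspace model of the new guard (made-up helper names, not driver code): the default indirection table is only rewritten when the reserved RX ring count actually changed and user space has not installed its own RSS configuration:
```
#include <stdbool.h>
#include <stdio.h>

static bool should_reset_dflt_indir(int old_rx_rings, int new_rx_rings,
				    bool rxfh_user_configured)
{
	return old_rx_rings != new_rx_rings && !rxfh_user_configured;
}

int main(void)
{
	/* mid-set_rxfh(): ring count unchanged, table left alone */
	printf("%d\n", should_reset_dflt_indir(8, 8, false));	/* 0 */
	/* ethtool -L changed the ring count with default RSS: reset */
	printf("%d\n", should_reset_dflt_indir(8, 4, false));	/* 1 */
	return 0;
}
```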
Fixes: b3d0083caf9a ("bnxt_en: Support RSS contexts in ethtool .{get|set}_rxfh()") Reported-and-tested-by: Jakub Kicinski Link: https://lore.kernel.org/20240625010210.2002310-1-kuba@kernel.org Reviewed-by: Andy Gospodarek Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://patch.msgid.link/20240724222106.147744-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index ffa74c26ee53..23f74c6c88b9 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7649,8 +7649,8 @@ static int bnxt_get_avail_msix(struct bnxt *bp, int num); static int __bnxt_reserve_rings(struct bnxt *bp) { struct bnxt_hw_rings hwr = {0}; + int rx_rings, old_rx_rings, rc; int cp = bp->cp_nr_rings; - int rx_rings, rc; int ulp_msix = 0; bool sh = false; int tx_cp; @@ -7684,6 +7684,7 @@ static int __bnxt_reserve_rings(struct bnxt *bp) hwr.grp = bp->rx_nr_rings; hwr.rss_ctx = bnxt_get_total_rss_ctxs(bp, &hwr); hwr.stat = bnxt_get_func_stat_ctxs(bp); + old_rx_rings = bp->hw_resc.resv_rx_rings; rc = bnxt_hwrm_reserve_rings(bp, &hwr); if (rc) @@ -7738,7 +7739,8 @@ static int __bnxt_reserve_rings(struct bnxt *bp) if (!bnxt_rings_ok(bp, &hwr)) return -ENOMEM; - if (!netif_is_rxfh_configured(bp->dev)) + if (old_rx_rings != bp->hw_resc.resv_rx_rings && + !netif_is_rxfh_configured(bp->dev)) bnxt_set_dflt_rss_indir_tbl(bp, NULL); if (!bnxt_ulp_registered(bp->edev) && BNXT_NEW_RM(bp)) { -- cgit From a40c7a24f97edda025f53cfe8f0bc6a6e3c12fa6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Jul 2024 16:42:48 -0700 Subject: netlink: specs: correct the spec of ethtool The spec for Ethtool is a bit inaccurate. We don't currently support dump. Context is only accepted as input and not echoed to output (which is a separate bug). Fixes: a353318ebf24 ("tools: ynl: populate most of the ethtool spec") Acked-by: Paolo Abeni Reviewed-by: Joe Damato Link: https://patch.msgid.link/20240724234249.2621109-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 495e35fcfb21..ebbd8dd96b5c 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1753,15 +1753,14 @@ operations: request: attributes: - header + - context reply: attributes: - header - - context - hfunc - indir - hkey - input_xfrm - dump: *rss-get-op - name: plca-get-cfg doc: Get PLCA params. -- cgit From f96aae91b0d260f682e630e092ef70a05a718a43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Jul 2024 16:42:49 -0700 Subject: ethtool: rss: echo the context number back The response to a GET request in Netlink should fully identify the queried object. RSS_GET accepts context id as an input, so it must echo that attribute back to the response. After (assuming context 1 has been created): $ ./cli.py --spec netlink/specs/ethtool.yaml \ --do rss-get \ --json '{"header": {"dev-index": 2}, "context": 1}' {'context': 1, 'header': {'dev-index': 2, 'dev-name': 'eth0'}, [...] 
Fixes: 7112a04664bf ("ethtool: add netlink based get rss support") Acked-by: Paolo Abeni Reviewed-by: Joe Damato Link: https://patch.msgid.link/20240724234249.2621109-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 1 + Documentation/networking/ethtool-netlink.rst | 1 + net/ethtool/rss.c | 8 +++++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index ebbd8dd96b5c..ea21fe135b97 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1757,6 +1757,7 @@ operations: reply: attributes: - header + - context - hfunc - indir - hkey diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 3ab423b80e91..d5f246aceb9f 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1875,6 +1875,7 @@ Kernel response contents: ===================================== ====== ========================== ``ETHTOOL_A_RSS_HEADER`` nested reply header + ``ETHTOOL_A_RSS_CONTEXT`` u32 context number ``ETHTOOL_A_RSS_HFUNC`` u32 RSS hash func ``ETHTOOL_A_RSS_INDIR`` binary Indir table bytes ``ETHTOOL_A_RSS_HKEY`` binary Hash key bytes diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 71679137eff2..5c4c4505ab9a 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -111,7 +111,8 @@ rss_reply_size(const struct ethnl_req_info *req_base, const struct rss_reply_data *data = RSS_REPDATA(reply_base); int len; - len = nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ + len = nla_total_size(sizeof(u32)) + /* _RSS_CONTEXT */ + nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ nla_total_size(sizeof(u32)) + /* _RSS_INPUT_XFRM */ nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */ nla_total_size(data->hkey_size); /* _RSS_HKEY */ @@ -124,6 +125,11 @@ rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rss_reply_data *data = RSS_REPDATA(reply_base); + struct rss_req_info *request = RSS_REQINFO(req_base); + + if (request->rss_context && + nla_put_u32(skb, ETHTOOL_A_RSS_CONTEXT, request->rss_context)) + return -EMSGSIZE; if ((data->hfunc && nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) || -- cgit From 08f3a5c38087d1569e982a121aad1e6acbf145ce Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Thu, 25 Jul 2024 10:29:42 +0800 Subject: net: usb: sr9700: fix uninitialized variable use in sr_mdio_read It could lead to error happen because the variable res is not updated if the call to sr_share_read_word returns an error. In this particular case error code was returned and res stayed uninitialized. Same issue also applies to sr_read_reg. This can be avoided by checking the return value of sr_share_read_word and sr_read_reg, and propagating the error if the read operation failed. Found by code review. Cc: stable@vger.kernel.org Fixes: c9b37458e956 ("USB2NET : SR9700 : One chip USB 1.1 USB2NET SR9700Device Driver Support") Signed-off-by: Ma Ke Reviewed-by: Shigeru Yoshida Reviewed-by: Hariprasad Kelam Signed-off-by: David S. 
Miller --- drivers/net/usb/sr9700.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c index 0a662e42ed96..cb7d2f798fb4 100644 --- a/drivers/net/usb/sr9700.c +++ b/drivers/net/usb/sr9700.c @@ -179,6 +179,7 @@ static int sr_mdio_read(struct net_device *netdev, int phy_id, int loc) struct usbnet *dev = netdev_priv(netdev); __le16 res; int rc = 0; + int err; if (phy_id) { netdev_dbg(netdev, "Only internal phy supported\n"); @@ -189,11 +190,17 @@ static int sr_mdio_read(struct net_device *netdev, int phy_id, int loc) if (loc == MII_BMSR) { u8 value; - sr_read_reg(dev, SR_NSR, &value); + err = sr_read_reg(dev, SR_NSR, &value); + if (err < 0) + return err; + if (value & NSR_LINKST) rc = 1; } - sr_share_read_word(dev, 1, loc, &res); + err = sr_share_read_word(dev, 1, loc, &res); + if (err < 0) + return err; + if (rc == 1) res = le16_to_cpu(res) | BMSR_LSTATUS; else -- cgit From 2191a54f63225b548fd8346be3611c3219a24738 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Jul 2024 09:27:45 +0000 Subject: sched: act_ct: take care of padding in struct zones_ht_key Blamed commit increased lookup key size from 2 bytes to 16 bytes, because zones_ht_key got a struct net pointer. Make sure rhashtable_lookup() is not using the padding bytes which are not initialized. BUG: KMSAN: uninit-value in rht_ptr_rcu include/linux/rhashtable.h:376 [inline] BUG: KMSAN: uninit-value in __rhashtable_lookup include/linux/rhashtable.h:607 [inline] BUG: KMSAN: uninit-value in rhashtable_lookup include/linux/rhashtable.h:646 [inline] BUG: KMSAN: uninit-value in rhashtable_lookup_fast include/linux/rhashtable.h:672 [inline] BUG: KMSAN: uninit-value in tcf_ct_flow_table_get+0x611/0x2260 net/sched/act_ct.c:329 rht_ptr_rcu include/linux/rhashtable.h:376 [inline] __rhashtable_lookup include/linux/rhashtable.h:607 [inline] rhashtable_lookup include/linux/rhashtable.h:646 [inline] rhashtable_lookup_fast include/linux/rhashtable.h:672 [inline] tcf_ct_flow_table_get+0x611/0x2260 net/sched/act_ct.c:329 tcf_ct_init+0xa67/0x2890 net/sched/act_ct.c:1408 tcf_action_init_1+0x6cc/0xb30 net/sched/act_api.c:1425 tcf_action_init+0x458/0xf00 net/sched/act_api.c:1488 tcf_action_add net/sched/act_api.c:2061 [inline] tc_ctl_action+0x4be/0x19d0 net/sched/act_api.c:2118 rtnetlink_rcv_msg+0x12fc/0x1410 net/core/rtnetlink.c:6647 netlink_rcv_skb+0x375/0x650 net/netlink/af_netlink.c:2550 rtnetlink_rcv+0x34/0x40 net/core/rtnetlink.c:6665 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0xf52/0x1260 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x10da/0x11e0 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 ____sys_sendmsg+0x877/0xb60 net/socket.c:2597 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2651 __sys_sendmsg net/socket.c:2680 [inline] __do_sys_sendmsg net/socket.c:2689 [inline] __se_sys_sendmsg net/socket.c:2687 [inline] __x64_sys_sendmsg+0x307/0x4a0 net/socket.c:2687 x64_sys_call+0x2dd6/0x3c10 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Local variable key created at: tcf_ct_flow_table_get+0x4a/0x2260 net/sched/act_ct.c:324 tcf_ct_init+0xa67/0x2890 net/sched/act_ct.c:1408 Fixes: 88c67aeb1407 ("sched: act_ct: add netns into the key of tcf_ct_flow_table") Reported-by: syzbot+1b5e4e187cc586d05ea0@syzkaller.appspotmail.com 
Signed-off-by: Eric Dumazet Cc: Xin Long Reviewed-by: Simon Horman Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sched/act_ct.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 113b907da0f7..3ba8e7e739b5 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -44,6 +44,8 @@ static DEFINE_MUTEX(zones_mutex); struct zones_ht_key { struct net *net; u16 zone; + /* Note : pad[] must be the last field. */ + u8 pad[]; }; struct tcf_ct_flow_table { @@ -60,7 +62,7 @@ struct tcf_ct_flow_table { static const struct rhashtable_params zones_params = { .head_offset = offsetof(struct tcf_ct_flow_table, node), .key_offset = offsetof(struct tcf_ct_flow_table, key), - .key_len = sizeof_field(struct tcf_ct_flow_table, key), + .key_len = offsetof(struct zones_ht_key, pad), .automatic_shrinking = true, }; -- cgit From 8f4fa0876231c426f880a2bff25ac49fac67d805 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 25 Jul 2024 18:48:36 +0200 Subject: wifi: mac80211: use monitor sdata with driver only if desired In commit 0d9c2beed116 ("wifi: mac80211: fix monitor channel with chanctx emulation") I changed mac80211 to always have an internal monitor_sdata to have something to have the chanctx bound to. However, if the driver didn't also have the WANT_MONITOR flag this would cause mac80211 to allocate it without telling the driver (which was intentional) but also use it for later APIs to the driver without it ever having known about it which was _not_ intentional. Check through the code and only use the monitor_sdata in the relevant places (TX, MU-MIMO follow settings, TX power, and interface iteration) when the WANT_MONITOR flag is set. Cc: stable@vger.kernel.org Fixes: 0d9c2beed116 ("wifi: mac80211: fix monitor channel with chanctx emulation") Reported-by: ZeroBeat Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219086 Tested-by: Lorenzo Bianconi Link: https://patch.msgid.link/20240725184836.25d334157a8e.I02574086da2c5cf0e18264ce5807db6f14ffd9c0@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 7 +++++-- net/mac80211/tx.c | 5 +++-- net/mac80211/util.c | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 85cb71de370f..b02b84ce2130 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -114,7 +114,7 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, /* apply all changes now - no failures allowed */ - if (monitor_sdata) + if (monitor_sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) ieee80211_set_mu_mimo_follow(monitor_sdata, params); if (params->flags) { @@ -3053,6 +3053,9 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return -EOPNOTSUPP; + sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (!sdata) @@ -3115,7 +3118,7 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, if (has_monitor) { sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); - if (sdata) { + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { sdata->deflink.user_power_level = local->user_power_level; if (txp_type != sdata->vif.bss_conf.txpower_type) update_txp_type = true; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 72a9ba8bc5fd..edba4a31844f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1768,7 +1768,7 @@ static bool 
__ieee80211_tx(struct ieee80211_local *local, break; } sdata = rcu_dereference(local->monitor_sdata); - if (sdata) { + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { vif = &sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; @@ -3957,7 +3957,8 @@ begin: break; } tx.sdata = rcu_dereference(local->monitor_sdata); - if (tx.sdata) { + if (tx.sdata && + ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { vif = &tx.sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index ced19ce7c51a..c7ad9bc5973a 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -776,7 +776,7 @@ static void __iterate_interfaces(struct ieee80211_local *local, sdata = rcu_dereference_check(local->monitor_sdata, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)); - if (sdata && + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || sdata->flags & IEEE80211_SDATA_IN_DRIVER)) iterator(data, sdata->vif.addr, &sdata->vif); -- cgit From baeaabf970b9a90999f62ae27edf63f6cb86c023 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Wed, 24 Jul 2024 18:23:27 +0530 Subject: wifi: cfg80211: fix reporting failed MLO links status with cfg80211_connect_done Individual MLO links connection status is not copied to EVENT_CONNECT_RESULT data while processing the connect response information in cfg80211_connect_done(). Due to this failed links are wrongly indicated with success status in EVENT_CONNECT_RESULT. To fix this, copy the individual MLO links status to the EVENT_CONNECT_RESULT data. Fixes: 53ad07e9823b ("wifi: cfg80211: support reporting failed links") Signed-off-by: Veerendranath Jakkam Reviewed-by: Carlos Llamas Link: https://patch.msgid.link/20240724125327.3495874-1-quic_vjakkam@quicinc.com [commit message editorial changes] Signed-off-by: Johannes Berg --- net/wireless/sme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index e419aa8c4a5a..d9d7bf8bb5c1 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1045,6 +1045,7 @@ void cfg80211_connect_done(struct net_device *dev, cfg80211_hold_bss( bss_from_pub(params->links[link].bss)); ev->cr.links[link].bss = params->links[link].bss; + ev->cr.links[link].status = params->links[link].status; if (params->links[link].addr) { ev->cr.links[link].addr = next; -- cgit From 6873cc4416078202882691b424fcca5b5fb1a94d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 24 Jul 2024 13:29:12 +0200 Subject: wifi: cfg80211: correct S1G beacon length calculation The minimum header length calculation (equivalent to the start of the elements) for the S1G long beacon erroneously required only up to the start of u.s1g_beacon rather than the start of u.s1g_beacon.variable. Fix that, and also shuffle the branches around a bit to not assign useless values that are overwritten later. 
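A simplified illustration using toy structs (not the real ieee80211 layouts) of why the minimum length must reach the start of the variable-length elements rather than merely the start of the frame-specific member:
```
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct toy_s1g_beacon {
	uint8_t sa[6];
	uint8_t timestamp[4];
	uint8_t change_seq;
	uint8_t variable[1];	/* stands for the variable-length elements */
};

struct toy_ext_frame {
	uint16_t frame_control;
	uint16_t duration;
	union {
		struct toy_s1g_beacon s1g_beacon;
	} u;
};

int main(void)
{
	/* 4 vs 15 with these toy layouts: the first would accept frames that
	 * are too short to contain the fixed S1G beacon fields. */
	printf("too short: %zu\n", offsetof(struct toy_ext_frame, u.s1g_beacon));
	printf("correct  : %zu\n",
	       offsetof(struct toy_ext_frame, u.s1g_beacon.variable));
	return 0;
}
```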
Reported-by: syzbot+0f3afa93b91202f21939@syzkaller.appspotmail.com Fixes: 9eaffe5078ca ("cfg80211: convert S1G beacon to scan results") Link: https://patch.msgid.link/20240724132912.9662972db7c1.I8779675b5bbda4994cc66f876b6b87a2361c3c0b@changeid Signed-off-by: Johannes Berg --- net/wireless/scan.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d99319d82205..64eeed82d43d 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3178,8 +3178,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, struct ieee80211_mgmt *mgmt, size_t len, gfp_t gfp) { - size_t min_hdr_len = offsetof(struct ieee80211_mgmt, - u.probe_resp.variable); + size_t min_hdr_len; struct ieee80211_ext *ext = NULL; enum cfg80211_bss_frame_type ftype; u16 beacon_interval; @@ -3202,10 +3201,16 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (void *) mgmt; - min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon); if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_short_beacon.variable); + else + min_hdr_len = offsetof(struct ieee80211_ext, + u.s1g_beacon.variable); + } else { + /* same for beacons */ + min_hdr_len = offsetof(struct ieee80211_mgmt, + u.probe_resp.variable); } if (WARN_ON(len < min_hdr_len)) -- cgit From 189d7aae8f5a100b0db8b302debbd445475d01e6 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Mon, 22 Jul 2024 11:33:32 +0800 Subject: wifi: ath12k: fix reusing outside iterator in ath12k_wow_vif_set_wakeups() Smatch throws below warning: drivers/net/wireless/ath/ath12k/wow.c:434 ath12k_wow_vif_set_wakeups() warn: reusing outside iterator: 'i' drivers/net/wireless/ath/ath12k/wow.c 411 default: 412 break; 413 } 414 415 for (i = 0; i < wowlan->n_patterns; i++) { ^^^^^^^^^^^^^^^^^^^^^^ Here we loop until ->n_patterns 416 const struct cfg80211_pkt_pattern *eth_pattern = &patterns[i]; 417 struct ath12k_pkt_pattern new_pattern = {}; 418 419 if (WARN_ON(eth_pattern->pattern_len > WOW_MAX_PATTERN_SIZE)) 420 return -EINVAL; 421 422 if (ar->ab->wow.wmi_conf_rx_decap_mode == 423 ATH12K_HW_TXRX_NATIVE_WIFI) { 424 ath12k_wow_convert_8023_to_80211(ar, eth_pattern, 425 &new_pattern); 426 427 if (WARN_ON(new_pattern.pattern_len > WOW_MAX_PATTERN_SIZE)) 428 return -EINVAL; 429 } else { 430 memcpy(new_pattern.pattern, eth_pattern->pattern, 431 eth_pattern->pattern_len); 432 433 /* convert bitmask to bytemask */ --> 434 for (i = 0; i < eth_pattern->pattern_len; i++) 435 if (eth_pattern->mask[i / 8] & BIT(i % 8)) 436 new_pattern.bytemask[i] = 0xff; This loop re-uses i and the loop ends with i == eth_pattern->pattern_len. This looks like a bug. Change to use a new iterator 'j' for the inner loop to fix it. 
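A self-contained demonstration of this bug class (not the driver code itself): reusing the outer index in the inner loop makes the outer loop terminate early, so only one pattern gets processed:
```
#include <stdio.h>

int main(void)
{
	int n_patterns = 3, pattern_len = 8;
	int processed_buggy = 0, processed_fixed = 0;
	int i, j;

	for (i = 0; i < n_patterns; i++) {
		for (i = 0; i < pattern_len; i++)	/* bug: clobbers outer 'i' */
			;
		processed_buggy++;
	}

	for (i = 0; i < n_patterns; i++) {
		for (j = 0; j < pattern_len; j++)	/* fix: separate iterator */
			;
		processed_fixed++;
	}

	printf("buggy: %d patterns, fixed: %d patterns\n",
	       processed_buggy, processed_fixed);	/* buggy: 1, fixed: 3 */
	return 0;
}
```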
Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0-03427-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.15378.4 Fixes: 4a3c212eee0e ("wifi: ath12k: add basic WoW functionalities") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/d4975b95-9c43-45af-a0ab-80253f18c7f2@stanley.mountain/ Signed-off-by: Baochen Qiang Acked-by: Jeff Johnson Link: https://patch.msgid.link/20240722033332.6273-1-quic_bqiang@quicinc.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath12k/wow.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/wow.c b/drivers/net/wireless/ath/ath12k/wow.c index bead19db2c9a..9b8684abbe40 100644 --- a/drivers/net/wireless/ath/ath12k/wow.c +++ b/drivers/net/wireless/ath/ath12k/wow.c @@ -361,7 +361,7 @@ static int ath12k_wow_vif_set_wakeups(struct ath12k_vif *arvif, struct ath12k *ar = arvif->ar; unsigned long wow_mask = 0; int pattern_id = 0; - int ret, i; + int ret, i, j; /* Setup requested WOW features */ switch (arvif->vdev_type) { @@ -431,9 +431,9 @@ static int ath12k_wow_vif_set_wakeups(struct ath12k_vif *arvif, eth_pattern->pattern_len); /* convert bitmask to bytemask */ - for (i = 0; i < eth_pattern->pattern_len; i++) - if (eth_pattern->mask[i / 8] & BIT(i % 8)) - new_pattern.bytemask[i] = 0xff; + for (j = 0; j < eth_pattern->pattern_len; j++) + if (eth_pattern->mask[j / 8] & BIT(j % 8)) + new_pattern.bytemask[j] = 0xff; new_pattern.pattern_len = eth_pattern->pattern_len; new_pattern.pkt_offset = eth_pattern->pkt_offset; -- cgit From 6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Thu, 18 Jul 2024 16:46:33 -0700 Subject: wifi: mt76: mt7921: fix null pointer access in mt792x_mac_link_bss_remove Fix null pointer access in mt792x_mac_link_bss_remove. To prevent null pointer access, we should assign the vif to bss_conf in mt7921_add_interface. This ensures that subsequent operations on the BSS can properly reference the correct vif. [ T843] Call Trace: [ T843] [ T843] ? __die+0x1e/0x60 [ T843] ? page_fault_oops+0x157/0x450 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? search_bpf_extables+0x5a/0x80 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? exc_page_fault+0x2bb/0x670 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? lock_timer_base+0x71/0x90 [ T843] ? asm_exc_page_fault+0x26/0x30 [ T843] ? mt792x_mac_link_bss_remove+0x24/0x110 [mt792x_lib] [ T843] ? mt792x_remove_interface+0x6e/0x90 [mt792x_lib] [ T843] ? ieee80211_do_stop+0x507/0x7e0 [mac80211] [ T843] ? ieee80211_stop+0x53/0x190 [mac80211] [ T843] ? __dev_close_many+0xa5/0x120 [ T843] ? __dev_change_flags+0x18c/0x220 [ T843] ? dev_change_flags+0x21/0x60 [ T843] ? do_setlink+0xdf9/0x11d0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? security_sock_rcv_skb+0x33/0x50 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? __nla_validate_parse+0x61/0xd10 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? genl_done+0x53/0x80 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? netlink_dump+0x357/0x410 [ T843] ? __rtnl_newlink+0x5d6/0x980 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? genl_family_rcv_msg_dumpit+0xdf/0xf0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? __kmalloc_cache_noprof+0x44/0x210 [ T843] ? rtnl_newlink+0x42/0x60 [ T843] ? rtnetlink_rcv_msg+0x152/0x3f0 [ T843] ? mptcp_pm_nl_dump_addr+0x180/0x180 [ T843] ? 
rtnl_calcit.isra.0+0x130/0x130 [ T843] ? netlink_rcv_skb+0x56/0x100 [ T843] ? netlink_unicast+0x199/0x290 [ T843] ? netlink_sendmsg+0x21d/0x490 [ T843] ? __sock_sendmsg+0x78/0x80 [ T843] ? ____sys_sendmsg+0x23f/0x2e0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? copy_msghdr_from_user+0x68/0xa0 [ T843] ? ___sys_sendmsg+0x81/0xd0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? crng_fast_key_erasure+0xbc/0xf0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? get_random_bytes_user+0x126/0x140 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? __fdget+0xb1/0xe0 [ T843] ? __sys_sendmsg+0x56/0xa0 [ T843] ? srso_alias_return_thunk+0x5/0xfbef5 [ T843] ? do_syscall_64+0x5f/0x170 [ T843] ? entry_SYSCALL_64_after_hwframe+0x55/0x5d [ T843] Fixes: 1541d63c5fe2 ("wifi: mt76: mt7925: add mt7925_mac_link_bss_remove to remove per-link BSS") Reported-by: Bert Karwatzki Closes: https://lore.kernel.org/linux-wireless/2fee61f8c903d02a900ca3188c3742c7effd102e.camel@web.de/#b Signed-off-by: Sean Wang Tested-by: Bert Karwatzki Link: https://patch.msgid.link/20240718234633.12737-1-sean.wang@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/mediatek/mt76/mt7921/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 2e6268cb06c0..1bab93d049df 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif) mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx; mvif->phy = phy; + mvif->bss_conf.vif = mvif; mvif->bss_conf.mt76.band_idx = 0; mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx % MT76_CONNAC_MAX_WMM_SETS; -- cgit From a47f3320bb4ba6714abe8dddb36399367b491358 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 9 Jul 2024 09:31:32 +0200 Subject: wifi: ath12k: fix soft lockup on suspend The ext interrupts are enabled when the firmware has been started, but this may never happen, for example, if the board configuration file is missing. When the system is later suspended, the driver unconditionally tries to disable interrupts, which results in an irq disable imbalance and causes the driver to spin indefinitely in napi_synchronize(). Make sure that the interrupts have been enabled before attempting to disable them. 
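A small userspace model of the guard (not the driver code): with a test-and-clear on the "enabled" flag, the disable path becomes a no-op when the interrupts were never enabled, so the enable/disable accounting can no longer go negative:
```
#include <stdbool.h>
#include <stdio.h>

#define FLAG_EXT_IRQ_ENABLED	(1UL << 0)

static unsigned long dev_flags;
static int irq_disable_count;

static bool test_and_clear_flag(unsigned long *flags, unsigned long bit)
{
	bool was_set = *flags & bit;

	*flags &= ~bit;
	return was_set;
}

static void ext_irq_disable(void)
{
	if (!test_and_clear_flag(&dev_flags, FLAG_EXT_IRQ_ENABLED))
		return;			/* never enabled: nothing to undo */
	irq_disable_count++;		/* stand-in for disable + napi_synchronize() */
}

int main(void)
{
	ext_irq_disable();		/* suspend before firmware start: no-op */
	dev_flags |= FLAG_EXT_IRQ_ENABLED;
	ext_irq_disable();		/* normal path: disables exactly once */
	printf("disables performed: %d\n", irq_disable_count);	/* 1 */
	return 0;
}
```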
Fixes: d889913205cf ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices") Cc: stable@vger.kernel.org # 6.3 Signed-off-by: Johan Hovold Acked-by: Jeff Johnson Link: https://patch.msgid.link/20240709073132.9168-1-johan+linaro@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath12k/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/pci.c b/drivers/net/wireless/ath/ath12k/pci.c index 876c029f58f6..9e0b9e329bda 100644 --- a/drivers/net/wireless/ath/ath12k/pci.c +++ b/drivers/net/wireless/ath/ath12k/pci.c @@ -473,7 +473,8 @@ static void __ath12k_pci_ext_irq_disable(struct ath12k_base *ab) { int i; - clear_bit(ATH12K_FLAG_EXT_IRQ_ENABLED, &ab->dev_flags); + if (!test_and_clear_bit(ATH12K_FLAG_EXT_IRQ_ENABLED, &ab->dev_flags)) + return; for (i = 0; i < ATH12K_EXT_IRQ_GRP_NUM_MAX; i++) { struct ath12k_ext_irq_grp *irq_grp = &ab->ext_irq_grp[i]; -- cgit From 9c421ef3f6b30ab912eaaa3c3d20cfb921fd8c8f Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Thu, 18 Jul 2024 16:32:43 +0800 Subject: erofs: support STATX_DIOALIGN Add support for STATX_DIOALIGN to EROFS, so that direct I/O alignment restrictions are exposed to userspace in a generic way. [Before] ``` ./statx_test /mnt/erofs/testfile statx(/mnt/erofs/testfile) = 0 dio mem align:0 dio offset align:0 ``` [After] ``` ./statx_test /mnt/erofs/testfile statx(/mnt/erofs/testfile) = 0 dio mem align:512 dio offset align:512 ``` Signed-off-by: Hongbo Li Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240718083243.2485437-1-hsiangkao@linux.alibaba.com --- fs/erofs/inode.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 5f6439a63af7..43c09aae2afc 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -334,14 +334,29 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, unsigned int query_flags) { struct inode *const inode = d_inode(path->dentry); + bool compressed = + erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout); - if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) + if (compressed) stat->attributes |= STATX_ATTR_COMPRESSED; - stat->attributes |= STATX_ATTR_IMMUTABLE; stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); + /* + * Return the DIO alignment restrictions if requested. + * + * In EROFS, STATX_DIOALIGN is not supported in ondemand mode and + * compressed files, so in these cases we report no DIO support. + */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + stat->result_mask |= STATX_DIOALIGN; + if (!erofs_is_fscache_mode(inode->i_sb) && !compressed) { + stat->dio_mem_align = + bdev_logical_block_size(inode->i_sb->s_bdev); + stat->dio_offset_align = stat->dio_mem_align; + } + } generic_fillattr(idmap, request_mask, inode, stat); return 0; } -- cgit From 7dc5537c3f8be87e005f0844a7626c987914f8fd Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 22 Jul 2024 11:51:10 +0800 Subject: erofs: fix race in z_erofs_get_gbuf() In z_erofs_get_gbuf(), the current task may be migrated to another CPU between `z_erofs_gbuf_id()` and `spin_lock(&gbuf->lock)`. Therefore, z_erofs_put_gbuf() will trigger the following issue which was found by stress test: <2>[772156.434168] kernel BUG at fs/erofs/zutil.c:58! .. 
<4>[772156.435007] <4>[772156.439237] CPU: 0 PID: 3078 Comm: stress Kdump: loaded Tainted: G E 6.10.0-rc7+ #2 <4>[772156.439239] Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 1.0.0 01/01/2017 <4>[772156.439241] pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) <4>[772156.439243] pc : z_erofs_put_gbuf+0x64/0x70 [erofs] <4>[772156.439252] lr : z_erofs_lz4_decompress+0x600/0x6a0 [erofs] .. <6>[772156.445958] stress (3127): drop_caches: 1 <4>[772156.446120] Call trace: <4>[772156.446121] z_erofs_put_gbuf+0x64/0x70 [erofs] <4>[772156.446761] z_erofs_lz4_decompress+0x600/0x6a0 [erofs] <4>[772156.446897] z_erofs_decompress_queue+0x740/0xa10 [erofs] <4>[772156.447036] z_erofs_runqueue+0x428/0x8c0 [erofs] <4>[772156.447160] z_erofs_readahead+0x224/0x390 [erofs] .. Fixes: f36f3010f676 ("erofs: rename per-CPU buffers to global buffer pool and make it configurable") Cc: # 6.10+ Reviewed-by: Chunhai Guo Reviewed-by: Sandeep Dhavale Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240722035110.3456740-1-hsiangkao@linux.alibaba.com --- fs/erofs/zutil.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index b80f612867c2..9b53883e5caf 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -38,11 +38,13 @@ void *z_erofs_get_gbuf(unsigned int requiredpages) { struct z_erofs_gbuf *gbuf; + migrate_disable(); gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()]; spin_lock(&gbuf->lock); /* check if the buffer is too small */ if (requiredpages > gbuf->nrpages) { spin_unlock(&gbuf->lock); + migrate_enable(); /* (for sparse checker) pretend gbuf->lock is still taken */ __acquire(gbuf->lock); return NULL; @@ -57,6 +59,7 @@ void z_erofs_put_gbuf(void *ptr) __releases(gbuf->lock) gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()]; DBG_BUGON(gbuf->ptr != ptr); spin_unlock(&gbuf->lock); + migrate_enable(); } int z_erofs_gbuf_growsize(unsigned int nrpages) -- cgit From 684b290abc774202ff88897648f24520f40c916b Mon Sep 17 00:00:00 2001 From: Huang Xiaojia Date: Sat, 20 Jul 2024 16:23:35 +0800 Subject: erofs: add support for FS_IOC_GETFSSYSFSPATH FS_IOC_GETFSSYSFSPATH ioctl exposes /sys/fs path of a given filesystem, potentially standarizing sysfs reporting. This patch add support for FS_IOC_GETFSSYSFSPATH for erofs, "erofs/" will be outputted for bdev cases, "erofs/[domain_id,]" will be outputted for fscache cases. 
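A rough userspace sketch (made-up identifiers and values, not the erofs code) of the three name forms produced by the patch below:
```
#include <stdio.h>

static void print_sysfs_name(const char *domain_id, const char *fsid,
			     const char *bdev_id)
{
	char name[64];

	if (fsid && domain_id)
		snprintf(name, sizeof(name), "%s,%s", domain_id, fsid);
	else if (fsid)
		snprintf(name, sizeof(name), "%s", fsid);
	else
		snprintf(name, sizeof(name), "%s", bdev_id);
	printf("erofs/%s\n", name);
}

int main(void)
{
	print_sysfs_name("mydomain", "myfsid", NULL);	/* fscache, with domain */
	print_sysfs_name(NULL, "myfsid", NULL);		/* fscache, no domain */
	print_sysfs_name(NULL, NULL, "sda1");		/* bdev-backed mount */
	return 0;
}
```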
Signed-off-by: Huang Xiaojia Link: https://lore.kernel.org/r/20240720082335.441563-1-huangxiaojia2@huawei.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 35268263aaed..32ce5b35e1df 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -576,6 +576,21 @@ static const struct export_operations erofs_export_ops = { .get_parent = erofs_get_parent, }; +static void erofs_set_sysfs_name(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + if (erofs_is_fscache_mode(sb)) { + if (sbi->domain_id) + super_set_sysfs_name_generic(sb, "%s,%s",sbi->domain_id, + sbi->fsid); + else + super_set_sysfs_name_generic(sb, "%s", sbi->fsid); + return; + } + super_set_sysfs_name_id(sb); +} + static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; @@ -643,6 +658,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; + erofs_set_sysfs_name(sb); #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); -- cgit From 5d3bb77e5fce1d224b94da31abae0a7afed54735 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 23 Jul 2024 15:30:24 +0800 Subject: erofs: support multi-page folios for erofs_bread() If the requested page is part of the previous multi-page folio, there is no need to call read_mapping_folio() again. Also, get rid of the remaining one of page->index [1] in our codebase. [1] https://lore.kernel.org/r/Zp8fgUSIBGQ1TN0D@casper.infradead.org Cc: Matthew Wilcox Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240723073024.875290-1-hsiangkao@linux.alibaba.com --- fs/erofs/data.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 8be60797ea2f..1b7eba38ba1e 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -21,38 +21,32 @@ void erofs_put_metabuf(struct erofs_buf *buf) if (!buf->page) return; erofs_unmap_metabuf(buf); - put_page(buf->page); + folio_put(page_folio(buf->page)); buf->page = NULL; } -/* - * Derive the block size from inode->i_blkbits to make compatible with - * anonymous inode in fscache mode. 
- */ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, enum erofs_kmap_type type) { pgoff_t index = offset >> PAGE_SHIFT; - struct page *page = buf->page; - struct folio *folio; - unsigned int nofs_flag; + struct folio *folio = NULL; - if (!page || page->index != index) { + if (buf->page) { + folio = page_folio(buf->page); + if (folio_file_page(folio, index) != buf->page) + erofs_unmap_metabuf(buf); + } + if (!folio || !folio_contains(folio, index)) { erofs_put_metabuf(buf); - - nofs_flag = memalloc_nofs_save(); - folio = read_cache_folio(buf->mapping, index, NULL, NULL); - memalloc_nofs_restore(nofs_flag); + folio = read_mapping_folio(buf->mapping, index, NULL); if (IS_ERR(folio)) return folio; - - /* should already be PageUptodate, no need to lock page */ - page = folio_file_page(folio, index); - buf->page = page; } + buf->page = folio_file_page(folio, index); + if (buf->kmap_type == EROFS_NO_KMAP) { if (type == EROFS_KMAP) - buf->base = kmap_local_page(page); + buf->base = kmap_local_page(buf->page); buf->kmap_type = type; } else if (buf->kmap_type != type) { DBG_BUGON(1); -- cgit From 14e9283fb22d0d259820a5f05c6059678bab9ac5 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 24 Jul 2024 10:07:21 +0800 Subject: erofs: convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Chen Ni Link: https://lore.kernel.org/r/20240724020721.2389738-1-nichen@iscas.ac.cn Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/decompressor_lzma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 06a722b85a45..40666815046f 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -188,7 +188,7 @@ again: !rq->partial_decoding); buf.in_size = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in); rq->inputsize -= buf.in_size; - buf.in = dctx.kin + rq->pageofs_in, + buf.in = dctx.kin + rq->pageofs_in; dctx.bounce = strm->bounce; do { dctx.avail_out = buf.out_size - buf.out_pos; -- cgit From 9bc7501b0b90f4d0c34b97c14ff1f708ce7ad8f3 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 3 Jun 2024 11:15:27 -0400 Subject: i3c: master: svc: resend target address when get NACK According to I3C Spec 1.1.1, 11-Jun-2021, section: 5.1.2.2.3: If the Controller chooses to start an I3C Message with an I3C Dynamic Address, then special provisions shall be made because that same I3C Target may be initiating an IBI or a Controller Role Request. So, one of three things may happen: (skip 1, 2) 3. The Addresses match and the RnW bits also match, and so neither Controller nor Target will ACK since both are expecting the other side to provide ACK. As a result, each side might think it had "won" arbitration, but neither side would continue, as each would subsequently see that the other did not provide ACK. ... For either value of RnW: Due to the NACK, the Controller shall defer the Private Write or Private Read, and should typically transmit the Target ^^^^^^^^^^^^^^^^^^^ Address again after a Repeated START (i.e., the next one or any one prior ^^^^^^^^^^^^^ to a STOP in the Frame). Since the Address Header following a Repeated START is not arbitrated, the Controller will always win (see Section 5.1.2.2.4). Resend target address again if address is not 7E and controller get NACK. 
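The resulting flow can be summarized by the sketch below (struct svc_ctrl and the emit_start_with_addr()/addr_nacked()/clear_nack() helpers are hypothetical stand-ins for the MCTRL/MSTATUS/MERRWARN register accesses in the real driver change that follows):

	static int start_with_nack_retry(struct svc_ctrl *ctrl, u8 addr)
	{
		int retry = 2;

		while (retry--) {
			emit_start_with_addr(ctrl, addr);	/* (repeated) START + address */
			if (!addr_nacked(ctrl))
				return 0;			/* ACKed, continue the message */
			if (!retry || addr == 0x7e)
				return -ENXIO;			/* broadcast or second NACK: give up */
			clear_nack(ctrl);			/* retry once; the repeated START is not arbitrated */
		}
		return -ENXIO;
	}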
Reviewed-by: Miquel Raynal Signed-off-by: Frank Li Signed-off-by: Alexandre Belloni --- drivers/i3c/master/svc-i3c-master.c | 58 ++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index bb299ce02ccc..f0362509319e 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -1052,29 +1052,59 @@ static int svc_i3c_master_xfer(struct svc_i3c_master *master, u8 *in, const u8 *out, unsigned int xfer_len, unsigned int *actual_len, bool continued) { + int retry = 2; u32 reg; int ret; /* clean SVC_I3C_MINT_IBIWON w1c bits */ writel(SVC_I3C_MINT_IBIWON, master->regs + SVC_I3C_MSTATUS); - writel(SVC_I3C_MCTRL_REQUEST_START_ADDR | - xfer_type | - SVC_I3C_MCTRL_IBIRESP_NACK | - SVC_I3C_MCTRL_DIR(rnw) | - SVC_I3C_MCTRL_ADDR(addr) | - SVC_I3C_MCTRL_RDTERM(*actual_len), - master->regs + SVC_I3C_MCTRL); - ret = readl_poll_timeout(master->regs + SVC_I3C_MSTATUS, reg, + while (retry--) { + writel(SVC_I3C_MCTRL_REQUEST_START_ADDR | + xfer_type | + SVC_I3C_MCTRL_IBIRESP_NACK | + SVC_I3C_MCTRL_DIR(rnw) | + SVC_I3C_MCTRL_ADDR(addr) | + SVC_I3C_MCTRL_RDTERM(*actual_len), + master->regs + SVC_I3C_MCTRL); + + ret = readl_poll_timeout(master->regs + SVC_I3C_MSTATUS, reg, SVC_I3C_MSTATUS_MCTRLDONE(reg), 0, 1000); - if (ret) - goto emit_stop; + if (ret) + goto emit_stop; - if (readl(master->regs + SVC_I3C_MERRWARN) & SVC_I3C_MERRWARN_NACK) { - ret = -ENXIO; - *actual_len = 0; - goto emit_stop; + if (readl(master->regs + SVC_I3C_MERRWARN) & SVC_I3C_MERRWARN_NACK) { + /* + * According to I3C Spec 1.1.1, 11-Jun-2021, section: 5.1.2.2.3. + * If the Controller chooses to start an I3C Message with an I3C Dynamic + * Address, then special provisions shall be made because that same I3C + * Target may be initiating an IBI or a Controller Role Request. So, one of + * three things may happen: (skip 1, 2) + * + * 3. The Addresses match and the RnW bits also match, and so neither + * Controller nor Target will ACK since both are expecting the other side to + * provide ACK. As a result, each side might think it had "won" arbitration, + * but neither side would continue, as each would subsequently see that the + * other did not provide ACK. + * ... + * For either value of RnW: Due to the NACK, the Controller shall defer the + * Private Write or Private Read, and should typically transmit the Target + * Address again after a Repeated START (i.e., the next one or any one prior + * to a STOP in the Frame). Since the Address Header following a Repeated + * START is not arbitrated, the Controller will always win (see Section + * 5.1.2.2.4). + */ + if (retry && addr != 0x7e) { + writel(SVC_I3C_MERRWARN_NACK, master->regs + SVC_I3C_MERRWARN); + } else { + ret = -ENXIO; + *actual_len = 0; + goto emit_stop; + } + } else { + break; + } } /* -- cgit From be90ae1ba14a83962b33c4d4c854ef081186b0e4 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 14 Jun 2024 17:02:08 +0300 Subject: i3c: mipi-i3c-hci: Fix number of DAT/DCT entries for HCI versions < 1.1 I was wrong about the TABLE_SIZE field description in the commit 0676bfebf576 ("i3c: mipi-i3c-hci: Fix DAT/DCT entry sizes"). For the MIPI I3C HCI versions 1.0 and earlier the TABLE_SIZE field in the registers DAT_SECTION_OFFSET and DCT_SECTION_OFFSET is indeed defined in DWORDs and not number of entries like it is defined in later versions. 
Where above fix allowed driver initialization to continue the wrongly interpreted TABLE_SIZE field leads variables DAT_entries being twice and DCT_entries four times as big as they really are. That in turn leads clearing the DAT table over the boundary in the dat_v1.c: hci_dat_v1_init(). So interprete the TABLE_SIZE field in DWORDs for HCI versions < 1.1 and fix number of DAT/DCT entries accordingly. Fixes: 0676bfebf576 ("i3c: mipi-i3c-hci: Fix DAT/DCT entry sizes") Signed-off-by: Jarkko Nikula Signed-off-by: Alexandre Belloni --- drivers/i3c/master/mipi-i3c-hci/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c index d7e966a25583..4e7d6a43ee9b 100644 --- a/drivers/i3c/master/mipi-i3c-hci/core.c +++ b/drivers/i3c/master/mipi-i3c-hci/core.c @@ -631,6 +631,7 @@ static irqreturn_t i3c_hci_irq_handler(int irq, void *dev_id) static int i3c_hci_init(struct i3c_hci *hci) { u32 regval, offset; + bool size_in_dwords; int ret; /* Validate HCI hardware version */ @@ -654,11 +655,16 @@ static int i3c_hci_init(struct i3c_hci *hci) hci->caps = reg_read(HC_CAPABILITIES); DBG("caps = %#x", hci->caps); + size_in_dwords = hci->version_major < 1 || + (hci->version_major == 1 && hci->version_minor < 1); + regval = reg_read(DAT_SECTION); offset = FIELD_GET(DAT_TABLE_OFFSET, regval); hci->DAT_regs = offset ? hci->base_regs + offset : NULL; hci->DAT_entries = FIELD_GET(DAT_TABLE_SIZE, regval); hci->DAT_entry_size = FIELD_GET(DAT_ENTRY_SIZE, regval) ? 0 : 8; + if (size_in_dwords) + hci->DAT_entries = 4 * hci->DAT_entries / hci->DAT_entry_size; dev_info(&hci->master.dev, "DAT: %u %u-bytes entries at offset %#x\n", hci->DAT_entries, hci->DAT_entry_size, offset); @@ -667,6 +673,8 @@ static int i3c_hci_init(struct i3c_hci *hci) hci->DCT_regs = offset ? hci->base_regs + offset : NULL; hci->DCT_entries = FIELD_GET(DCT_TABLE_SIZE, regval); hci->DCT_entry_size = FIELD_GET(DCT_ENTRY_SIZE, regval) ? 0 : 16; + if (size_in_dwords) + hci->DCT_entries = 4 * hci->DCT_entries / hci->DCT_entry_size; dev_info(&hci->master.dev, "DCT: %u %u-bytes entries at offset %#x\n", hci->DCT_entries, hci->DCT_entry_size, offset); -- cgit From 8f2cb0327938c786cede21a542fb538ff243a03a Mon Sep 17 00:00:00 2001 From: Aniket Date: Tue, 11 Jun 2024 11:26:50 +0000 Subject: i3c: dw: Fix clearing queue thld QUEUE_THLD_CTRL_IBI_STAT_MASK is repeated twice. Replace with QUEUE_THLD_CTRL_IBI_DATA_MASK. Signed-off-by: Aniket Signed-off-by: Alexandre Belloni --- drivers/i3c/master/dw-i3c-master.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index 0ec00e644bd4..0ba4960aa77b 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -631,7 +631,7 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m) thld_ctrl = readl(master->regs + QUEUE_THLD_CTRL); thld_ctrl &= ~(QUEUE_THLD_CTRL_RESP_BUF_MASK | QUEUE_THLD_CTRL_IBI_STAT_MASK | - QUEUE_THLD_CTRL_IBI_STAT_MASK); + QUEUE_THLD_CTRL_IBI_DATA_MASK); thld_ctrl |= QUEUE_THLD_CTRL_IBI_STAT(1) | QUEUE_THLD_CTRL_IBI_DATA(31); writel(thld_ctrl, master->regs + QUEUE_THLD_CTRL); -- cgit From 64bf1459080670f05b8bb9c0cd140befd6780b5e Mon Sep 17 00:00:00 2001 From: Aniket Date: Fri, 7 Jun 2024 07:20:30 +0000 Subject: i3c: dw: Fix IBI intr programming IBI_SIR_REQ_REJECT register is not present if the IP has IC_HAS_IBI_DATA = 1 set. So don't rely on doing read- modify-write op on this register. 
Instead maintain a variable to store the sir reject mask and use it to set IBI_SIR_REQ_REJECT. Signed-off-by: Aniket Reviewed-by: Jeremy Kerr Signed-off-by: Alexandre Belloni --- drivers/i3c/master/dw-i3c-master.c | 15 ++++++++------- drivers/i3c/master/dw-i3c-master.h | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index 0ba4960aa77b..77a2a1c3fd1d 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -658,7 +658,9 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m) if (ret) return ret; - writel(IBI_REQ_REJECT_ALL, master->regs + IBI_SIR_REQ_REJECT); + master->sir_rej_mask = IBI_REQ_REJECT_ALL; + writel(master->sir_rej_mask, master->regs + IBI_SIR_REQ_REJECT); + writel(IBI_REQ_REJECT_ALL, master->regs + IBI_MR_REQ_REJECT); /* For now don't support Hot-Join */ @@ -1175,17 +1177,16 @@ static void dw_i3c_master_set_sir_enabled(struct dw_i3c_master *master, master->platform_ops->set_dat_ibi(master, dev, enable, ®); writel(reg, master->regs + dat_entry); - reg = readl(master->regs + IBI_SIR_REQ_REJECT); if (enable) { - global = reg == 0xffffffff; - reg &= ~BIT(idx); + global = (master->sir_rej_mask == IBI_REQ_REJECT_ALL); + master->sir_rej_mask &= ~BIT(idx); } else { bool hj_rejected = !!(readl(master->regs + DEVICE_CTRL) & DEV_CTRL_HOT_JOIN_NACK); - reg |= BIT(idx); - global = (reg == 0xffffffff) && hj_rejected; + master->sir_rej_mask |= BIT(idx); + global = (master->sir_rej_mask == IBI_REQ_REJECT_ALL) && hj_rejected; } - writel(reg, master->regs + IBI_SIR_REQ_REJECT); + writel(master->sir_rej_mask, master->regs + IBI_SIR_REQ_REJECT); if (global) dw_i3c_master_enable_sir_signal(master, enable); diff --git a/drivers/i3c/master/dw-i3c-master.h b/drivers/i3c/master/dw-i3c-master.h index 4ab94aa72252..8cb617b8147e 100644 --- a/drivers/i3c/master/dw-i3c-master.h +++ b/drivers/i3c/master/dw-i3c-master.h @@ -39,7 +39,7 @@ struct dw_i3c_master { char version[5]; char type[5]; bool ibi_capable; - + u32 sir_rej_mask; /* * Per-device hardware data, used to manage the device address table * (DAT) -- cgit From d9deb28f700cbe15800f1835da7994a5c6f03928 Mon Sep 17 00:00:00 2001 From: Aniket Date: Thu, 27 Jun 2024 03:41:19 +0000 Subject: i3c: dw: Remove ibi_capable property Since DW I3C IP master role always supports IBI, we don't need to keep two variants of master ops and select one using this property. Hence remove the code. 
Signed-off-by: Aniket Link: https://lore.kernel.org/r/20240627034119.3938050-1-aniketmaurya@google.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/ast2600-i3c-master.c | 1 - drivers/i3c/master/dw-i3c-master.c | 23 ++--------------------- drivers/i3c/master/dw-i3c-master.h | 1 - 3 files changed, 2 insertions(+), 23 deletions(-) diff --git a/drivers/i3c/master/ast2600-i3c-master.c b/drivers/i3c/master/ast2600-i3c-master.c index 01a47d3dd499..84942dbb6f80 100644 --- a/drivers/i3c/master/ast2600-i3c-master.c +++ b/drivers/i3c/master/ast2600-i3c-master.c @@ -156,7 +156,6 @@ static int ast2600_i3c_probe(struct platform_device *pdev) i3c->sda_pullup); i3c->dw.platform_ops = &ast2600_i3c_ops; - i3c->dw.ibi_capable = true; return dw_i3c_common_probe(&i3c->dw, pdev); } diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index 77a2a1c3fd1d..9aae5c8dba8d 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -1404,21 +1404,6 @@ static const struct i3c_master_controller_ops dw_mipi_i3c_ops = { .attach_i2c_dev = dw_i3c_master_attach_i2c_dev, .detach_i2c_dev = dw_i3c_master_detach_i2c_dev, .i2c_xfers = dw_i3c_master_i2c_xfers, -}; - -static const struct i3c_master_controller_ops dw_mipi_i3c_ibi_ops = { - .bus_init = dw_i3c_master_bus_init, - .bus_cleanup = dw_i3c_master_bus_cleanup, - .attach_i3c_dev = dw_i3c_master_attach_i3c_dev, - .reattach_i3c_dev = dw_i3c_master_reattach_i3c_dev, - .detach_i3c_dev = dw_i3c_master_detach_i3c_dev, - .do_daa = dw_i3c_master_daa, - .supports_ccc_cmd = dw_i3c_master_supports_ccc_cmd, - .send_ccc_cmd = dw_i3c_master_send_ccc_cmd, - .priv_xfers = dw_i3c_master_priv_xfers, - .attach_i2c_dev = dw_i3c_master_attach_i2c_dev, - .detach_i2c_dev = dw_i3c_master_detach_i2c_dev, - .i2c_xfers = dw_i3c_master_i2c_xfers, .request_ibi = dw_i3c_master_request_ibi, .free_ibi = dw_i3c_master_free_ibi, .enable_ibi = dw_i3c_master_enable_ibi, @@ -1456,7 +1441,6 @@ static void dw_i3c_hj_work(struct work_struct *work) int dw_i3c_common_probe(struct dw_i3c_master *master, struct platform_device *pdev) { - const struct i3c_master_controller_ops *ops; int ret, irq; if (!master->platform_ops) @@ -1506,12 +1490,9 @@ int dw_i3c_common_probe(struct dw_i3c_master *master, master->maxdevs = ret >> 16; master->free_pos = GENMASK(master->maxdevs - 1, 0); - ops = &dw_mipi_i3c_ops; - if (master->ibi_capable) - ops = &dw_mipi_i3c_ibi_ops; - INIT_WORK(&master->hj_work, dw_i3c_hj_work); - ret = i3c_master_register(&master->base, &pdev->dev, ops, false); + ret = i3c_master_register(&master->base, &pdev->dev, + &dw_mipi_i3c_ops, false); if (ret) goto err_assert_rst; diff --git a/drivers/i3c/master/dw-i3c-master.h b/drivers/i3c/master/dw-i3c-master.h index 8cb617b8147e..7e76ca381d9f 100644 --- a/drivers/i3c/master/dw-i3c-master.h +++ b/drivers/i3c/master/dw-i3c-master.h @@ -38,7 +38,6 @@ struct dw_i3c_master { struct clk *core_clk; char version[5]; char type[5]; - bool ibi_capable; u32 sir_rej_mask; /* * Per-device hardware data, used to manage the device address table -- cgit From 74e931f090a1d1a71d460284e87bd668667b98db Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 28 Jun 2024 16:15:56 +0300 Subject: i3c: mipi-i3c-hci: Switch to lower_32_bits()/upper_32_bits() helpers Rather than having own lo32()/hi32() helpers for dealing with 32-bit and 64-bit build targets switch to generic lower_32_bits()/upper_32_bits() helpers. 
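For reference, a small self-contained illustration of how the generic helpers split a 64-bit value (the macros below are simplified copies of the kernel ones, which recent kernels provide via include/linux/wordpart.h):

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * The double 16-bit shift in upper_32_bits() avoids a
	 * "shift count >= width of type" warning when the argument is only
	 * 32 bits wide (e.g. a 32-bit dma_addr_t) - the same trick the
	 * removed hi32() helper used.
	 */
	#define upper_32_bits(n)	((uint32_t)(((n) >> 16) >> 16))
	#define lower_32_bits(n)	((uint32_t)((n) & 0xffffffff))

	int main(void)
	{
		uint64_t dma = 0x0000000123456000ULL;	/* example DMA address */

		printf("HI=%#x LO=%#x\n", upper_32_bits(dma), lower_32_bits(dma));
		return 0;				/* prints HI=0x1 LO=0x23456000 */
	}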
Signed-off-by: Jarkko Nikula Link: https://lore.kernel.org/r/20240628131559.502822-1-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/mipi-i3c-hci/dma.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c index 4e01a95cc4d0..6c5964e727b3 100644 --- a/drivers/i3c/master/mipi-i3c-hci/dma.c +++ b/drivers/i3c/master/mipi-i3c-hci/dma.c @@ -147,21 +147,6 @@ struct hci_dma_dev_ibi_data { unsigned int max_len; }; -static inline u32 lo32(dma_addr_t physaddr) -{ - return physaddr; -} - -static inline u32 hi32(dma_addr_t physaddr) -{ - /* trickery to avoid compiler warnings on 32-bit build targets */ - if (sizeof(dma_addr_t) > 4) { - u64 hi = physaddr; - return hi >> 32; - } - return 0; -} - static void hci_dma_cleanup(struct i3c_hci *hci) { struct hci_rings_data *rings = hci->io_data; @@ -265,10 +250,10 @@ static int hci_dma_init(struct i3c_hci *hci) if (!rh->xfer || !rh->resp || !rh->src_xfers) goto err_out; - rh_reg_write(CMD_RING_BASE_LO, lo32(rh->xfer_dma)); - rh_reg_write(CMD_RING_BASE_HI, hi32(rh->xfer_dma)); - rh_reg_write(RESP_RING_BASE_LO, lo32(rh->resp_dma)); - rh_reg_write(RESP_RING_BASE_HI, hi32(rh->resp_dma)); + rh_reg_write(CMD_RING_BASE_LO, lower_32_bits(rh->xfer_dma)); + rh_reg_write(CMD_RING_BASE_HI, upper_32_bits(rh->xfer_dma)); + rh_reg_write(RESP_RING_BASE_LO, lower_32_bits(rh->resp_dma)); + rh_reg_write(RESP_RING_BASE_HI, upper_32_bits(rh->resp_dma)); regval = FIELD_PREP(CR_RING_SIZE, rh->xfer_entries); rh_reg_write(CR_SETUP, regval); @@ -404,8 +389,8 @@ static int hci_dma_queue_xfer(struct i3c_hci *hci, hci_dma_unmap_xfer(hci, xfer_list, i); return -ENOMEM; } - *ring_data++ = lo32(xfer->data_dma); - *ring_data++ = hi32(xfer->data_dma); + *ring_data++ = lower_32_bits(xfer->data_dma); + *ring_data++ = upper_32_bits(xfer->data_dma); } else { *ring_data++ = 0; *ring_data++ = 0; -- cgit From 2df1de813a5ccbc89c7c831365e9e2c479aea0c3 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 28 Jun 2024 16:15:57 +0300 Subject: i3c: mipi-i3c-hci: Set IBI Status and Data Ring base addresses IBI Status and Data Ring base address registers are not set so HW obviously cannot update those rings after In-Band Interrupt. Set them to already allocated and mapped ring addresses. 
Signed-off-by: Jarkko Nikula Link: https://lore.kernel.org/r/20240628131559.502822-2-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/mipi-i3c-hci/dma.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c index 6c5964e727b3..7a56ae4a5ddf 100644 --- a/drivers/i3c/master/mipi-i3c-hci/dma.c +++ b/drivers/i3c/master/mipi-i3c-hci/dma.c @@ -300,6 +300,11 @@ static int hci_dma_init(struct i3c_hci *hci) goto err_out; } + rh_reg_write(IBI_STATUS_RING_BASE_LO, lower_32_bits(rh->ibi_status_dma)); + rh_reg_write(IBI_STATUS_RING_BASE_HI, upper_32_bits(rh->ibi_status_dma)); + rh_reg_write(IBI_DATA_RING_BASE_LO, lower_32_bits(rh->ibi_data_dma)); + rh_reg_write(IBI_DATA_RING_BASE_HI, upper_32_bits(rh->ibi_data_dma)); + regval = FIELD_PREP(IBI_STATUS_RING_SIZE, rh->ibi_status_entries) | FIELD_PREP(IBI_DATA_CHUNK_SIZE, -- cgit From 8a2be2f1db268ec735419e53ef04ca039fc027dc Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 28 Jun 2024 16:15:58 +0300 Subject: i3c: mipi-i3c-hci: Error out instead on BUG_ON() in IBI DMA setup Definitely condition dma_get_cache_alignment * defined value > 256 during driver initialization is not reason to BUG_ON(). Turn that to graceful error out with -EINVAL. Signed-off-by: Jarkko Nikula Link: https://lore.kernel.org/r/20240628131559.502822-3-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/mipi-i3c-hci/dma.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c index 7a56ae4a5ddf..9bdfe40bc1e1 100644 --- a/drivers/i3c/master/mipi-i3c-hci/dma.c +++ b/drivers/i3c/master/mipi-i3c-hci/dma.c @@ -279,7 +279,10 @@ static int hci_dma_init(struct i3c_hci *hci) rh->ibi_chunk_sz = dma_get_cache_alignment(); rh->ibi_chunk_sz *= IBI_CHUNK_CACHELINES; - BUG_ON(rh->ibi_chunk_sz > 256); + if (rh->ibi_chunk_sz > 256) { + ret = -EINVAL; + goto err_out; + } ibi_status_ring_sz = rh->ibi_status_sz * rh->ibi_status_entries; ibi_data_ring_sz = rh->ibi_chunk_sz * rh->ibi_chunks_total; -- cgit From 4642f7eddb75efb5944ff21e5340ccce66928ff1 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 28 Jun 2024 16:15:59 +0300 Subject: i3c: mipi-i3c-hci: Round IBI data chunk size to HW supported value The dma.c: hci_dma_init() sets the CHUNK_SIZE field in the IBI_SETUP register incorrectly if the calculated ibi_chunk_sz is not exactly 2^(n+2) bytes, where n is 0..6. Fix this by rounding the chunk size up to nearest 2^(n+2) bytes. Signed-off-by: Jarkko Nikula Link: https://lore.kernel.org/r/20240628131559.502822-4-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/mipi-i3c-hci/dma.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c index 9bdfe40bc1e1..a918e96b21fd 100644 --- a/drivers/i3c/master/mipi-i3c-hci/dma.c +++ b/drivers/i3c/master/mipi-i3c-hci/dma.c @@ -279,6 +279,13 @@ static int hci_dma_init(struct i3c_hci *hci) rh->ibi_chunk_sz = dma_get_cache_alignment(); rh->ibi_chunk_sz *= IBI_CHUNK_CACHELINES; + /* + * Round IBI data chunk size to number of bytes supported by + * the HW. Chunk size can be 2^n number of DWORDs which is the + * same as 2^(n+2) bytes, where n is 0..6. 
+ */ + rh->ibi_chunk_sz = umax(4, rh->ibi_chunk_sz); + rh->ibi_chunk_sz = roundup_pow_of_two(rh->ibi_chunk_sz); if (rh->ibi_chunk_sz > 256) { ret = -EINVAL; goto err_out; -- cgit From 48a6dcdafdbe7b9596ac6ecbecc347c2e2413f81 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 2 Jul 2024 10:47:58 +0800 Subject: i3c: master: svc: Convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Chen Ni Link: https://lore.kernel.org/r/20240702024758.1411569-1-nichen@iscas.ac.cn Signed-off-by: Alexandre Belloni --- drivers/i3c/master/svc-i3c-master.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index f0362509319e..eb42280f465a 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -1351,7 +1351,7 @@ static int svc_i3c_master_send_direct_ccc_cmd(struct svc_i3c_master *master, cmd->addr = ccc->dests[0].addr; cmd->rnw = ccc->rnw; cmd->in = ccc->rnw ? ccc->dests[0].payload.data : NULL; - cmd->out = ccc->rnw ? NULL : ccc->dests[0].payload.data, + cmd->out = ccc->rnw ? NULL : ccc->dests[0].payload.data; cmd->len = xfer_len; cmd->actual_len = actual_len; cmd->continued = false; -- cgit From 71134c13ac59fcadbfe39b8f9022a37266c4ed5a Mon Sep 17 00:00:00 2001 From: Aniket Date: Fri, 28 Jun 2024 15:45:26 +0000 Subject: dt-bindings: i3c: dw: Add apb clock binding Add dt binding for optional apb clock. Core clock is mandatory. Signed-off-by: Aniket Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240628154526.324068-1-aniketmaurya@google.com Signed-off-by: Alexandre Belloni --- Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml b/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml index c0e805e531be..4fc13e3c0f75 100644 --- a/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml +++ b/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml @@ -20,7 +20,16 @@ properties: maxItems: 1 clocks: - maxItems: 1 + minItems: 1 + items: + - description: Core clock + - description: APB clock + + clock-names: + minItems: 1 + items: + - const: core + - const: apb interrupts: maxItems: 1 -- cgit From 54f5079e0dfcce2886178382ee9f03f35cc28325 Mon Sep 17 00:00:00 2001 From: Aniket Date: Fri, 28 Jun 2024 15:46:03 +0000 Subject: i3c: dw: Use new *_enabled clk API Move to "enabled" variant of clk_get API. It takes care of enable and disable calls during the probe and remove. 
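A minimal probe sketch of the managed pattern this switches to (foo_probe() is a hypothetical driver, shown only to illustrate devm_clk_get_enabled()):

	#include <linux/clk.h>
	#include <linux/err.h>
	#include <linux/platform_device.h>

	static int foo_probe(struct platform_device *pdev)
	{
		struct clk *clk;

		/*
		 * Gets, prepares and enables the clock and registers a devres
		 * action, so the clock is disabled, unprepared and put
		 * automatically on probe failure or driver removal - no
		 * explicit error path or remove-time cleanup is needed.
		 */
		clk = devm_clk_get_enabled(&pdev->dev, NULL);
		if (IS_ERR(clk))
			return PTR_ERR(clk);

		return 0;
	}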
Signed-off-by: Aniket Link: https://lore.kernel.org/r/20240628154603.326075-1-aniketmaurya@google.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/dw-i3c-master.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index 9aae5c8dba8d..4e3335641dcd 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -1450,7 +1450,7 @@ int dw_i3c_common_probe(struct dw_i3c_master *master, if (IS_ERR(master->regs)) return PTR_ERR(master->regs); - master->core_clk = devm_clk_get(&pdev->dev, NULL); + master->core_clk = devm_clk_get_enabled(&pdev->dev, NULL); if (IS_ERR(master->core_clk)) return PTR_ERR(master->core_clk); @@ -1459,10 +1459,6 @@ int dw_i3c_common_probe(struct dw_i3c_master *master, if (IS_ERR(master->core_rst)) return PTR_ERR(master->core_rst); - ret = clk_prepare_enable(master->core_clk); - if (ret) - goto err_disable_core_clk; - reset_control_deassert(master->core_rst); spin_lock_init(&master->xferqueue.lock); @@ -1501,9 +1497,6 @@ int dw_i3c_common_probe(struct dw_i3c_master *master, err_assert_rst: reset_control_assert(master->core_rst); -err_disable_core_clk: - clk_disable_unprepare(master->core_clk); - return ret; } EXPORT_SYMBOL_GPL(dw_i3c_common_probe); @@ -1513,8 +1506,6 @@ void dw_i3c_common_remove(struct dw_i3c_master *master) i3c_master_unregister(&master->base); reset_control_assert(master->core_rst); - - clk_disable_unprepare(master->core_clk); } EXPORT_SYMBOL_GPL(dw_i3c_common_remove); -- cgit From a0d48ebf39cee74f3832d42126b6ddf4497fc07c Mon Sep 17 00:00:00 2001 From: Aniket Date: Fri, 28 Jun 2024 15:46:18 +0000 Subject: i3c: dw: Add optional apb clock Besides the core clock, IP also has an apb interface clock. Add an optional hook for the same. Signed-off-by: Aniket Link: https://lore.kernel.org/r/20240628154618.327151-1-aniketmaurya@google.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/dw-i3c-master.c | 4 ++++ drivers/i3c/master/dw-i3c-master.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index 4e3335641dcd..0ca41782f3a6 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -1454,6 +1454,10 @@ int dw_i3c_common_probe(struct dw_i3c_master *master, if (IS_ERR(master->core_clk)) return PTR_ERR(master->core_clk); + master->pclk = devm_clk_get_optional_enabled(&pdev->dev, "pclk"); + if (IS_ERR(master->pclk)) + return PTR_ERR(master->pclk); + master->core_rst = devm_reset_control_get_optional_exclusive(&pdev->dev, "core_rst"); if (IS_ERR(master->core_rst)) diff --git a/drivers/i3c/master/dw-i3c-master.h b/drivers/i3c/master/dw-i3c-master.h index 7e76ca381d9f..fb7121c6c687 100644 --- a/drivers/i3c/master/dw-i3c-master.h +++ b/drivers/i3c/master/dw-i3c-master.h @@ -36,6 +36,7 @@ struct dw_i3c_master { void __iomem *regs; struct reset_control *core_rst; struct clk *core_clk; + struct clk *pclk; char version[5]; char type[5]; u32 sir_rej_mask; -- cgit From 915d0741e0c8e5fe406f8a8369745a73e2c681ef Mon Sep 17 00:00:00 2001 From: Frank Li Date: Tue, 2 Jul 2024 18:31:07 -0400 Subject: i3c: master: svc: Improve DAA STOP handle code logic The REQUEST_PROC_DAA command behaves differently from other commands. Sometimes the hardware can auto emit STOP, but in other conditions, it cannot. Improves the code logic to better handle these situations. 
Hardware can auto emit STOP only when the following conditions are met: - The previous I3C device correctly returns a PID and ACKs an I3C address. - A NACK is received when emitting 7E to try to get the next I3C device's PID. In all other cases, a manual STOP emission is needed. The code is changed to emit STOP when break the while loop and 'return 0' only when the hardware can auto emit STOP. Signed-off-by: Frank Li Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20240702223107.403057-1-Frank.Li@nxp.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/svc-i3c-master.c | 61 ++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index eb42280f465a..e80c002991f7 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -790,7 +790,20 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master, int ret, i; while (true) { - /* Enter/proceed with DAA */ + /* SVC_I3C_MCTRL_REQUEST_PROC_DAA have two mode, ENTER DAA or PROCESS DAA. + * + * ENTER DAA: + * 1 will issue START, 7E, ENTDAA, and then emits 7E/R to process first target. + * 2 Stops just before the new Dynamic Address (DA) is to be emitted. + * + * PROCESS DAA: + * 1 The DA is written using MWDATAB or ADDR bits 6:0. + * 2 ProcessDAA is requested again to write the new address, and then starts the + * next (START, 7E, ENTDAA) unless marked to STOP; an MSTATUS indicating NACK + * means DA was not accepted (e.g. parity error). If PROCESSDAA is NACKed on the + * 7E/R, which means no more Slaves need a DA, then a COMPLETE will be signaled + * (along with DONE), and a STOP issued automatically. + */ writel(SVC_I3C_MCTRL_REQUEST_PROC_DAA | SVC_I3C_MCTRL_TYPE_I3C | SVC_I3C_MCTRL_IBIRESP_NACK | @@ -807,7 +820,7 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master, SVC_I3C_MSTATUS_MCTRLDONE(reg), 1, 1000); if (ret) - return ret; + break; if (SVC_I3C_MSTATUS_RXPEND(reg)) { u8 data[6]; @@ -819,7 +832,7 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master, */ ret = svc_i3c_master_readb(master, data, 6); if (ret) - return ret; + break; for (i = 0; i < 6; i++) prov_id[dev_nb] |= (u64)(data[i]) << (8 * (5 - i)); @@ -827,7 +840,7 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master, /* We do not care about the BCR and DCR yet */ ret = svc_i3c_master_readb(master, data, 2); if (ret) - return ret; + break; } else if (SVC_I3C_MSTATUS_MCTRLDONE(reg)) { if (SVC_I3C_MSTATUS_STATE_IDLE(reg) && SVC_I3C_MSTATUS_COMPLETE(reg)) { @@ -835,12 +848,23 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master, * All devices received and acked they dynamic * address, this is the natural end of the DAA * procedure. + * + * Hardware will auto emit STOP at this case. */ - break; + *count = dev_nb; + return 0; + } else if (SVC_I3C_MSTATUS_NACKED(reg)) { /* No I3C devices attached */ - if (dev_nb == 0) + if (dev_nb == 0) { + /* + * Hardware can't treat first NACK for ENTAA as normal + * COMPLETE. So need manual emit STOP. + */ + ret = 0; + *count = 0; break; + } /* * A slave device nacked the address, this is @@ -849,8 +873,10 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master, * answer again immediately and shall ack the * address this time. 
*/ - if (prov_id[dev_nb] == nacking_prov_id) - return -EIO; + if (prov_id[dev_nb] == nacking_prov_id) { + ret = EIO; + break; + } dev_nb--; nacking_prov_id = prov_id[dev_nb]; @@ -858,7 +884,7 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master, continue; } else { - return -EIO; + break; } } @@ -870,12 +896,12 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master, SVC_I3C_MSTATUS_BETWEEN(reg), 0, 1000); if (ret) - return ret; + break; /* Give the slave device a suitable dynamic address */ ret = i3c_master_get_free_addr(&master->base, last_addr + 1); if (ret < 0) - return ret; + break; addrs[dev_nb] = ret; dev_dbg(master->dev, "DAA: device %d assigned to 0x%02x\n", @@ -885,9 +911,9 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master, last_addr = addrs[dev_nb++]; } - *count = dev_nb; - - return 0; + /* Need manual issue STOP except for Complete condition */ + svc_i3c_master_emit_stop(master); + return ret; } static int svc_i3c_update_ibirules(struct svc_i3c_master *master) @@ -961,11 +987,10 @@ static int svc_i3c_master_do_daa(struct i3c_master_controller *m) spin_lock_irqsave(&master->xferqueue.lock, flags); ret = svc_i3c_master_do_daa_locked(master, addrs, &dev_nb); spin_unlock_irqrestore(&master->xferqueue.lock, flags); - if (ret) { - svc_i3c_master_emit_stop(master); - svc_i3c_master_clear_merrwarn(master); + + svc_i3c_master_clear_merrwarn(master); + if (ret) goto rpm_out; - } /* Register all devices who participated to the core */ for (i = 0; i < dev_nb; i++) { -- cgit From 4e89bc48e66a9aef46a705943d3f81bffb3a2288 Mon Sep 17 00:00:00 2001 From: Aniket Date: Mon, 8 Jul 2024 06:21:01 +0000 Subject: i3c: dw: Save timing registers and other values Add variables to store timing registers and other values. These variables would be later used to restore registers during resume without recomputation. Signed-off-by: Aniket Link: https://lore.kernel.org/r/20240708062103.3296587-2-aniketmaurya@google.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/dw-i3c-master.c | 15 +++++++++++++-- drivers/i3c/master/dw-i3c-master.h | 9 +++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index 0ca41782f3a6..fcfa37f55d86 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -543,18 +543,22 @@ static int dw_i3c_clk_cfg(struct dw_i3c_master *master) scl_timing = SCL_I3C_TIMING_HCNT(hcnt) | SCL_I3C_TIMING_LCNT(lcnt); writel(scl_timing, master->regs + SCL_I3C_PP_TIMING); + master->i3c_pp_timing = scl_timing; /* * In pure i3c mode, MST_FREE represents tCAS. In shared mode, this * will be set up by dw_i2c_clk_cfg as tLOW. 
*/ - if (master->base.bus.mode == I3C_BUS_MODE_PURE) + if (master->base.bus.mode == I3C_BUS_MODE_PURE) { writel(BUS_I3C_MST_FREE(lcnt), master->regs + BUS_FREE_TIMING); + master->bus_free_timing = BUS_I3C_MST_FREE(lcnt); + } lcnt = max_t(u8, DIV_ROUND_UP(I3C_BUS_TLOW_OD_MIN_NS, core_period), lcnt); scl_timing = SCL_I3C_TIMING_HCNT(hcnt) | SCL_I3C_TIMING_LCNT(lcnt); writel(scl_timing, master->regs + SCL_I3C_OD_TIMING); + master->i3c_od_timing = scl_timing; lcnt = DIV_ROUND_UP(core_rate, I3C_BUS_SDR1_SCL_RATE) - hcnt; scl_timing = SCL_EXT_LCNT_1(lcnt); @@ -565,6 +569,7 @@ static int dw_i3c_clk_cfg(struct dw_i3c_master *master) lcnt = DIV_ROUND_UP(core_rate, I3C_BUS_SDR4_SCL_RATE) - hcnt; scl_timing |= SCL_EXT_LCNT_4(lcnt); writel(scl_timing, master->regs + SCL_EXT_LCNT_TIMING); + master->ext_lcnt_timing = scl_timing; return 0; } @@ -586,16 +591,21 @@ static int dw_i2c_clk_cfg(struct dw_i3c_master *master) scl_timing = SCL_I2C_FMP_TIMING_HCNT(hcnt) | SCL_I2C_FMP_TIMING_LCNT(lcnt); writel(scl_timing, master->regs + SCL_I2C_FMP_TIMING); + master->i2c_fmp_timing = scl_timing; lcnt = DIV_ROUND_UP(I3C_BUS_I2C_FM_TLOW_MIN_NS, core_period); hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_SCL_RATE) - lcnt; scl_timing = SCL_I2C_FM_TIMING_HCNT(hcnt) | SCL_I2C_FM_TIMING_LCNT(lcnt); writel(scl_timing, master->regs + SCL_I2C_FM_TIMING); + master->i2c_fm_timing = scl_timing; writel(BUS_I3C_MST_FREE(lcnt), master->regs + BUS_FREE_TIMING); + master->bus_free_timing = BUS_I3C_MST_FREE(lcnt); + writel(readl(master->regs + DEVICE_CTRL) | DEV_CTRL_I2C_SLAVE_PRESENT, master->regs + DEVICE_CTRL); + master->i2c_slv_prsnt = true; return 0; } @@ -650,7 +660,7 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m) writel(DEV_ADDR_DYNAMIC_ADDR_VALID | DEV_ADDR_DYNAMIC(ret), master->regs + DEVICE_ADDR); - + master->dev_addr = ret; memset(&info, 0, sizeof(info)); info.dyn_addr = ret; @@ -1077,6 +1087,7 @@ static int dw_i3c_master_attach_i2c_dev(struct i2c_dev_desc *dev) data->index = pos; master->devs[pos].addr = dev->addr; + master->devs[pos].is_i2c_addr = true; master->free_pos &= ~BIT(pos); i2c_dev_set_master_data(dev, data); diff --git a/drivers/i3c/master/dw-i3c-master.h b/drivers/i3c/master/dw-i3c-master.h index fb7121c6c687..f23e9d5aca86 100644 --- a/drivers/i3c/master/dw-i3c-master.h +++ b/drivers/i3c/master/dw-i3c-master.h @@ -19,6 +19,7 @@ struct dw_i3c_master_caps { struct dw_i3c_dat_entry { u8 addr; + bool is_i2c_addr; struct i3c_dev_desc *ibi_dev; }; @@ -40,6 +41,14 @@ struct dw_i3c_master { char version[5]; char type[5]; u32 sir_rej_mask; + bool i2c_slv_prsnt; + u32 dev_addr; + u32 i3c_pp_timing; + u32 i3c_od_timing; + u32 ext_lcnt_timing; + u32 bus_free_timing; + u32 i2c_fm_timing; + u32 i2c_fmp_timing; /* * Per-device hardware data, used to manage the device address table * (DAT) -- cgit From effd21743c39a6535f204812006054fa172e4313 Mon Sep 17 00:00:00 2001 From: Aniket Date: Mon, 8 Jul 2024 06:21:02 +0000 Subject: i3c: dw: Add some functions for reusability Separate logic for setting intr/thld registers in a func. Also modify enable function to take care of setting all fields in DEVICE_CTRL. These functions can be reused later for power management. 
Signed-off-by: Aniket Link: https://lore.kernel.org/r/20240708062103.3296587-3-aniketmaurya@google.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/dw-i3c-master.c | 62 +++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index fcfa37f55d86..fadb58ac8b68 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -300,7 +300,14 @@ static void dw_i3c_master_disable(struct dw_i3c_master *master) static void dw_i3c_master_enable(struct dw_i3c_master *master) { - writel(readl(master->regs + DEVICE_CTRL) | DEV_CTRL_ENABLE, + u32 dev_ctrl; + + dev_ctrl = readl(master->regs + DEVICE_CTRL); + /* For now don't support Hot-Join */ + dev_ctrl |= DEV_CTRL_HOT_JOIN_NACK; + if (master->i2c_slv_prsnt) + dev_ctrl |= DEV_CTRL_I2C_SLAVE_PRESENT; + writel(dev_ctrl | DEV_CTRL_ENABLE, master->regs + DEVICE_CTRL); } @@ -521,6 +528,32 @@ static void dw_i3c_master_end_xfer_locked(struct dw_i3c_master *master, u32 isr) dw_i3c_master_start_xfer_locked(master); } +static void dw_i3c_master_set_intr_regs(struct dw_i3c_master *master) +{ + u32 thld_ctrl; + + thld_ctrl = readl(master->regs + QUEUE_THLD_CTRL); + thld_ctrl &= ~(QUEUE_THLD_CTRL_RESP_BUF_MASK | + QUEUE_THLD_CTRL_IBI_STAT_MASK | + QUEUE_THLD_CTRL_IBI_DATA_MASK); + thld_ctrl |= QUEUE_THLD_CTRL_IBI_STAT(1) | + QUEUE_THLD_CTRL_IBI_DATA(31); + writel(thld_ctrl, master->regs + QUEUE_THLD_CTRL); + + thld_ctrl = readl(master->regs + DATA_BUFFER_THLD_CTRL); + thld_ctrl &= ~DATA_BUFFER_THLD_CTRL_RX_BUF; + writel(thld_ctrl, master->regs + DATA_BUFFER_THLD_CTRL); + + writel(INTR_ALL, master->regs + INTR_STATUS); + writel(INTR_MASTER_MASK, master->regs + INTR_STATUS_EN); + writel(INTR_MASTER_MASK, master->regs + INTR_SIGNAL_EN); + + master->sir_rej_mask = IBI_REQ_REJECT_ALL; + writel(master->sir_rej_mask, master->regs + IBI_SIR_REQ_REJECT); + + writel(IBI_REQ_REJECT_ALL, master->regs + IBI_MR_REQ_REJECT); +} + static int dw_i3c_clk_cfg(struct dw_i3c_master *master) { unsigned long core_rate, core_period; @@ -615,7 +648,6 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m) struct dw_i3c_master *master = to_dw_i3c_master(m); struct i3c_bus *bus = i3c_master_get_bus(m); struct i3c_device_info info = { }; - u32 thld_ctrl; int ret; ret = master->platform_ops->init(master); @@ -638,22 +670,6 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m) return -EINVAL; } - thld_ctrl = readl(master->regs + QUEUE_THLD_CTRL); - thld_ctrl &= ~(QUEUE_THLD_CTRL_RESP_BUF_MASK | - QUEUE_THLD_CTRL_IBI_STAT_MASK | - QUEUE_THLD_CTRL_IBI_DATA_MASK); - thld_ctrl |= QUEUE_THLD_CTRL_IBI_STAT(1) | - QUEUE_THLD_CTRL_IBI_DATA(31); - writel(thld_ctrl, master->regs + QUEUE_THLD_CTRL); - - thld_ctrl = readl(master->regs + DATA_BUFFER_THLD_CTRL); - thld_ctrl &= ~DATA_BUFFER_THLD_CTRL_RX_BUF; - writel(thld_ctrl, master->regs + DATA_BUFFER_THLD_CTRL); - - writel(INTR_ALL, master->regs + INTR_STATUS); - writel(INTR_MASTER_MASK, master->regs + INTR_STATUS_EN); - writel(INTR_MASTER_MASK, master->regs + INTR_SIGNAL_EN); - ret = i3c_master_get_free_addr(m, 0); if (ret < 0) return ret; @@ -668,15 +684,7 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m) if (ret) return ret; - master->sir_rej_mask = IBI_REQ_REJECT_ALL; - writel(master->sir_rej_mask, master->regs + IBI_SIR_REQ_REJECT); - - writel(IBI_REQ_REJECT_ALL, master->regs + IBI_MR_REQ_REJECT); - - /* For now don't support Hot-Join */ - 
writel(readl(master->regs + DEVICE_CTRL) | DEV_CTRL_HOT_JOIN_NACK, - master->regs + DEVICE_CTRL); - + dw_i3c_master_set_intr_regs(master); dw_i3c_master_enable(master); return 0; -- cgit From 62fe9d06f5709caa1fa68055873ff279e05ade37 Mon Sep 17 00:00:00 2001 From: Aniket Date: Mon, 8 Jul 2024 06:21:03 +0000 Subject: i3c: dw: Add power management support Add support for runtime and system power management. Handle clocks, resets, pads as part of suspend and resume. Restore controller registers that could be lost due to suspend. Finally add get and put calls appropriately in functions which access controller : bus_init, do_daa, send_ccc_cmd, priv_xfers, i2c_xfers, ibi and hot-join. Signed-off-by: Aniket Link: https://lore.kernel.org/r/20240708062103.3296587-4-aniketmaurya@google.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master/dw-i3c-master.c | 225 ++++++++++++++++++++++++++++++++++--- drivers/i3c/master/dw-i3c-master.h | 1 + 2 files changed, 209 insertions(+), 17 deletions(-) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index fadb58ac8b68..8d694672c110 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -17,7 +17,9 @@ #include #include #include +#include #include +#include #include #include @@ -217,7 +219,7 @@ #define I3C_BUS_THIGH_MAX_NS 41 #define XFER_TIMEOUT (msecs_to_jiffies(1000)) - +#define RPM_AUTOSUSPEND_TIMEOUT 1000 /* ms */ struct dw_i3c_cmd { u32 cmd_lo; u32 cmd_hi; @@ -650,29 +652,38 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m) struct i3c_device_info info = { }; int ret; + ret = pm_runtime_resume_and_get(master->dev); + if (ret < 0) { + dev_err(master->dev, + "<%s> cannot resume i3c bus master, err: %d\n", + __func__, ret); + return ret; + } + ret = master->platform_ops->init(master); if (ret) - return ret; + goto rpm_out; switch (bus->mode) { case I3C_BUS_MODE_MIXED_FAST: case I3C_BUS_MODE_MIXED_LIMITED: ret = dw_i2c_clk_cfg(master); if (ret) - return ret; + goto rpm_out; fallthrough; case I3C_BUS_MODE_PURE: ret = dw_i3c_clk_cfg(master); if (ret) - return ret; + goto rpm_out; break; default: - return -EINVAL; + ret = -EINVAL; + goto rpm_out; } ret = i3c_master_get_free_addr(m, 0); if (ret < 0) - return ret; + goto rpm_out; writel(DEV_ADDR_DYNAMIC_ADDR_VALID | DEV_ADDR_DYNAMIC(ret), master->regs + DEVICE_ADDR); @@ -682,12 +693,15 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m) ret = i3c_master_set_info(&master->base, &info); if (ret) - return ret; + goto rpm_out; dw_i3c_master_set_intr_regs(master); dw_i3c_master_enable(master); - return 0; +rpm_out: + pm_runtime_mark_last_busy(master->dev); + pm_runtime_put_autosuspend(master->dev); + return ret; } static void dw_i3c_master_bus_cleanup(struct i3c_master_controller *m) @@ -789,11 +803,21 @@ static int dw_i3c_master_send_ccc_cmd(struct i3c_master_controller *m, if (ccc->id == I3C_CCC_ENTDAA) return -EINVAL; + ret = pm_runtime_resume_and_get(master->dev); + if (ret < 0) { + dev_err(master->dev, + "<%s> cannot resume i3c bus master, err: %d\n", + __func__, ret); + return ret; + } + if (ccc->rnw) ret = dw_i3c_ccc_get(master, ccc); else ret = dw_i3c_ccc_set(master, ccc); + pm_runtime_mark_last_busy(master->dev); + pm_runtime_put_autosuspend(master->dev); return ret; } @@ -806,6 +830,14 @@ static int dw_i3c_master_daa(struct i3c_master_controller *m) u8 p, last_addr = 0; int ret, pos; + ret = pm_runtime_resume_and_get(master->dev); + if (ret < 0) { + dev_err(master->dev, + "<%s> cannot resume i3c bus 
master, err: %d\n", + __func__, ret); + return ret; + } + olddevs = ~(master->free_pos); /* Prepare DAT before launching DAA. */ @@ -814,8 +846,10 @@ static int dw_i3c_master_daa(struct i3c_master_controller *m) continue; ret = i3c_master_get_free_addr(m, last_addr + 1); - if (ret < 0) - return -ENOSPC; + if (ret < 0) { + ret = -ENOSPC; + goto rpm_out; + } master->devs[pos].addr = ret; p = even_parity(ret); @@ -825,16 +859,21 @@ static int dw_i3c_master_daa(struct i3c_master_controller *m) writel(DEV_ADDR_TABLE_DYNAMIC_ADDR(ret), master->regs + DEV_ADDR_TABLE_LOC(master->datstartaddr, pos)); + + ret = 0; } xfer = dw_i3c_master_alloc_xfer(master, 1); - if (!xfer) - return -ENOMEM; + if (!xfer) { + ret = -ENOMEM; + goto rpm_out; + } pos = dw_i3c_master_get_free_pos(master); if (pos < 0) { dw_i3c_master_free_xfer(xfer); - return pos; + ret = pos; + goto rpm_out; } cmd = &xfer->cmds[0]; cmd->cmd_hi = 0x1; @@ -859,7 +898,10 @@ static int dw_i3c_master_daa(struct i3c_master_controller *m) dw_i3c_master_free_xfer(xfer); - return 0; +rpm_out: + pm_runtime_mark_last_busy(master->dev); + pm_runtime_put_autosuspend(master->dev); + return ret; } static int dw_i3c_master_priv_xfers(struct i3c_dev_desc *dev, @@ -894,6 +936,14 @@ static int dw_i3c_master_priv_xfers(struct i3c_dev_desc *dev, if (!xfer) return -ENOMEM; + ret = pm_runtime_resume_and_get(master->dev); + if (ret < 0) { + dev_err(master->dev, + "<%s> cannot resume i3c bus master, err: %d\n", + __func__, ret); + return ret; + } + for (i = 0; i < i3c_nxfers; i++) { struct dw_i3c_cmd *cmd = &xfer->cmds[i]; @@ -935,6 +985,8 @@ static int dw_i3c_master_priv_xfers(struct i3c_dev_desc *dev, ret = xfer->ret; dw_i3c_master_free_xfer(xfer); + pm_runtime_mark_last_busy(master->dev); + pm_runtime_put_autosuspend(master->dev); return ret; } @@ -1045,6 +1097,14 @@ static int dw_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, if (!xfer) return -ENOMEM; + ret = pm_runtime_resume_and_get(master->dev); + if (ret < 0) { + dev_err(master->dev, + "<%s> cannot resume i3c bus master, err: %d\n", + __func__, ret); + return ret; + } + for (i = 0; i < i2c_nxfers; i++) { struct dw_i3c_cmd *cmd = &xfer->cmds[i]; @@ -1075,6 +1135,8 @@ static int dw_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, ret = xfer->ret; dw_i3c_master_free_xfer(xfer); + pm_runtime_mark_last_busy(master->dev); + pm_runtime_put_autosuspend(master->dev); return ret; } @@ -1217,6 +1279,15 @@ static void dw_i3c_master_set_sir_enabled(struct dw_i3c_master *master, static int dw_i3c_master_enable_hotjoin(struct i3c_master_controller *m) { struct dw_i3c_master *master = to_dw_i3c_master(m); + int ret; + + ret = pm_runtime_resume_and_get(master->dev); + if (ret < 0) { + dev_err(master->dev, + "<%s> cannot resume i3c bus master, err: %d\n", + __func__, ret); + return ret; + } dw_i3c_master_enable_sir_signal(master, true); writel(readl(master->regs + DEVICE_CTRL) & ~DEV_CTRL_HOT_JOIN_NACK, @@ -1232,6 +1303,8 @@ static int dw_i3c_master_disable_hotjoin(struct i3c_master_controller *m) writel(readl(master->regs + DEVICE_CTRL) | DEV_CTRL_HOT_JOIN_NACK, master->regs + DEVICE_CTRL); + pm_runtime_mark_last_busy(master->dev); + pm_runtime_put_autosuspend(master->dev); return 0; } @@ -1242,12 +1315,23 @@ static int dw_i3c_master_enable_ibi(struct i3c_dev_desc *dev) struct dw_i3c_master *master = to_dw_i3c_master(m); int rc; + rc = pm_runtime_resume_and_get(master->dev); + if (rc < 0) { + dev_err(master->dev, + "<%s> cannot resume i3c bus master, err: %d\n", + __func__, rc); + return rc; + } + 
dw_i3c_master_set_sir_enabled(master, dev, data->index, true); rc = i3c_master_enec_locked(m, dev->info.dyn_addr, I3C_CCC_EVENT_SIR); - if (rc) + if (rc) { dw_i3c_master_set_sir_enabled(master, dev, data->index, false); + pm_runtime_mark_last_busy(master->dev); + pm_runtime_put_autosuspend(master->dev); + } return rc; } @@ -1265,6 +1349,8 @@ static int dw_i3c_master_disable_ibi(struct i3c_dev_desc *dev) dw_i3c_master_set_sir_enabled(master, dev, data->index, false); + pm_runtime_mark_last_busy(master->dev); + pm_runtime_put_autosuspend(master->dev); return 0; } @@ -1465,6 +1551,8 @@ int dw_i3c_common_probe(struct dw_i3c_master *master, if (!master->platform_ops) master->platform_ops = &dw_i3c_platform_ops_default; + master->dev = &pdev->dev; + master->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(master->regs)) return PTR_ERR(master->regs); @@ -1497,6 +1585,11 @@ int dw_i3c_common_probe(struct dw_i3c_master *master, platform_set_drvdata(pdev, master); + pm_runtime_set_autosuspend_delay(&pdev->dev, RPM_AUTOSUSPEND_TIMEOUT); + pm_runtime_use_autosuspend(&pdev->dev); + pm_runtime_set_active(&pdev->dev); + pm_runtime_enable(&pdev->dev); + /* Information regarding the FIFOs/QUEUEs depth */ ret = readl(master->regs + QUEUE_STATUS_LEVEL); master->caps.cmdfifodepth = QUEUE_STATUS_LEVEL_CMD(ret); @@ -1513,10 +1606,15 @@ int dw_i3c_common_probe(struct dw_i3c_master *master, ret = i3c_master_register(&master->base, &pdev->dev, &dw_mipi_i3c_ops, false); if (ret) - goto err_assert_rst; + goto err_disable_pm; return 0; +err_disable_pm: + pm_runtime_disable(&pdev->dev); + pm_runtime_set_suspended(&pdev->dev); + pm_runtime_dont_use_autosuspend(&pdev->dev); + err_assert_rst: reset_control_assert(master->core_rst); @@ -1528,7 +1626,9 @@ void dw_i3c_common_remove(struct dw_i3c_master *master) { i3c_master_unregister(&master->base); - reset_control_assert(master->core_rst); + pm_runtime_disable(master->dev); + pm_runtime_set_suspended(master->dev); + pm_runtime_dont_use_autosuspend(master->dev); } EXPORT_SYMBOL_GPL(dw_i3c_common_remove); @@ -1552,6 +1652,96 @@ static void dw_i3c_remove(struct platform_device *pdev) dw_i3c_common_remove(master); } +static void dw_i3c_master_restore_addrs(struct dw_i3c_master *master) +{ + u32 pos, reg_val; + + writel(DEV_ADDR_DYNAMIC_ADDR_VALID | DEV_ADDR_DYNAMIC(master->dev_addr), + master->regs + DEVICE_ADDR); + + for (pos = 0; pos < master->maxdevs; pos++) { + if (master->free_pos & BIT(pos)) + continue; + + if (master->devs[pos].is_i2c_addr) + reg_val = DEV_ADDR_TABLE_LEGACY_I2C_DEV | + DEV_ADDR_TABLE_STATIC_ADDR(master->devs[pos].addr); + else + reg_val = DEV_ADDR_TABLE_DYNAMIC_ADDR(master->devs[pos].addr); + + writel(reg_val, master->regs + DEV_ADDR_TABLE_LOC(master->datstartaddr, pos)); + } +} + +static void dw_i3c_master_restore_timing_regs(struct dw_i3c_master *master) +{ + writel(master->i3c_pp_timing, master->regs + SCL_I3C_PP_TIMING); + writel(master->bus_free_timing, master->regs + BUS_FREE_TIMING); + writel(master->i3c_od_timing, master->regs + SCL_I3C_OD_TIMING); + writel(master->ext_lcnt_timing, master->regs + SCL_EXT_LCNT_TIMING); + + if (master->i2c_slv_prsnt) { + writel(master->i2c_fmp_timing, master->regs + SCL_I2C_FMP_TIMING); + writel(master->i2c_fm_timing, master->regs + SCL_I2C_FM_TIMING); + } +} + +static int dw_i3c_master_enable_clks(struct dw_i3c_master *master) +{ + int ret = 0; + + ret = clk_prepare_enable(master->core_clk); + if (ret) + return ret; + + ret = clk_prepare_enable(master->pclk); + if (ret) { + 
clk_disable_unprepare(master->core_clk); + return ret; + } + + return 0; +} + +static inline void dw_i3c_master_disable_clks(struct dw_i3c_master *master) +{ + clk_disable_unprepare(master->pclk); + clk_disable_unprepare(master->core_clk); +} + +static int __maybe_unused dw_i3c_master_runtime_suspend(struct device *dev) +{ + struct dw_i3c_master *master = dev_get_drvdata(dev); + + dw_i3c_master_disable(master); + + reset_control_assert(master->core_rst); + dw_i3c_master_disable_clks(master); + pinctrl_pm_select_sleep_state(dev); + return 0; +} + +static int __maybe_unused dw_i3c_master_runtime_resume(struct device *dev) +{ + struct dw_i3c_master *master = dev_get_drvdata(dev); + + pinctrl_pm_select_default_state(dev); + dw_i3c_master_enable_clks(master); + reset_control_deassert(master->core_rst); + + dw_i3c_master_set_intr_regs(master); + dw_i3c_master_restore_timing_regs(master); + dw_i3c_master_restore_addrs(master); + + dw_i3c_master_enable(master); + return 0; +} + +static const struct dev_pm_ops dw_i3c_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + SET_RUNTIME_PM_OPS(dw_i3c_master_runtime_suspend, dw_i3c_master_runtime_resume, NULL) +}; + static const struct of_device_id dw_i3c_master_of_match[] = { { .compatible = "snps,dw-i3c-master-1.00a", }, {}, @@ -1564,6 +1754,7 @@ static struct platform_driver dw_i3c_driver = { .driver = { .name = "dw-i3c-master", .of_match_table = dw_i3c_master_of_match, + .pm = &dw_i3c_pm_ops, }, }; module_platform_driver(dw_i3c_driver); diff --git a/drivers/i3c/master/dw-i3c-master.h b/drivers/i3c/master/dw-i3c-master.h index f23e9d5aca86..219ff815d3a7 100644 --- a/drivers/i3c/master/dw-i3c-master.h +++ b/drivers/i3c/master/dw-i3c-master.h @@ -25,6 +25,7 @@ struct dw_i3c_dat_entry { struct dw_i3c_master { struct i3c_master_controller base; + struct device *dev; u16 maxdevs; u16 datstartaddr; u32 free_pos; -- cgit From 63c33ca0969cf4d4574103b69fb46f58a19a182b Mon Sep 17 00:00:00 2001 From: Bhoomik Gupta Date: Mon, 8 Jul 2024 11:08:35 +0530 Subject: i3c: master: Enhance i3c_bus_type visibility for device searching & event monitoring Improve the visibility of i3c_bus_type to facilitate searching for i3c devices attached to the i3c bus. Enable other drivers to use bus_register_notifier to monitor i3c bus device events. 
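A minimal sketch of what the export enables in another module (a hypothetical watcher, shown only for illustration):

	#include <linux/device.h>
	#include <linux/i3c/master.h>	/* i3c_bus_type is now declared here */
	#include <linux/module.h>
	#include <linux/notifier.h>

	static int i3c_bus_event(struct notifier_block *nb, unsigned long action,
				 void *data)
	{
		struct device *dev = data;

		if (action == BUS_NOTIFY_ADD_DEVICE)
			dev_info(dev, "added to the i3c bus\n");
		else if (action == BUS_NOTIFY_DEL_DEVICE)
			dev_info(dev, "removed from the i3c bus\n");
		return NOTIFY_OK;
	}

	static struct notifier_block i3c_nb = { .notifier_call = i3c_bus_event };

	static int __init i3c_watch_init(void)
	{
		/* Only possible now that i3c_bus_type is exported. */
		return bus_register_notifier(&i3c_bus_type, &i3c_nb);
	}
	module_init(i3c_watch_init);

	static void __exit i3c_watch_exit(void)
	{
		bus_unregister_notifier(&i3c_bus_type, &i3c_nb);
	}
	module_exit(i3c_watch_exit);

	MODULE_LICENSE("GPL");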
Signed-off-by: Bhoomik Gupta Link: https://lore.kernel.org/r/20240708053835.3003986-1-bhoomik.gupta@nxp.com Signed-off-by: Alexandre Belloni --- drivers/i3c/internals.h | 2 -- drivers/i3c/master.c | 1 + include/linux/i3c/master.h | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i3c/internals.h b/drivers/i3c/internals.h index 4d99a3524171..433f6088b7ce 100644 --- a/drivers/i3c/internals.h +++ b/drivers/i3c/internals.h @@ -10,8 +10,6 @@ #include -extern const struct bus_type i3c_bus_type; - void i3c_bus_normaluse_lock(struct i3c_bus *bus); void i3c_bus_normaluse_unlock(struct i3c_bus *bus); diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 3b4d6a8edca3..a211dc4d25bb 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -342,6 +342,7 @@ const struct bus_type i3c_bus_type = { .probe = i3c_device_probe, .remove = i3c_device_remove, }; +EXPORT_SYMBOL_GPL(i3c_bus_type); static enum i3c_addr_slot_status i3c_bus_get_addr_slot_status(struct i3c_bus *bus, u16 addr) diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 0ca27dd86956..074f632868d9 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -33,6 +33,7 @@ enum { struct i3c_master_controller; struct i3c_bus; struct i3c_device; +extern const struct bus_type i3c_bus_type; /** * struct i3c_i2c_dev_desc - Common part of the I3C/I2C device descriptor -- cgit From b73c983491e0a61331611d2dab1dfccd4464b060 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 18 Jul 2024 13:30:22 -0500 Subject: i3c: master: svc: Fix error code in svc_i3c_master_do_daa_locked() This code has a typo so it returns positive EIO instead of negative -EIO. Fix it! Fixes: a7809cb368b9 ("i3c: master: svc: Improve DAA STOP handle code logic") Signed-off-by: Dan Carpenter Reviewed-by: Frank Li Link: https://lore.kernel.org/r/e017edfc-da64-496b-8516-958bec27cd9a@stanley.mountain Signed-off-by: Alexandre Belloni --- drivers/i3c/master/svc-i3c-master.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index e80c002991f7..0a68fd1b81d4 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -874,7 +874,7 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master, * address this time. */ if (prov_id[dev_nb] == nacking_prov_id) { - ret = EIO; + ret = -EIO; break; } -- cgit From 24168c5e6dfbdd5b414f048f47f75d64533296ca Mon Sep 17 00:00:00 2001 From: Carlos Song Date: Mon, 15 Jul 2024 18:53:51 -0400 Subject: dt-bindings: i3c: add header for generic I3C flags Add header file for generic I3C flags to avoid hard code in dts file. 
Signed-off-by: Carlos Song Reviewed-by: Frank Li Acked-by: Jason Liu Signed-off-by: Frank Li Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240715225351.3237284-1-Frank.Li@nxp.com Signed-off-by: Alexandre Belloni --- Documentation/devicetree/bindings/i3c/i3c.yaml | 5 ++++- MAINTAINERS | 1 + include/dt-bindings/i3c/i3c.h | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 include/dt-bindings/i3c/i3c.h diff --git a/Documentation/devicetree/bindings/i3c/i3c.yaml b/Documentation/devicetree/bindings/i3c/i3c.yaml index 113957ebe9f1..e25fa72fd785 100644 --- a/Documentation/devicetree/bindings/i3c/i3c.yaml +++ b/Documentation/devicetree/bindings/i3c/i3c.yaml @@ -91,6 +91,7 @@ patternProperties: - const: 0 - description: | Shall encode the I3C LVR (Legacy Virtual Register): + See include/dt-bindings/i3c/i3c.h bit[31:8]: unused/ignored bit[7:5]: I2C device index. Possible values: * 0: I2C device has a 50 ns spike filter @@ -153,6 +154,8 @@ additionalProperties: true examples: - | + #include + i3c@d040000 { compatible = "cdns,i3c-master"; clocks = <&coreclock>, <&i3csysclock>; @@ -166,7 +169,7 @@ examples: /* I2C device. */ eeprom@57 { compatible = "atmel,24c01"; - reg = <0x57 0x0 0x10>; + reg = <0x57 0x0 (I2C_FM | I2C_FILTER)>; pagesize = <0x8>; }; diff --git a/MAINTAINERS b/MAINTAINERS index d6c90161c7bf..8b27d7752b9b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10479,6 +10479,7 @@ F: Documentation/ABI/testing/sysfs-bus-i3c F: Documentation/devicetree/bindings/i3c/ F: Documentation/driver-api/i3c F: drivers/i3c/ +F: include/dt-bindings/i3c/ F: include/linux/i3c/ IBM Operation Panel Input Driver diff --git a/include/dt-bindings/i3c/i3c.h b/include/dt-bindings/i3c/i3c.h new file mode 100644 index 000000000000..373439218bba --- /dev/null +++ b/include/dt-bindings/i3c/i3c.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ +/* + * Copyright 2024 NXP + */ + +#ifndef _DT_BINDINGS_I3C_I3C_H +#define _DT_BINDINGS_I3C_I3C_H + +#define I2C_FM (1 << 4) +#define I2C_FM_PLUS (0 << 4) + +#define I2C_FILTER (0 << 5) +#define I2C_NO_FILTER_HIGH_FREQUENCY (1 << 5) +#define I2C_NO_FILTER_LOW_FREQUENCY (2 << 5) + +#endif -- cgit From 5d5fc33ce58e81e8738816f5ee59f8e85fd3b404 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 6 Jun 2024 23:13:35 -0700 Subject: riscv: Improve exception and system call latency Many CPUs implement return address branch prediction as a stack. The RISCV architecture refers to this as a return address stack (RAS). If this gets corrupted then the CPU will mispredict at least one but potentally many function returns. There are two issues with the current RISCV exception code: - We are using the alternate link stack (x5/t0) for the indirect branch which makes the hardware think this is a function return. This will corrupt the RAS. - We modify the return address of handle_exception to point to ret_from_exception. This will also corrupt the RAS. Testing the null system call latency before and after the patch: Visionfive2 (StarFive JH7110 / U74) baseline: 189.87 ns patched: 176.76 ns Lichee pi 4a (T-Head TH1520 / C910) baseline: 666.58 ns patched: 636.90 ns Just over 7% on the U74 and just over 4% on the C910. 
Signed-off-by: Anton Blanchard Signed-off-by: Cyril Bur Tested-by: Jisheng Zhang Reviewed-by: Jisheng Zhang Link: https://lore.kernel.org/r/20240607061335.2197383-1-cyrilbur@tenstorrent.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/entry.S | 17 ++++++++++------- arch/riscv/kernel/stacktrace.c | 4 ++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 68a24cf9481a..c933460ed3e9 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -88,7 +88,6 @@ SYM_CODE_START(handle_exception) call riscv_v_context_nesting_start #endif move a0, sp /* pt_regs */ - la ra, ret_from_exception /* * MSB of cause differentiates between @@ -97,7 +96,8 @@ SYM_CODE_START(handle_exception) bge s4, zero, 1f /* Handle interrupts */ - tail do_irq + call do_irq + j ret_from_exception 1: /* Handle other exceptions */ slli t0, s4, RISCV_LGPTR @@ -105,11 +105,14 @@ SYM_CODE_START(handle_exception) la t2, excp_vect_table_end add t0, t1, t0 /* Check if exception code lies within bounds */ - bgeu t0, t2, 1f - REG_L t0, 0(t0) - jr t0 -1: - tail do_trap_unknown + bgeu t0, t2, 3f + REG_L t1, 0(t0) +2: jalr t1 + j ret_from_exception +3: + + la t1, do_trap_unknown + j 2b SYM_CODE_END(handle_exception) ASM_NOKPROBE(handle_exception) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 10e311b2759d..c6d5de22463f 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -16,7 +16,7 @@ #ifdef CONFIG_FRAME_POINTER -extern asmlinkage void ret_from_exception(void); +extern asmlinkage void handle_exception(void); static inline int fp_is_valid(unsigned long fp, unsigned long sp) { @@ -71,7 +71,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, fp = frame->fp; pc = ftrace_graph_ret_addr(current, &graph_idx, frame->ra, &frame->ra); - if (pc == (unsigned long)ret_from_exception) { + if (pc == (unsigned long)handle_exception) { if (unlikely(!__kernel_text_address(pc) || !fn(arg, pc))) break; -- cgit From 1d20e5d437cfebefb5e6f4d652c3a1561fc23fc7 Mon Sep 17 00:00:00 2001 From: Zhongqiu Han Date: Thu, 20 Jun 2024 11:34:34 +0800 Subject: riscv: signal: Remove unlikely() from WARN_ON() condition "WARN_ON(unlikely(x))" is excessive. WARN_ON() already uses unlikely() internally. Signed-off-by: Zhongqiu Han Reviewed-by: Bjorn Andersson Reviewed-by: Andy Chiu Link: https://lore.kernel.org/r/20240620033434.3778156-1-quic_zhonhan@quicinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 5a2edd7f027e..dcd282419456 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -84,7 +84,7 @@ static long save_v_state(struct pt_regs *regs, void __user **sc_vec) datap = state + 1; /* datap is designed to be 16 byte aligned for better performance */ - WARN_ON(unlikely(!IS_ALIGNED((unsigned long)datap, 16))); + WARN_ON(!IS_ALIGNED((unsigned long)datap, 16)); get_cpu_vector_context(); riscv_v_vstate_save(¤t->thread.vstate, regs); -- cgit From b5db73fb18257cd5d9cb59bc4b779fffa629566a Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 24 Jun 2024 07:53:16 +0800 Subject: riscv: enable HAVE_ARCH_STACKLEAK Add support for the stackleak feature. Whenever the kernel returns to user space the kernel stack is filled with a poison value. 
At the same time, disables the plugin in EFI stub code because EFI stub is out of scope for the protection. Tested on qemu and milkv duo: / # echo STACKLEAK_ERASING > /sys/kernel/debug/provoke-crash/DIRECT [ 38.675575] lkdtm: Performing direct entry STACKLEAK_ERASING [ 38.678448] lkdtm: stackleak stack usage: [ 38.678448] high offset: 288 bytes [ 38.678448] current: 496 bytes [ 38.678448] lowest: 1328 bytes [ 38.678448] tracked: 1328 bytes [ 38.678448] untracked: 448 bytes [ 38.678448] poisoned: 14312 bytes [ 38.678448] low offset: 8 bytes [ 38.689887] lkdtm: OK: the rest of the thread stack is properly erased Signed-off-by: Jisheng Zhang Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240623235316.2010-1-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/thread_info.h | 1 + arch/riscv/kernel/entry.S | 4 ++++ drivers/firmware/efi/libstub/Makefile | 3 ++- 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 34ae39ecb898..19ca402e6f26 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -123,6 +123,7 @@ config RISCV select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_STACKLEAK select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index 5d473343634b..fca5c6be2b81 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -10,6 +10,7 @@ #include #include +#include /* thread information allocation */ #define THREAD_SIZE_ORDER CONFIG_THREAD_SIZE_ORDER diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index c933460ed3e9..ac2e908d4418 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -133,6 +133,10 @@ SYM_CODE_START_NOALIGN(ret_from_exception) #endif bnez s0, 1f +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + call stackleak_erase_on_task_stack +#endif + /* Save unwound kernel stack pointer in thread_info */ addi s0, sp, PT_SIZE_ON_STACK REG_S s0, TASK_TI_KERNEL_SP(tp) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 1f32d6cf98d6..f23ba62ce127 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -28,7 +28,8 @@ cflags-$(CONFIG_ARM) += -DEFI_HAVE_STRLEN -DEFI_HAVE_STRNLEN \ -DEFI_HAVE_MEMCHR -DEFI_HAVE_STRRCHR \ -DEFI_HAVE_STRCMP -fno-builtin -fpic \ $(call cc-option,-mno-single-pic-base) -cflags-$(CONFIG_RISCV) += -fpic -DNO_ALTERNATIVE -mno-relax +cflags-$(CONFIG_RISCV) += -fpic -DNO_ALTERNATIVE -mno-relax \ + $(DISABLE_STACKLEAK_PLUGIN) cflags-$(CONFIG_LOONGARCH) += -fpie cflags-$(CONFIG_EFI_PARAMS_FROM_FDT) += -I$(srctree)/scripts/dtc/libfdt -- cgit From 5c8405d763dc2b125b39166bc70be1b8dcc80582 Mon Sep 17 00:00:00 2001 From: Stuart Menefy Date: Sun, 30 Jun 2024 12:05:49 +0100 Subject: riscv: Extend sv39 linear mapping max size to 128G This harmonizes all virtual addressing modes which can now all map (PGDIR_SIZE * PTRS_PER_PGD) / 4 of physical memory. The RISCV implementation of KASAN requires that the boundary between shallow mappings are aligned on an 8G boundary. In this case we need VMALLOC_START to be 8G aligned. 
So although we only need to move the start of the linear mapping down by 4GiB to allow 128GiB to be mapped, we actually move it down by 8GiB (creating a 4GiB hole between the linear mapping and KASAN shadow space) to maintain the alignment requirement. Signed-off-by: Stuart Menefy Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240630110550.1731929-1-stuart.menefy@codasip.com Signed-off-by: Palmer Dabbelt --- Documentation/arch/riscv/vm-layout.rst | 11 ++++++----- arch/riscv/include/asm/page.h | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Documentation/arch/riscv/vm-layout.rst b/Documentation/arch/riscv/vm-layout.rst index e476b4386bd9..077b968dcc81 100644 --- a/Documentation/arch/riscv/vm-layout.rst +++ b/Documentation/arch/riscv/vm-layout.rst @@ -47,11 +47,12 @@ RISC-V Linux Kernel SV39 | Kernel-space virtual memory, shared between all processes: ____________________________________________________________|___________________________________________________________ | | | | - ffffffc6fea00000 | -228 GB | ffffffc6feffffff | 6 MB | fixmap - ffffffc6ff000000 | -228 GB | ffffffc6ffffffff | 16 MB | PCI io - ffffffc700000000 | -228 GB | ffffffc7ffffffff | 4 GB | vmemmap - ffffffc800000000 | -224 GB | ffffffd7ffffffff | 64 GB | vmalloc/ioremap space - ffffffd800000000 | -160 GB | fffffff6ffffffff | 124 GB | direct mapping of all physical memory + ffffffc4fea00000 | -236 GB | ffffffc4feffffff | 6 MB | fixmap + ffffffc4ff000000 | -236 GB | ffffffc4ffffffff | 16 MB | PCI io + ffffffc500000000 | -236 GB | ffffffc5ffffffff | 4 GB | vmemmap + ffffffc600000000 | -232 GB | ffffffd5ffffffff | 64 GB | vmalloc/ioremap space + ffffffd600000000 | -168 GB | fffffff5ffffffff | 128 GB | direct mapping of all physical memory + | | | | fffffff700000000 | -36 GB | fffffffeffffffff | 32 GB | kasan __________________|____________|__________________|_________|____________________________________________________________ | diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 235fd45d998d..7ede2111c591 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -37,7 +37,7 @@ * define the PAGE_OFFSET value for SV48 and SV39. */ #define PAGE_OFFSET_L4 _AC(0xffffaf8000000000, UL) -#define PAGE_OFFSET_L3 _AC(0xffffffd800000000, UL) +#define PAGE_OFFSET_L3 _AC(0xffffffd600000000, UL) #else #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) #endif /* CONFIG_64BIT */ -- cgit From 52420e483d3e1562f11a208d3c540b27b5e5dbf4 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 2 Jul 2024 11:37:31 +0800 Subject: RISC-V: Provide the frequency of time CSR via hwprobe The RISC-V architecture makes a real time counter CSR (via RDTIME instruction) available for applications in U-mode but there is no architected mechanism for an application to discover the frequency the counter is running at. Some applications (e.g., DPDK) use the time counter for basic performance analysis as well as fine grained time-keeping. Add support to the hwprobe system call to export the time CSR frequency to code running in U-mode. 
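A minimal userspace sketch of querying the new key is shown below. The local struct definition and the syscall-number fallback are assumptions for illustration (normally both come from the uapi headers); only the key value 8 is taken from this patch.

    #include <stdio.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_riscv_hwprobe
    #define __NR_riscv_hwprobe 258                  /* assumed riscv64 value */
    #endif

    struct riscv_hwprobe { int64_t key; uint64_t value; };

    #define RISCV_HWPROBE_KEY_TIME_CSR_FREQ 8       /* from this patch */

    int main(void)
    {
        struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_TIME_CSR_FREQ };

        /* args: pairs, pair_count, cpusetsize, cpus, flags */
        if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) || pair.key < 0)
            return 1;                               /* key unknown on older kernels */

        printf("time CSR frequency: %llu Hz\n", (unsigned long long)pair.value);
        return 0;
    }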
Signed-off-by: Yunhui Cui Reviewed-by: Evan Green Reviewed-by: Anup Patel Acked-by: Punit Agrawal Link: https://lore.kernel.org/r/20240702033731.71955-2-cuiyunhui@bytedance.com Signed-off-by: Palmer Dabbelt --- Documentation/arch/riscv/hwprobe.rst | 2 ++ arch/riscv/include/asm/hwprobe.h | 2 +- arch/riscv/include/uapi/asm/hwprobe.h | 1 + arch/riscv/kernel/sys_hwprobe.c | 5 +++++ 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst index 02eb4d98b7de..3db60a0911df 100644 --- a/Documentation/arch/riscv/hwprobe.rst +++ b/Documentation/arch/riscv/hwprobe.rst @@ -264,3 +264,5 @@ The following keys are defined: * :c:macro:`RISCV_HWPROBE_KEY_HIGHEST_VIRT_ADDRESS`: An unsigned long which represent the highest userspace virtual address usable. + +* :c:macro:`RISCV_HWPROBE_KEY_TIME_CSR_FREQ`: Frequency (in Hz) of `time CSR`. diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index 150a9877b0af..ef01c182af2b 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,7 +8,7 @@ #include -#define RISCV_HWPROBE_MAX_KEY 7 +#define RISCV_HWPROBE_MAX_KEY 8 static inline bool riscv_hwprobe_key_is_valid(__s64 key) { diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index 8b8f6ac0eae2..b706c8e47b02 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -81,6 +81,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0) #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6 #define RISCV_HWPROBE_KEY_HIGHEST_VIRT_ADDRESS 7 +#define RISCV_HWPROBE_KEY_TIME_CSR_FREQ 8 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ /* Flags */ diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 8a1c9ce170e8..8d1b5c35d2a7 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +237,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, pair->value = user_max_virt_addr(); break; + case RISCV_HWPROBE_KEY_TIME_CSR_FREQ: + pair->value = riscv_timebase; + break; + /* * For forward compatibility, unknown keys don't fail the whole * call, but get their element key set to -1 and value set to 0 -- cgit From 225990c487c1023e7b3aa89beb6a68011fbc0461 Mon Sep 17 00:00:00 2001 From: Mark Mentovai Date: Thu, 25 Jul 2024 16:41:44 -0400 Subject: net: phy: realtek: add support for RTL8366S Gigabit PHY The PHY built in to the Realtek RTL8366S switch controller was previously supported by genphy_driver. This PHY does not implement MMD operations. Since commit 9b01c885be36 ("net: phy: c22: migrate to genphy_c45_write_eee_adv()"), MMD register reads have been made during phy_probe to determine EEE support. For genphy_driver, these reads are transformed into 802.3 annex 22D clause 45-over-clause 22 mmd_phy_indirect operations that perform MII register writes to MII_MMD_CTRL and MII_MMD_DATA. This overwrites those two MII registers, which on this PHY are reserved and have another function, rendering the PHY unusable while so configured. Proper support for this PHY is restored by providing a phy_driver that declares MMD operations as unsupported by using the helper functions provided for that purpose, while remaining otherwise identical to genphy_driver. 
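For context, the indirect access that clobbers those registers looks roughly like the sketch below (simplified, kernel context assumed; the exact phylib helper may differ). Declaring the MMD ops as genphy_read_mmd_unsupported/genphy_write_mmd_unsupported makes phylib return an error instead of issuing these writes.

    #include <linux/mdio.h>
    #include <linux/phy.h>

    /* Clause-45-over-clause-22 access goes through MII registers 13
     * (MII_MMD_CTRL) and 14 (MII_MMD_DATA), which the RTL8366S repurposes. */
    static void example_mmd_indirect(struct mii_bus *bus, int addr, int devad, u16 reg)
    {
        __mdiobus_write(bus, addr, MII_MMD_CTRL, devad);      /* select MMD device */
        __mdiobus_write(bus, addr, MII_MMD_DATA, reg);        /* register number */
        __mdiobus_write(bus, addr, MII_MMD_CTRL,
                        devad | MII_MMD_CTRL_NOINCR);         /* data, no post-inc */
    }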
Fixes: 9b01c885be36 ("net: phy: c22: migrate to genphy_c45_write_eee_adv()") Reported-by: Russell Senior Closes: https://github.com/openwrt/openwrt/issues/15981 Link: https://github.com/openwrt/openwrt/issues/15739 Signed-off-by: Mark Mentovai Reviewed-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/phy/realtek.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index bed839237fb5..87865918dab6 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -1465,6 +1465,13 @@ static struct phy_driver realtek_drvs[] = { .handle_interrupt = genphy_handle_interrupt_no_ack, .suspend = genphy_suspend, .resume = genphy_resume, + }, { + PHY_ID_MATCH_EXACT(0x001cc960), + .name = "RTL8366S Gigabit Ethernet", + .suspend = genphy_suspend, + .resume = genphy_resume, + .read_mmd = genphy_read_mmd_unsupported, + .write_mmd = genphy_write_mmd_unsupported, }, }; -- cgit From e975d955c07cbc2cd6a83a5d8235d8373441fdb9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 26 Jul 2024 14:49:41 +0100 Subject: regulator: Further restrict RZG2L USB VBCTRL regulator dependencies Since the regulator can't be used without the USB controller also tighten the dependency to match, as well as the default. Reviewed-by: Biju Das Signed-off-by: Mark Brown Link: https://patch.msgid.link/20240726-regulator-restrict-rzg2l-v1-1-640e508896e2@kernel.org Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 5be44606ac15..5560dc4db986 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -1636,7 +1636,7 @@ config REGULATOR_UNIPHIER config REGULATOR_RZG2L_VBCTRL tristate "Renesas RZ/G2L USB VBUS regulator driver" - depends on ARCH_RZG2L || COMPILE_TEST + depends on RESET_RZG2L_USBPHY_CTRL || COMPILE_TEST depends on OF select REGMAP_MMIO default RESET_RZG2L_USBPHY_CTRL -- cgit From 342b2e395d5f34c9f111a818556e617939f83a8c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 26 Jul 2024 15:24:30 +0100 Subject: io_uring/napi: use ktime in busy polling It's more natural to use ktime/ns instead of keeping around usec, especially since we're comparing it against user provided timers, so convert napi busy poll internal handling to ktime. It's also nicer since the type (ktime_t vs unsigned long) now tells the unit of measure. Keep everything as ktime, which we convert to/from micro seconds for IORING_[UN]REGISTER_NAPI. The net/ busy polling works seems to work with usec, however it's not real usec as shift by 10 is used to get it from nsecs, see busy_loop_current_time(), so it's easy to get truncated nsec back and we get back better precision. Note, we can further improve it later by removing the truncation and maybe convincing net/ to use ktime/ns instead. 
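A tiny standalone illustration of the truncation referred to above (plain userspace C, not the io_uring code):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t ns = 123456789;                  /* arbitrary timestamp in ns */
        uint64_t pseudo_usec = ns >> 10;          /* what busy_loop_current_time() keeps */
        uint64_t back_to_ns = pseudo_usec << 10;  /* what net_to_ktime() reconstructs */

        printf("original %llu ns, reconstructed %llu ns, lost %llu ns\n",
               (unsigned long long)ns, (unsigned long long)back_to_ns,
               (unsigned long long)(ns - back_to_ns));
        return 0;
    }

Keeping the internal state in ktime means at most the low 10 bits are lost at the boundary with net/, rather than rounding everything down to whole microseconds.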
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/95e7ec8d095069a3ed5d40a4bc6f8b586698bc7e.1722003776.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.h | 2 +- io_uring/napi.c | 48 ++++++++++++++++++++++++------------------ io_uring/napi.h | 2 +- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e62aa9f0629f..3315005df117 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -404,7 +404,7 @@ struct io_ring_ctx { spinlock_t napi_lock; /* napi_list lock */ /* napi busy poll default timeout */ - unsigned int napi_busy_poll_to; + ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; bool napi_enabled; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e1ce908f0679..c2acf6180845 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -43,7 +43,7 @@ struct io_wait_queue { ktime_t timeout; #ifdef CONFIG_NET_RX_BUSY_POLL - unsigned int napi_busy_poll_to; + ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; #endif }; diff --git a/io_uring/napi.c b/io_uring/napi.c index 327e5f3a8abe..6bdb267e9c33 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -33,6 +33,12 @@ static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, return NULL; } +static inline ktime_t net_to_ktime(unsigned long t) +{ + /* napi approximating usecs, reverse busy_loop_current_time */ + return ns_to_ktime(t << 10); +} + void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) { struct hlist_head *hash_list; @@ -102,14 +108,14 @@ static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) __io_napi_remove_stale(ctx); } -static inline bool io_napi_busy_loop_timeout(unsigned long start_time, - unsigned long bp_usec) +static inline bool io_napi_busy_loop_timeout(ktime_t start_time, + ktime_t bp) { - if (bp_usec) { - unsigned long end_time = start_time + bp_usec; - unsigned long now = busy_loop_current_time(); + if (bp) { + ktime_t end_time = ktime_add(start_time, bp); + ktime_t now = net_to_ktime(busy_loop_current_time()); - return time_after(now, end_time); + return ktime_after(now, end_time); } return true; @@ -124,7 +130,8 @@ static bool io_napi_busy_loop_should_end(void *data, return true; if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return true; - if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to)) + if (io_napi_busy_loop_timeout(net_to_ktime(start_time), + iowq->napi_busy_poll_dt)) return true; return false; @@ -181,10 +188,12 @@ static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, */ void io_napi_init(struct io_ring_ctx *ctx) { + u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC; + INIT_LIST_HEAD(&ctx->napi_list); spin_lock_init(&ctx->napi_lock); ctx->napi_prefer_busy_poll = false; - ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll); + ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); } /* @@ -217,7 +226,7 @@ void io_napi_free(struct io_ring_ctx *ctx) int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { - .busy_poll_to = ctx->napi_busy_poll_to, + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll }; struct io_uring_napi napi; @@ -232,7 +241,7 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) if (copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to); + 
WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC); WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll); WRITE_ONCE(ctx->napi_enabled, true); return 0; @@ -249,14 +258,14 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { - .busy_poll_to = ctx->napi_busy_poll_to, + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll }; if (arg && copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_to, 0); + WRITE_ONCE(ctx->napi_busy_poll_dt, 0); WRITE_ONCE(ctx->napi_prefer_busy_poll, false); WRITE_ONCE(ctx->napi_enabled, false); return 0; @@ -275,23 +284,20 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, struct timespec64 *ts) { - unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); + ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); if (ts) { struct timespec64 poll_to_ts; - poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); + poll_to_ts = ns_to_timespec64(ktime_to_ns(poll_dt)); if (timespec64_compare(ts, &poll_to_ts) < 0) { s64 poll_to_ns = timespec64_to_ns(ts); - if (poll_to_ns > 0) { - u64 val = poll_to_ns + 999; - do_div(val, 1000); - poll_to = val; - } + if (poll_to_ns > 0) + poll_dt = ns_to_ktime(poll_to_ns); } } - iowq->napi_busy_poll_to = poll_to; + iowq->napi_busy_poll_dt = poll_dt; } /* @@ -320,7 +326,7 @@ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) LIST_HEAD(napi_list); bool is_stale = false; - if (!READ_ONCE(ctx->napi_busy_poll_to)) + if (!READ_ONCE(ctx->napi_busy_poll_dt)) return 0; if (list_empty_careful(&ctx->napi_list)) return 0; diff --git a/io_uring/napi.h b/io_uring/napi.h index 6fc0393d0dbe..babbee36cd3e 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -55,7 +55,7 @@ static inline void io_napi_add(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct socket *sock; - if (!READ_ONCE(ctx->napi_busy_poll_to)) + if (!READ_ONCE(ctx->napi_busy_poll_dt)) return; sock = sock_from_file(req->file); -- cgit From 358169617602f6f71b31e5c9532a09b95a34b043 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 26 Jul 2024 15:24:31 +0100 Subject: io_uring/napi: pass ktime to io_napi_adjust_timeout Pass the waiting time for __io_napi_adjust_timeout as ktime and get rid of all timespec64 conversions. It's especially simpler since the caller already have a ktime. 
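The effect on the timeout-clamping path can be pictured with this toy userspace sketch (values in nanoseconds; it mirrors the change below, it is not the kernel code itself):

    #include <stdio.h>
    #include <stdint.h>

    /* Mirrors the new logic: if (to_wait) poll_dt = min(poll_dt, to_wait); */
    static uint64_t adjust_busy_poll(uint64_t napi_busy_poll_dt, uint64_t to_wait)
    {
        if (to_wait && to_wait < napi_busy_poll_dt)
            return to_wait;
        return napi_busy_poll_dt;
    }

    int main(void)
    {
        printf("%llu\n", (unsigned long long)adjust_busy_poll(50000, 20000)); /* 20000 */
        printf("%llu\n", (unsigned long long)adjust_busy_poll(50000, 0));     /* 50000 */
        return 0;
    }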
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4f5b8e8eed4f53a1879e031a6712b25381adc23d.1722003776.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ++++-- io_uring/napi.c | 14 +++----------- io_uring/napi.h | 8 ++++---- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2626424f5d73..3942db160f18 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2416,12 +2416,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (uts) { struct timespec64 ts; + ktime_t dt; if (get_timespec64(&ts, uts)) return -EFAULT; - iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - io_napi_adjust_timeout(ctx, &iowq, &ts); + dt = timespec64_to_ktime(ts); + iowq.timeout = ktime_add(dt, ktime_get()); + io_napi_adjust_timeout(ctx, &iowq, dt); } if (sig) { diff --git a/io_uring/napi.c b/io_uring/napi.c index 6bdb267e9c33..4fd6bb331e1e 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -282,20 +282,12 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) * the NAPI timeout accordingly. */ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - struct timespec64 *ts) + ktime_t to_wait) { ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); - if (ts) { - struct timespec64 poll_to_ts; - - poll_to_ts = ns_to_timespec64(ktime_to_ns(poll_dt)); - if (timespec64_compare(ts, &poll_to_ts) < 0) { - s64 poll_to_ns = timespec64_to_ns(ts); - if (poll_to_ns > 0) - poll_dt = ns_to_ktime(poll_to_ns); - } - } + if (to_wait) + poll_dt = min(poll_dt, to_wait); iowq->napi_busy_poll_dt = poll_dt; } diff --git a/io_uring/napi.h b/io_uring/napi.h index babbee36cd3e..88f1c21d5548 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -18,7 +18,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, struct timespec64 *ts); + struct io_wait_queue *iowq, ktime_t to_wait); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); @@ -29,11 +29,11 @@ static inline bool io_napi(struct io_ring_ctx *ctx) static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - struct timespec64 *ts) + ktime_t to_wait) { if (!io_napi(ctx)) return; - __io_napi_adjust_timeout(ctx, iowq, ts); + __io_napi_adjust_timeout(ctx, iowq, to_wait); } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, @@ -88,7 +88,7 @@ static inline void io_napi_add(struct io_kiocb *req) } static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - struct timespec64 *ts) + ktime_t to_wait) { } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, -- cgit From e22a3a9d4134d7e6351a2998771522e74bcc58da Mon Sep 17 00:00:00 2001 From: Kiran K Date: Wed, 3 Jul 2024 14:22:42 +0530 Subject: Bluetooth: btintel: Fail setup on error Do not attempt to send any hci command to controller if *setup* function fails. 
Fixes: af395330abed ("Bluetooth: btintel: Add Intel devcoredump support") Signed-off-by: Kiran K Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c index e7a612504ab1..2ebc970e6573 100644 --- a/drivers/bluetooth/btintel.c +++ b/drivers/bluetooth/btintel.c @@ -3085,6 +3085,9 @@ static int btintel_setup_combined(struct hci_dev *hdev) btintel_set_dsm_reset_method(hdev, &ver_tlv); err = btintel_bootloader_setup_tlv(hdev, &ver_tlv); + if (err) + goto exit_error; + btintel_register_devcoredump_support(hdev); btintel_print_fseq_info(hdev); break; -- cgit From d09009bc80d9d0d812b988888c40cd86e52eaf1e Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Tue, 16 Jul 2024 15:49:47 +0800 Subject: Bluetooth: btmtk: Fix kernel crash when entering btmtk_usb_suspend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If MediaTek's Bluetooth setup is unsuccessful, a NULL pointer issue occur when the system is suspended and the anchored kill function is called. To avoid this, add protection to prevent executing the anchored kill function if the setup is unsuccessful. [ 6.922106] Hardware name: Acer Tomato (rev2) board (DT) [ 6.922114] Workqueue: pm pm_runtime_work [ 6.922132] pstate: 804000c9 (Nzcv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 6.922147] pc : usb_kill_anchored_urbs+0x6c/0x1e0 [ 6.922164] lr : usb_kill_anchored_urbs+0x48/0x1e0 [ 6.922181] sp : ffff800080903b60 [ 6.922187] x29: ffff800080903b60 x28: ffff2c7b85c32b80 x27: ffff2c7bbb370930 [ 6.922211] x26: 00000000000f4240 x25: 00000000ffffffff x24: ffffd49ece2dcb48 [ 6.922255] x20: ffffffffffffffd8 x19: 0000000000000000 x18: 0000000000000006 [ 6.922276] x17: 6531656337386238 x16: 3632373862333863 x15: ffff800080903480 [ 6.922297] x14: 0000000000000000 x13: 303278302f303178 x12: ffffd49ecf090e30 [ 6.922318] x11: 0000000000000001 x10: 0000000000000001 x9 : ffffd49ecd2c5bb4 [ 6.922339] x8 : c0000000ffffdfff x7 : ffffd49ecefe0db8 x6 : 00000000000affa8 [ 6.922360] x5 : ffff2c7bbb35dd48 x4 : 0000000000000000 x3 : 0000000000000000 [ 6.922379] x2 : 0000000000000000 x1 : 0000000000000003 x0 : ffffffffffffffd8 [ 6.922400] Call trace: [ 6.922405] usb_kill_anchored_urbs+0x6c/0x1e0 [ 6.922422] btmtk_usb_suspend+0x20/0x38 [btmtk 5f200a97badbdfda4266773fee49acfc8e0224d5] [ 6.922444] btusb_suspend+0xd0/0x210 [btusb 0bfbf19a87ff406c83b87268b87ce1e80e9a829b] [ 6.922469] usb_suspend_both+0x90/0x288 [ 6.922487] usb_runtime_suspend+0x3c/0xa8 [ 6.922507] __rpm_callback+0x50/0x1f0 [ 6.922523] rpm_callback+0x70/0x88 [ 6.922538] rpm_suspend+0xe4/0x5a0 [ 6.922553] pm_runtime_work+0xd4/0xe0 [ 6.922569] process_one_work+0x18c/0x440 [ 6.922588] worker_thread+0x314/0x428 [ 6.922606] kthread+0x128/0x138 [ 6.922621] ret_from_fork+0x10/0x20 [ 6.922644] Code: f100a274 54000520 d503201f d100a260 (b8370000) [ 6.922654] ---[ end trace 0000000000000000 ]--- Fixes: ceac1cb0259d ("Bluetooth: btusb: mediatek: add ISO data transmission functions") Signed-off-by: Chris Lu Reported-by: Nícolas F. R. A. Prado #KernelCI Tested-by: Nícolas F. R. A. 
Prado Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index b7c348687a77..191bc6925120 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -1262,7 +1262,8 @@ int btmtk_usb_suspend(struct hci_dev *hdev) struct btmtk_data *btmtk_data = hci_get_priv(hdev); /* Stop urb anchor for iso data transmission */ - usb_kill_anchored_urbs(&btmtk_data->isopkt_anchor); + if (test_bit(BTMTK_ISOPKT_RUNNING, &btmtk_data->flags)) + usb_kill_anchored_urbs(&btmtk_data->isopkt_anchor); return 0; } -- cgit From 96b82af36efaa1787946e021aa3dc5410c05beeb Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 15 Jul 2024 10:40:03 -0400 Subject: Bluetooth: hci_sync: Fix suspending with wrong filter policy When suspending the scan filter policy cannot be 0x00 (no acceptlist) since that means the host has to process every advertisement report waking up the system, so this attempts to check if hdev is marked as suspended and if the resulting filter policy would be 0x00 (no acceptlist) then skip passive scanning if thre no devices in the acceptlist otherwise reset the filter policy to 0x01 so the acceptlist is used since the devices programmed there can still wakeup be system. Fixes: 182ee45da083 ("Bluetooth: hci_sync: Rework hci_suspend_notifier") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index cd2ed16da8a4..a31d39a821f4 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2976,6 +2976,27 @@ static int hci_passive_scan_sync(struct hci_dev *hdev) */ filter_policy = hci_update_accept_list_sync(hdev); + /* If suspended and filter_policy set to 0x00 (no acceptlist) then + * passive scanning cannot be started since that would require the host + * to be woken up to process the reports. + */ + if (hdev->suspended && !filter_policy) { + /* Check if accept list is empty then there is no need to scan + * while suspended. + */ + if (list_empty(&hdev->le_accept_list)) + return 0; + + /* If there are devices is the accept_list that means some + * devices could not be programmed which in non-suspended case + * means filter_policy needs to be set to 0x00 so the host needs + * to filter, but since this is treating suspended case we + * can ignore device needing host to filter to allow devices in + * the acceptlist to be able to wakeup the system. + */ + filter_policy = 0x01; + } + /* When the controller is using random resolvable addresses and * with that having LE privacy enabled, then controllers with * Extended Scanner Filter Policies support can now enable support -- cgit From f0c83a23fcbb424fdff5b38fbcdda3c04003a210 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Fri, 19 Jul 2024 11:30:19 +0800 Subject: Bluetooth: btmtk: Fix btmtk.c undefined reference build error MediaTek moved some usb interface related function to btmtk.c which may cause build failed if BT USB Kconfig wasn't enabled. Fix undefined reference by adding config check. 
btmtk.c:(.text+0x89c): undefined reference to `usb_alloc_urb' btmtk.c:(.text+0x8e3): undefined reference to `usb_free_urb' btmtk.c:(.text+0x956): undefined reference to `usb_free_urb' btmtk.c:(.text+0xa0e): undefined reference to `usb_anchor_urb' btmtk.c:(.text+0xb43): undefined reference to `usb_autopm_get_interface' btmtk.c:(.text+0xb7e): undefined reference to `usb_autopm_put_interface' btmtk.c:(.text+0xf70): undefined reference to `usb_disable_autosuspend' btmtk.c:(.text+0x133a): undefined reference to `usb_control_msg' Fixes: d019930b0049 ("Bluetooth: btmtk: move btusb_mtk_hci_wmt_sync to btmtk.c") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407091928.AH0aGZnx-lkp@intel.com/ Signed-off-by: Chris Lu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.c | 2 ++ drivers/bluetooth/btmtk.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index 191bc6925120..2b7c80043aa2 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -437,6 +437,7 @@ int btmtk_process_coredump(struct hci_dev *hdev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(btmtk_process_coredump); +#if IS_ENABLED(CONFIG_BT_HCIBTUSB_MTK) static void btmtk_usb_wmt_recv(struct urb *urb) { struct hci_dev *hdev = urb->context; @@ -1488,6 +1489,7 @@ int btmtk_usb_shutdown(struct hci_dev *hdev) return 0; } EXPORT_SYMBOL_GPL(btmtk_usb_shutdown); +#endif MODULE_AUTHOR("Sean Wang "); MODULE_AUTHOR("Mark Chen "); diff --git a/drivers/bluetooth/btmtk.h b/drivers/bluetooth/btmtk.h index 5df7c3296624..6fc69cd8636b 100644 --- a/drivers/bluetooth/btmtk.h +++ b/drivers/bluetooth/btmtk.h @@ -202,6 +202,7 @@ int btmtk_process_coredump(struct hci_dev *hdev, struct sk_buff *skb); void btmtk_fw_get_filename(char *buf, size_t size, u32 dev_id, u32 fw_ver, u32 fw_flavor); +#if IS_ENABLED(CONFIG_BT_HCIBTUSB_MTK) int btmtk_usb_subsys_reset(struct hci_dev *hdev, u32 dev_id); int btmtk_usb_recv_acl(struct hci_dev *hdev, struct sk_buff *skb); @@ -216,6 +217,7 @@ int btmtk_usb_suspend(struct hci_dev *hdev); int btmtk_usb_setup(struct hci_dev *hdev); int btmtk_usb_shutdown(struct hci_dev *hdev); +#endif #else static inline int btmtk_set_bdaddr(struct hci_dev *hdev, -- cgit From 61f7a8f975456d7be21100ee0936389142b95a81 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Jul 2024 11:27:05 +0200 Subject: Bluetooth: btmtk: Fix btmtk.c undefined reference build error harder The previous fix was incomplete as the link failure still persists with CONFIG_USB=m when the sdio or serial wrappers for btmtk.c are build-in: btmtk.c:(.text+0x468): undefined reference to `usb_alloc_urb' btmtk.c:(.text+0x488): undefined reference to `usb_free_urb' btmtk.c:(.text+0x500): undefined reference to `usb_anchor_urb' btmtk.c:(.text+0x50a): undefined reference to `usb_submit_urb' btmtk.c:(.text+0x92c): undefined reference to `usb_control_msg' btmtk.c:(.text+0xa92): undefined reference to `usb_unanchor_urb' btmtk.c:(.text+0x11e4): undefined reference to `usb_set_interface' btmtk.c:(.text+0x120a): undefined reference to `usb_kill_anchored_urbs' Disallow this configuration. 
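To make the failure mode concrete, here is a generic sketch (not the actual btusb.c code; the helper name is made up) of the pattern these btmtk build fixes deal with: IS_ENABLED() removes the call when the option is off, but the symbol still has to be declared to compile, and when the caller is built-in while the callee ends up in a module the reference cannot be resolved at link time.

    #include <linux/kconfig.h>

    int btmtk_example_helper(void);   /* hypothetical symbol provided elsewhere */

    static int example_caller(void)
    {
        if (IS_ENABLED(CONFIG_BT_HCIBTUSB_MTK))
            return btmtk_example_helper();  /* dropped as dead code when the
                                             * option is off, but still needs a
                                             * declaration to compile and a
                                             * reachable symbol to link when on */
        return 0;
    }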
Fixes: f0c83a23fcbb ("Bluetooth: btmtk: Fix btmtk.c undefined reference build error") Signed-off-by: Arnd Bergmann Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig index 90a94a111e67..769fa288179d 100644 --- a/drivers/bluetooth/Kconfig +++ b/drivers/bluetooth/Kconfig @@ -413,6 +413,7 @@ config BT_ATH3K config BT_MTKSDIO tristate "MediaTek HCI SDIO driver" depends on MMC + depends on USB || !BT_HCIBTUSB_MTK select BT_MTK help MediaTek Bluetooth HCI SDIO driver. @@ -425,6 +426,7 @@ config BT_MTKSDIO config BT_MTKUART tristate "MediaTek HCI UART driver" depends on SERIAL_DEV_BUS + depends on USB || !BT_HCIBTUSB_MTK select BT_MTK help MediaTek Bluetooth HCI UART driver. -- cgit From 7a8c6fb21a7c913ddb99785b14914dab2f934fbd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Jul 2024 11:27:06 +0200 Subject: Bluetooth: btmtk: remove #ifdef around declarations The caller of these functions in btusb.c is guarded with an if(IS_ENABLED()) style check, so dead code is left out, but the declarations are still needed at compile time: drivers/bluetooth/btusb.c: In function 'btusb_mtk_reset': drivers/bluetooth/btusb.c:2705:15: error: implicit declaration of function 'btmtk_usb_subsys_reset' [-Wimplicit-function-declaration] 2705 | err = btmtk_usb_subsys_reset(hdev, btmtk_data->dev_id); | ^~~~~~~~~~~~~~~~~~~~~~ drivers/bluetooth/btusb.c: In function 'btusb_send_frame_mtk': drivers/bluetooth/btusb.c:2720:23: error: implicit declaration of function 'alloc_mtk_intr_urb' [-Wimplicit-function-declaration] 2720 | urb = alloc_mtk_intr_urb(hdev, skb, btusb_tx_complete); | ^~~~~~~~~~~~~~~~~~ drivers/bluetooth/btusb.c:2720:21: error: assignment to 'struct urb *' from 'int' makes pointer from integer without a cast [-Wint-conversion] 2720 | urb = alloc_mtk_intr_urb(hdev, skb, btusb_tx_complete); | ^ Fixes: f0c83a23fcbb ("Bluetooth: btmtk: Fix btmtk.c undefined reference build error") Signed-off-by: Arnd Bergmann Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/bluetooth/btmtk.h b/drivers/bluetooth/btmtk.h index 6fc69cd8636b..5df7c3296624 100644 --- a/drivers/bluetooth/btmtk.h +++ b/drivers/bluetooth/btmtk.h @@ -202,7 +202,6 @@ int btmtk_process_coredump(struct hci_dev *hdev, struct sk_buff *skb); void btmtk_fw_get_filename(char *buf, size_t size, u32 dev_id, u32 fw_ver, u32 fw_flavor); -#if IS_ENABLED(CONFIG_BT_HCIBTUSB_MTK) int btmtk_usb_subsys_reset(struct hci_dev *hdev, u32 dev_id); int btmtk_usb_recv_acl(struct hci_dev *hdev, struct sk_buff *skb); @@ -217,7 +216,6 @@ int btmtk_usb_suspend(struct hci_dev *hdev); int btmtk_usb_setup(struct hci_dev *hdev); int btmtk_usb_shutdown(struct hci_dev *hdev); -#endif #else static inline int btmtk_set_bdaddr(struct hci_dev *hdev, -- cgit From df3d6a3e01fd82cb74b6bb309f7be71e728a3448 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 25 Jul 2024 18:28:08 -0400 Subject: Bluetooth: hci_event: Fix setting DISCOVERY_FINDING for passive scanning DISCOVERY_FINDING shall only be set for active scanning as passive scanning is not meant to generate MGMT Device Found events causing discovering state to go out of sync since userspace would believe it is discovering when in fact it is just passive scanning. 
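Condensed into a sketch (kernel context assumed; the helper name below is hypothetical, not the hci_event.c implementation), the rule this change enforces is simply:

    #include <linux/types.h>
    #include <net/bluetooth/hci.h>

    /* Only an active LE scan represents user-visible discovery; a passive scan
     * must not flip the MGMT discovering state. */
    static bool scan_enable_should_set_finding(u8 le_scan_type)
    {
        return le_scan_type == LE_SCAN_ACTIVE;
    }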
Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=219088 Fixes: 2e2515c1ba38 ("Bluetooth: hci_event: Set DISCOVERY_FINDING on SCAN_ENABLED") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 7 ------- net/bluetooth/hci_event.c | 5 +++-- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 8a4ebd93adfc..06da8ac13dca 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -119,13 +119,6 @@ void hci_discovery_set_state(struct hci_dev *hdev, int state) case DISCOVERY_STARTING: break; case DISCOVERY_FINDING: - /* If discovery was not started then it was initiated by the - * MGMT interface so no MGMT event shall be generated either - */ - if (old_state != DISCOVERY_STARTING) { - hdev->discovery.state = old_state; - return; - } mgmt_discovering(hdev, 1); break; case DISCOVERY_RESOLVING: diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index dce8035ca799..d0c118c47f6c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1721,9 +1721,10 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable) switch (enable) { case LE_SCAN_ENABLE: hci_dev_set_flag(hdev, HCI_LE_SCAN); - if (hdev->le_scan_type == LE_SCAN_ACTIVE) + if (hdev->le_scan_type == LE_SCAN_ACTIVE) { clear_pending_adv_report(hdev); - hci_discovery_set_state(hdev, DISCOVERY_FINDING); + hci_discovery_set_state(hdev, DISCOVERY_FINDING); + } break; case LE_SCAN_DISABLE: -- cgit From 4c7be57f2772c8ce4e1e43c6a79b8f8d401a4795 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 23 Jul 2024 13:21:02 -0700 Subject: arm64: allow installing compressed image by default On arm64 we build compressed images, but "make install" by default will install the old non-compressed one. To actually get the compressed image install, you need to use "make zinstall", which is not the usual way to install a kernel. Which may not sound like much of an issue, but when you deal with multiple architectures (and years of your fingers knowing the regular "make install" incantation), this inconsistency is pretty annoying. But as Will Deacon says: "Sadly, bootloaders being as top quality as you might expect, I don't think we're in a position to rely on decompressor support across the board. Our Image.gz is literally just that -- we don't have a built-in decompressor (nor do I think we want to rush into that again after the fun we had on arm32) and the recent EFI zboot support solves that problem for platforms using EFI. Changing the default 'install' target terrifies me. There are bound to be folks with embedded boards who've scripted this and we could really ruin their day if we quietly give them a compressed kernel that their bootloader doesn't know how to handle :/" So make this conditional on a new "COMPRESSED_INSTALL" option. Cc: Catalin Marinas Acked-by: Will Deacon Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/Makefile | 10 ++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7fd70be0463f..d84c5f905b1a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2337,6 +2337,17 @@ config EFI allow the kernel to be booted as an EFI application. This is only useful on systems that have UEFI firmware. 
+config COMPRESSED_INSTALL + bool "Install compressed image by default" + help + This makes the regular "make install" install the compressed + image we built, not the legacy uncompressed one. + + You can check that a compressed image works for you by doing + "make zinstall" first, and verifying that everything is fine + in your environment before making "make install" do this for + you. + config DMI bool "Enable support for SMBIOS (DMI) tables" depends on EFI diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 3f0f35fd5bb7..f6bc3da1ef11 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -182,7 +182,13 @@ $(BOOT_TARGETS): vmlinux Image.%: Image $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ -install: KBUILD_IMAGE := $(boot)/Image +ifeq ($(CONFIG_COMPRESSED_INSTALL),y) + DEFAULT_KBUILD_IMAGE = $(KBUILD_IMAGE) +else + DEFAULT_KBUILD_IMAGE = $(boot)/Image +endif + +install: KBUILD_IMAGE := $(DEFAULT_KBUILD_IMAGE) install zinstall: $(call cmd,install) @@ -229,7 +235,7 @@ define archhelp echo '* Image.gz - Compressed kernel image (arch/$(ARCH)/boot/Image.gz)' echo ' Image - Uncompressed kernel image (arch/$(ARCH)/boot/Image)' echo ' image.fit - Flat Image Tree (arch/$(ARCH)/boot/image.fit)' - echo ' install - Install uncompressed kernel' + echo ' install - Install kernel (compressed if COMPRESSED_INSTALL set)' echo ' zinstall - Install compressed kernel' echo ' Install using (your) ~/bin/installkernel or' echo ' (distribution) /sbin/installkernel or' -- cgit From 5779d398dbcd74c30a641c209946b8498e668a53 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 23 Jul 2024 18:12:40 -0500 Subject: smb3: add dynamic tracepoint for reflink errors There are cases where debugging clone_range ("smb2_duplicate_extents" function) and in the future copy_range ("smb2_copychunk_range") can be helpful. Add dynamic trace points for any errors in clone, and a followon patch will add them for copychunk. "trace-cmd record -e smb3_clone_err" Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 5 +++++ fs/smb/client/trace.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 7fe59235f090..1539463285ee 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2075,6 +2075,11 @@ smb2_duplicate_extents(const unsigned int xid, cifs_dbg(FYI, "Non-zero response length in duplicate extents\n"); duplicate_extents_out: + if (rc) + trace_smb3_clone_err(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, + tcon->tid, tcon->ses->Suid, src_off, + dest_off, len, rc); return rc; } diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 36d5295c2a6f..b027282d6db7 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -206,6 +206,63 @@ DEFINE_SMB3_OTHER_ERR_EVENT(query_dir_err); DEFINE_SMB3_OTHER_ERR_EVENT(zero_err); DEFINE_SMB3_OTHER_ERR_EVENT(falloc_err); +/* + * For logging errors in reflink and copy_range ops e.g. 
smb2_copychunk_range + * and smb2_duplicate_extents + */ +DECLARE_EVENT_CLASS(smb3_copy_range_err_class, + TP_PROTO(unsigned int xid, + __u64 src_fid, + __u64 target_fid, + __u32 tid, + __u64 sesid, + __u64 src_offset, + __u64 target_offset, + __u32 len, + int rc), + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, src_fid) + __field(__u64, target_fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, src_offset) + __field(__u64, target_offset) + __field(__u32, len) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->src_fid = src_fid; + __entry->target_fid = target_fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->src_offset = src_offset; + __entry->target_offset = target_offset; + __entry->len = len; + __entry->rc = rc; + ), + TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, + __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len, __entry->rc) +) + +#define DEFINE_SMB3_COPY_RANGE_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_copy_range_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 src_fid, \ + __u64 target_fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 src_offset, \ + __u64 target_offset, \ + __u32 len, \ + int rc), \ + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len, rc)) + +DEFINE_SMB3_COPY_RANGE_ERR_EVENT(clone_err); +/* TODO: Add SMB3_COPY_RANGE_ERR_EVENT(copychunk_err) */ /* For logging successful read or write */ DECLARE_EVENT_CLASS(smb3_rw_done_class, -- cgit From 6629f87b97e0740431b7b29b8dfdfa9d842c4bc5 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 24 Jul 2024 11:57:18 -0500 Subject: smb3: add four dynamic tracepoints for copy_file_range and reflink Add more dynamic tracepoints to help debug copy_file_range (copychunk) and clone_range ("duplicate extents"). These are tracepoints for entering the function and completing without error. For example: "trace-cmd record -e smb3_copychunk_enter -e smb3_copychunk_done" or "trace-cmd record -e smb3_clone_enter -e smb3_clone_done" Here is sample output: TASK-PID CPU# ||||| TIMESTAMP FUNCTION | | | ||||| | | cp-5964 [005] ..... 2176.168977: smb3_clone_enter: xid=17 sid=0xeb275be4 tid=0x7ffa7cdb source fid=0x1ed02e15 source offset=0x0 target fid=0x1ed02e15 target offset=0x0 len=0xa0000 cp-5964 [005] ..... 
2176.170668: smb3_clone_done: xid=17 sid=0xeb275be4 tid=0x7ffa7cdb source fid=0x1ed02e15 source offset=0x0 target fid=0x1ed02e15 target offset=0x0 len=0xa0000 Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 15 +++++++++++++- fs/smb/client/trace.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 1539463285ee..322cabc69c6f 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1812,6 +1812,10 @@ smb2_copychunk_range(const unsigned int xid, tcon = tlink_tcon(trgtfile->tlink); + trace_smb3_copychunk_enter(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); + while (len > 0) { pcchunk->SourceOffset = cpu_to_le64(src_off); pcchunk->TargetOffset = cpu_to_le64(dest_off); @@ -1863,6 +1867,9 @@ smb2_copychunk_range(const unsigned int xid, le32_to_cpu(retbuf->ChunksWritten), le32_to_cpu(retbuf->ChunkBytesWritten), bytes_written); + trace_smb3_copychunk_done(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); } else if (rc == -EINVAL) { if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) goto cchunk_out; @@ -2046,7 +2053,9 @@ smb2_duplicate_extents(const unsigned int xid, dup_ext_buf.ByteCount = cpu_to_le64(len); cifs_dbg(FYI, "Duplicate extents: src off %lld dst off %lld len %lld\n", src_off, dest_off, len); - + trace_smb3_clone_enter(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); inode = d_inode(trgtfile->dentry); if (inode->i_size < dest_off + len) { rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false); @@ -2080,6 +2089,10 @@ duplicate_extents_out: trgtfile->fid.volatile_fid, tcon->tid, tcon->ses->Suid, src_off, dest_off, len, rc); + else + trace_smb3_clone_done(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); return rc; } diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index b027282d6db7..bc4f8b3ad6ff 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -264,6 +264,59 @@ DEFINE_EVENT(smb3_copy_range_err_class, smb3_##name, \ DEFINE_SMB3_COPY_RANGE_ERR_EVENT(clone_err); /* TODO: Add SMB3_COPY_RANGE_ERR_EVENT(copychunk_err) */ +DECLARE_EVENT_CLASS(smb3_copy_range_done_class, + TP_PROTO(unsigned int xid, + __u64 src_fid, + __u64 target_fid, + __u32 tid, + __u64 sesid, + __u64 src_offset, + __u64 target_offset, + __u32 len), + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, src_fid) + __field(__u64, target_fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, src_offset) + __field(__u64, target_offset) + __field(__u32, len) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->src_fid = src_fid; + __entry->target_fid = target_fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->src_offset = src_offset; + __entry->target_offset = target_offset; + __entry->len = len; + ), + TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x", + __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, + __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len) +) + +#define DEFINE_SMB3_COPY_RANGE_DONE_EVENT(name) \ 
+DEFINE_EVENT(smb3_copy_range_done_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 src_fid, \ + __u64 target_fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 src_offset, \ + __u64 target_offset, \ + __u32 len), \ + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len)) + +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(copychunk_enter); +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(clone_enter); +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(copychunk_done); +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(clone_done); + + /* For logging successful read or write */ DECLARE_EVENT_CLASS(smb3_rw_done_class, TP_PROTO(unsigned int rreq_debug_id, -- cgit From b6f6a7aa689f1c255e06fee3ca13c9f9e5c12780 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 26 Jul 2024 01:06:20 -0500 Subject: smb3: add dynamic trace point for session setup key expired failures There are cases where services need to remount (or change their credentials files) when keys have expired, but it can be helpful to have a dynamic trace point to make it easier to notify the service to refresh the storage account key. Here is sample output, one from mount with bad password, one from a reconnect where the password has been changed or expired and reconnect fails (requiring remount with new storage account key) TASK-PID CPU# ||||| TIMESTAMP FUNCTION | | | ||||| | | mount.cifs-11362 [000] ..... 6000.241620: smb3_key_expired: rc=-13 user=testpassu conn_id=0x2 server=localhost addr=127.0.0.1:445 kworker/4:0-8458 [004] ..... 6044.892283: smb3_key_expired: rc=-13 user=testpassu conn_id=0x3 server=localhost addr=127.0.0.1:445 Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 8 +++++++- fs/smb/client/trace.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 9fc5b11c0b6c..9a06b5594669 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1562,8 +1562,14 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) cifs_small_buf_release(sess_data->iov[0].iov_base); if (rc == 0) sess_data->ses->expired_pwd = false; - else if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) + else if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { + if (sess_data->ses->expired_pwd == false) + trace_smb3_key_expired(sess_data->server->hostname, + sess_data->ses->user_name, + sess_data->server->conn_id, + &sess_data->server->dstaddr, rc); sess_data->ses->expired_pwd = true; + } memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec)); diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index bc4f8b3ad6ff..6b3bdfb97211 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -1281,6 +1281,46 @@ DEFINE_EVENT(smb3_connect_err_class, smb3_##name, \ DEFINE_SMB3_CONNECT_ERR_EVENT(connect_err); +DECLARE_EVENT_CLASS(smb3_sess_setup_err_class, + TP_PROTO(char *hostname, char *username, __u64 conn_id, + const struct __kernel_sockaddr_storage *dst_addr, int rc), + TP_ARGS(hostname, username, conn_id, dst_addr, rc), + TP_STRUCT__entry( + __string(hostname, hostname) + __string(username, username) + __field(__u64, conn_id) + __array(__u8, dst_addr, sizeof(struct sockaddr_storage)) + __field(int, rc) + ), + TP_fast_assign( + struct sockaddr_storage *pss = NULL; + + __entry->conn_id = conn_id; + __entry->rc = rc; + pss = (struct sockaddr_storage *)__entry->dst_addr; + *pss = *dst_addr; + __assign_str(hostname); + __assign_str(username); + ), + TP_printk("rc=%d user=%s 
conn_id=0x%llx server=%s addr=%pISpsfc", + __entry->rc, + __get_str(username), + __entry->conn_id, + __get_str(hostname), + __entry->dst_addr) +) + +#define DEFINE_SMB3_SES_SETUP_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_sess_setup_err_class, smb3_##name, \ + TP_PROTO(char *hostname, \ + char *username, \ + __u64 conn_id, \ + const struct __kernel_sockaddr_storage *addr, \ + int rc), \ + TP_ARGS(hostname, username, conn_id, addr, rc)) + +DEFINE_SMB3_SES_SETUP_ERR_EVENT(key_expired); + DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_PROTO(__u64 currmid, __u64 conn_id, -- cgit From 52e130764ab6bdc439bcf124ac3e15f52ca0c8e5 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 10 Jul 2024 11:34:38 +0200 Subject: tools/power turbostat: Move debug prints from stdout to stderr This leaves the stdout cleaner, having only counter data. It makes it easier for programs to parse the output of turbostat, for example selftests. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index be345a4bbe96..cccd9f4311a0 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1984,6 +1984,7 @@ void help(void) " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" + " debug messages are printed to stderr\n" " -D, --Dump displays the raw counter values\n" " -e, --enable [all | column]\n" " shows all or the specified disabled column\n" @@ -8262,7 +8263,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo) pinfo->scale = perf_scale; if (debug) - printf("Add perf/%s/%s cpu%d: %d\n", + fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n", pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); } @@ -8422,7 +8423,7 @@ struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name) for (mp = head; mp; mp = mp->next) { if (debug) - printf("%s: %s %s\n", __func__, name, mp->name); + fprintf(stderr, "%s: %s %s\n", __func__, name, mp->name); if (!strncmp(name, mp->name, strlen(mp->name))) return mp; } @@ -8439,8 +8440,8 @@ int add_counter(unsigned int msr_num, char *path, char *name, errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); if (debug) - printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num, - path, name, width, scope, type, format, flags, id); + fprintf(stderr, "%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", + __func__, msr_num, path, name, width, scope, type, format, flags, id); switch (scope) { @@ -8448,7 +8449,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.tp, name); if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) { @@ -8460,7 +8461,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.cp, name); if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) { @@ -8472,7 +8473,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.pp, name); 
if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) { @@ -8617,7 +8618,7 @@ int add_perf_counter(const char *perf_device, const char *perf_event, const char // FIXME: we might not have debug here yet if (debug) - printf("%s: %s/%s, name: %s, scope%d\n", + fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n", __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); return 0; -- cgit From b2e4a5dfafcce87ed2b9aac0b2887f421cd2930a Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 10 Jul 2024 14:28:03 +0200 Subject: tools/power turbostat: Move verbose counter messages to level 2 Printing information about the source and value during initialization and reading of the counter for each cpu, while useful when debugging, results in too verbose output. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index cccd9f4311a0..86bbea7d8e57 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3797,7 +3797,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; struct rapl_counter_info_t *rci; - if (debug) + if (debug >= 2) fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); @@ -3827,7 +3827,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct assert(pi < ARRAY_SIZE(perf_data)); assert(rci->fd_perf != -1); - if (debug) + if (debug >= 2) fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n", i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]); @@ -3837,7 +3837,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct break; case COUNTER_SOURCE_MSR: - if (debug) + if (debug >= 2) fprintf(stderr, "Reading rapl counter via msr at %u\n", i); assert(!no_msr); @@ -3895,7 +3895,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat struct cstate_counter_info_t *cci; - if (debug) + if (debug >= 2) fprintf(stderr, "%s: cpu%d\n", __func__, cpu); assert(ccstate_counter_info); @@ -3965,9 +3965,8 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat assert(pi < ARRAY_SIZE(perf_data)); assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); - if (debug) { + if (debug >= 2) fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]); - } cci->data[i] = perf_data[pi]; @@ -3979,9 +3978,8 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat if (get_msr(cpu, cci->msr[i], &cci->data[i])) return -13 - i; - if (debug) { + if (debug >= 2) fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]); - } break; } @@ -4036,7 +4034,7 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) struct msr_counter_info_t *mci; - if (debug) + if (debug >= 2) fprintf(stderr, "%s: cpu%d\n", __func__, cpu); assert(msr_counter_info); @@ -4066,7 +4064,7 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) assert(pi < ARRAY_SIZE(perf_data)); assert(mci->fd_perf != -1); - if (debug) + if (debug >= 2) fprintf(stderr, "Reading msr counter via perf at %u: %llu\n", i, 
perf_data[pi]); mci->data[i] = perf_data[pi]; @@ -4082,7 +4080,7 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) mci->data[i] &= mci->msr_mask[i]; - if (debug) + if (debug >= 2) fprintf(stderr, "Reading msr counter via msr at %u: %llu\n", i, mci->data[i]); break; @@ -7125,7 +7123,7 @@ int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct { int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); - if (debug) + if (debug >= 2) fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); return ret; @@ -7284,7 +7282,7 @@ int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const st { int ret = add_cstate_perf_counter_(cpu, cci, cai); - if (debug) + if (debug >= 2) fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); return ret; -- cgit From 1f8add13e6c86653da2ec599dfc94e7a8865cdcb Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 3 Jul 2024 23:14:14 +0200 Subject: tools/power turbostat: Add selftests for SMI, APERF and MPERF counters The test requests BICs that are dependent on SMI, APERF and MPERF counters and checks if the columns show up in the output and the turbostat doesn't crash. Read the counters in both --no-msr and --no-perf mode. The test skips counters that are not present or user does not have permissions to read. It is not an error, but the test may not be as exhaustive. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- .../testing/selftests/turbostat/smi_aperf_mperf.py | 157 +++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100755 tools/testing/selftests/turbostat/smi_aperf_mperf.py diff --git a/tools/testing/selftests/turbostat/smi_aperf_mperf.py b/tools/testing/selftests/turbostat/smi_aperf_mperf.py new file mode 100755 index 000000000000..6289cc47d5f0 --- /dev/null +++ b/tools/testing/selftests/turbostat/smi_aperf_mperf.py @@ -0,0 +1,157 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +from shutil import which +from os import pread + +# CDLL calls dlopen underneath. +# Calling it with None (null), we get handle to the our own image (python interpreter). +# We hope to find sched_getcpu() inside ;] +# This is a bit ugly, but helps shipping working software, so.. +try: + import ctypes + + this_image = ctypes.CDLL(None) + BASE_CPU = this_image.sched_getcpu() +except: + BASE_CPU = 0 # If we fail, set to 0 and pray it's not offline. 
+ +MSR_IA32_MPERF = 0x000000e7 +MSR_IA32_APERF = 0x000000e8 + +def check_perf_access(): + perf = which('perf') + if perf is None: + print('SKIP: Could not find perf binary, thus could not determine perf access.') + return False + + def has_perf_counter_access(counter_name): + proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'], + capture_output = True) + + if proc_perf.returncode != 0: + print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.') + return False + + if b'' in proc_perf.stderr: + print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.') + return False + + return True + + if not has_perf_counter_access('msr/mperf/'): + return False + if not has_perf_counter_access('msr/aperf/'): + return False + if not has_perf_counter_access('msr/smi/'): + return False + + return True + +def check_msr_access(): + try: + file_msr = open(f'/dev/cpu/{BASE_CPU}/msr', 'rb') + except: + return False + + if len(pread(file_msr.fileno(), 8, MSR_IA32_MPERF)) != 8: + return False + + if len(pread(file_msr.fileno(), 8, MSR_IA32_APERF)) != 8: + return False + + return True + +has_perf_access = check_perf_access() +has_msr_access = check_msr_access() + +turbostat_counter_source_opts = [''] + +if has_msr_access: + turbostat_counter_source_opts.append('--no-perf') +else: + print('SKIP: doesn\'t have MSR access, skipping run with --no-perf') + +if has_perf_access: + turbostat_counter_source_opts.append('--no-msr') +else: + print('SKIP: doesn\'t have perf access, skipping run with --no-msr') + +if not has_msr_access and not has_perf_access: + print('SKIP: No MSR nor perf access detected. Skipping the tests entirely') + exit(0) + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +EXPECTED_COLUMNS_DEBUG_DEFAULT = b'usec\tTime_Of_Day_Seconds\tAPIC\tX2APIC' + +SMI_APERF_MPERF_DEPENDENT_BICS = [ + 'SMI', + 'Avg_MHz', + 'Busy%', + 'Bzy_MHz', +] +if has_perf_access: + SMI_APERF_MPERF_DEPENDENT_BICS.append('IPC') + +for bic in SMI_APERF_MPERF_DEPENDENT_BICS: + for counter_source_opt in turbostat_counter_source_opts: + + # Ugly special case, but it is what it is.. + if counter_source_opt == '--no-perf' and bic == 'IPC': + continue + + expected_columns = bic.encode() + expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + f'\t{bic}'.encode() + + # + # Run turbostat for some time and send SIGINT + # + timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s'] + turbostat_argv = [turbostat, '-i', '0.50', '--show', bic] + + if counter_source_opt: + turbostat_argv.append(counter_source_opt) + + print(f'Running turbostat with {turbostat_argv=}... 
', end = '', flush = True) + proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) + if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + + actual_columns = proc_turbostat.stdout.split(b'\n')[0] + if expected_columns != actual_columns: + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) + print('OK') + + # + # Same, but with --debug + # + turbostat_argv.append('--debug') + + print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) + proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) + if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + + actual_columns = proc_turbostat.stdout.split(b'\n')[0] + if expected_columns_debug != actual_columns: + print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}') + exit(1) + print('OK') -- cgit From f3065f9c3917fa9279992623a2f3282f1fd43515 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 4 Jul 2024 20:11:33 +0200 Subject: tools/power turbostat: Add selftests for added perf counters Test adds several perf counters from msr, cstate_core and cstate_pkg groups and checks if the columns for those counters show up. The test skips the counters that are not present. It is not an error, but the test may not be as exhaustive. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- .../selftests/turbostat/added_perf_counters.py | 178 +++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100755 tools/testing/selftests/turbostat/added_perf_counters.py diff --git a/tools/testing/selftests/turbostat/added_perf_counters.py b/tools/testing/selftests/turbostat/added_perf_counters.py new file mode 100755 index 000000000000..9ab4aaf45fb8 --- /dev/null +++ b/tools/testing/selftests/turbostat/added_perf_counters.py @@ -0,0 +1,178 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +from shutil import which +from os import pread + +class PerfCounterInfo: + def __init__(self, subsys, event): + self.subsys = subsys + self.event = event + + def get_perf_event_name(self): + return f'{self.subsys}/{self.event}/' + + def get_turbostat_perf_id(self, counter_scope, counter_type, column_name): + return f'perf/{self.subsys}/{self.event},{counter_scope},{counter_type},{column_name}' + +PERF_COUNTERS_CANDIDATES = [ + PerfCounterInfo('msr', 'mperf'), + PerfCounterInfo('msr', 'aperf'), + PerfCounterInfo('msr', 'tsc'), + PerfCounterInfo('cstate_core', 'c1-residency'), + PerfCounterInfo('cstate_core', 'c6-residency'), + PerfCounterInfo('cstate_core', 'c7-residency'), + PerfCounterInfo('cstate_pkg', 'c2-residency'), + PerfCounterInfo('cstate_pkg', 'c3-residency'), + PerfCounterInfo('cstate_pkg', 'c6-residency'), + PerfCounterInfo('cstate_pkg', 'c7-residency'), + PerfCounterInfo('cstate_pkg', 'c8-residency'), + PerfCounterInfo('cstate_pkg', 'c9-residency'), + PerfCounterInfo('cstate_pkg', 'c10-residency'), +] +present_perf_counters = [] + +def check_perf_access(): + perf = which('perf') + if perf is None: + print('SKIP: Could not find perf binary, thus could not determine perf access.') + return False + + def has_perf_counter_access(counter_name): + proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'], + capture_output = True) + + if proc_perf.returncode != 0: + print(f'SKIP: Could not read {counter_name} perf counter.') + return False + + if b'' in 
proc_perf.stderr: + print(f'SKIP: Could not read {counter_name} perf counter.') + return False + + return True + + for counter in PERF_COUNTERS_CANDIDATES: + if has_perf_counter_access(counter.get_perf_event_name()): + present_perf_counters.append(counter) + + if len(present_perf_counters) == 0: + print('SKIP: Could not read any perf counter.') + return False + + if len(present_perf_counters) != len(PERF_COUNTERS_CANDIDATES): + print(f'WARN: Could not access all of the counters - some will be left untested') + + return True + +if not check_perf_access(): + exit(0) + +turbostat_counter_source_opts = [''] + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +EXPECTED_COLUMNS_DEBUG_DEFAULT = [b'usec', b'Time_Of_Day_Seconds', b'APIC', b'X2APIC'] + +expected_columns = [b'CPU'] +counters_argv = [] +for counter in present_perf_counters: + if counter.subsys == 'cstate_core': + counter_scope = 'core' + elif counter.subsys == 'cstate_pkg': + counter_scope = 'package' + else: + counter_scope = 'cpu' + + counter_type = 'delta' + column_name = counter.event + + cparams = counter.get_turbostat_perf_id( + counter_scope = counter_scope, + counter_type = counter_type, + column_name = column_name + ) + expected_columns.append(column_name.encode()) + counters_argv.extend(['--add', cparams]) + +expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + expected_columns + +def gen_user_friendly_cmdline(argv_): + argv = argv_[:] + ret = '' + + while len(argv) != 0: + arg = argv.pop(0) + arg_next = '' + + if arg in ('-i', '--show', '--add'): + arg_next = argv.pop(0) if len(argv) > 0 else '' + + ret += f'{arg} {arg_next} \\\n\t' + + # Remove the last separator and return + return ret[:-4] + +# +# Run turbostat for some time and send SIGINT +# +timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s'] +turbostat_argv = [turbostat, '-i', '0.50', '--show', 'CPU'] + counters_argv + +def check_columns_or_fail(expected_columns: list, actual_columns: list): + if len(actual_columns) != len(expected_columns): + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) + + failed = False + for expected_column in expected_columns: + if expected_column not in actual_columns: + print(f'turbostat column check failed: missing column {expected_column.decode()}') + failed = True + + if failed: + exit(1) + +cmdline = gen_user_friendly_cmdline(turbostat_argv) +print(f'Running turbostat with:\n\t{cmdline}\n... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t') +check_columns_or_fail(expected_columns, actual_columns) +print('OK') + +# +# Same, but with --debug +# +# We explicitly specify '--show CPU' to make sure turbostat +# don't show a bunch of default counters instead. +# +turbostat_argv.append('--debug') + +cmdline = gen_user_friendly_cmdline(turbostat_argv) +print(f'Running turbostat (in debug mode) with:\n\t{cmdline}\n... 
', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t') +check_columns_or_fail(expected_columns_debug, actual_columns) +print('OK') -- cgit From 4c17736689ccfc44ec7dcc472577f25c34cf8724 Mon Sep 17 00:00:00 2001 From: Casey Chen Date: Mon, 22 Jul 2024 15:15:48 -0600 Subject: perf tool: fix dereferencing NULL al->maps With 0dd5041c9a0e ("perf addr_location: Add init/exit/copy functions"), when cpumode is 3 (macro PERF_RECORD_MISC_HYPERVISOR), thread__find_map() could return with al->maps being NULL. The path below could add a callchain_cursor_node with NULL ms.maps. add_callchain_ip() thread__find_symbol(.., &al) thread__find_map(.., &al) // al->maps becomes NULL ms.maps = maps__get(al.maps) callchain_cursor_append(..., &ms, ...) node->ms.maps = maps__get(ms->maps) Then the path below would dereference NULL maps and get segfault. fill_callchain_info() maps__machine(node->ms.maps); Fix it by checking if maps is NULL in fill_callchain_info(). Fixes: 0dd5041c9a0e ("perf addr_location: Add init/exit/copy functions") Signed-off-by: Casey Chen Reviewed-by: Ian Rogers Reviewed-by: Arnaldo Carvalho de Melo Acked-by: Namhyung Kim Cc: yzhong@purestorage.com Link: https://lore.kernel.org/r/20240722211548.61455-1-cachen@purestorage.com Signed-off-by: Namhyung Kim --- tools/perf/util/callchain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 1730b852a947..6d075648d2cc 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -1141,7 +1141,7 @@ int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *samp int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *node, bool hide_unresolved) { - struct machine *machine = maps__machine(node->ms.maps); + struct machine *machine = node->ms.maps ? maps__machine(node->ms.maps) : NULL; maps__put(al->maps); al->maps = maps__get(node->ms.maps); -- cgit From 440cf77625e300e683ca0edc39fbc4b6f3175feb Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Jul 2024 09:22:06 +0100 Subject: perf: build: Setup PKG_CONFIG_LIBDIR for cross compilation On recent Linux distros like Ubuntu Noble and Debian Bookworm, the 'pkg-config-aarch64-linux-gnu' package is missing. As a result, the aarch64-linux-gnu-pkg-config command is not available, which causes build failures. When a build passes the environment variables PKG_CONFIG_LIBDIR or PKG_CONFIG_PATH, like a user uses make command or a build system (like Yocto, Buildroot, etc) prepares the variables and passes to the Perf's Makefile, the commit keeps these variables for package configuration. Otherwise, this commit sets the PKG_CONFIG_LIBDIR variable to use the Multiarch libs for the cross compilation. 
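For illustration only (assuming a Debian-style x86_64 host cross-building for arm64 with the aarch64-linux-gnu toolchain; the exact paths and installed target libraries will differ per system), the fallback introduced here amounts to deriving the target triple from the compiler and pointing pkg-config at the Multiarch directories by hand:

  $ aarch64-linux-gnu-gcc -dumpmachine
  aarch64-linux-gnu
  $ export PKG_CONFIG_LIBDIR=/usr/local/aarch64-linux-gnu/lib/pkgconfig/:/usr/local/lib/aarch64-linux-gnu/pkgconfig/:/usr/lib/aarch64-linux-gnu/pkgconfig/:/usr/local/share/pkgconfig/:/usr/share/pkgconfig/
  $ pkg-config --modversion libdw

If the target-architecture development package (here libdw-dev for arm64) is not installed, the query fails and the corresponding feature is disabled during the build, just as it would be with the Makefile fallback.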
Signed-off-by: Leo Yan Tested-by: Ian Rogers Cc: amadio@gentoo.org Cc: Thomas Richter Link: https://lore.kernel.org/r/20240717082211.524826-2-leo.yan@arm.com Signed-off-by: Namhyung Kim --- tools/build/feature/Makefile | 25 ++++++++++++++++++++++++- tools/perf/Makefile.perf | 27 ++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index ed54cef450f5..dff65d03d30d 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -82,7 +82,30 @@ FILES= \ FILES := $(addprefix $(OUTPUT),$(FILES)) -PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config +# Some distros provide the command $(CROSS_COMPILE)pkg-config for +# searching packges installed with Multiarch. Use it for cross +# compilation if it is existed. +ifneq (, $(shell which $(CROSS_COMPILE)pkg-config)) + PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config +else + PKG_CONFIG ?= pkg-config + + # PKG_CONFIG_PATH or PKG_CONFIG_LIBDIR, alongside PKG_CONFIG_SYSROOT_DIR + # for modified system root, are required for the cross compilation. + # If these PKG_CONFIG environment variables are not set, Multiarch library + # paths are used instead. + ifdef CROSS_COMPILE + ifeq ($(PKG_CONFIG_LIBDIR)$(PKG_CONFIG_PATH)$(PKG_CONFIG_SYSROOT_DIR),) + CROSS_ARCH = $(shell $(CC) -dumpmachine) + PKG_CONFIG_LIBDIR := /usr/local/$(CROSS_ARCH)/lib/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/share/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/share/pkgconfig/ + export PKG_CONFIG_LIBDIR + endif + endif +endif all: $(FILES) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 175e4c7898f0..f8148db5fc38 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -193,7 +193,32 @@ HOSTLD ?= ld HOSTAR ?= ar CLANG ?= clang -PKG_CONFIG = $(CROSS_COMPILE)pkg-config +# Some distros provide the command $(CROSS_COMPILE)pkg-config for +# searching packges installed with Multiarch. Use it for cross +# compilation if it is existed. +ifneq (, $(shell which $(CROSS_COMPILE)pkg-config)) + PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config +else + PKG_CONFIG ?= pkg-config + + # PKG_CONFIG_PATH or PKG_CONFIG_LIBDIR, alongside PKG_CONFIG_SYSROOT_DIR + # for modified system root, is required for the cross compilation. + # If these PKG_CONFIG environment variables are not set, Multiarch library + # paths are used instead. + ifdef CROSS_COMPILE + ifeq ($(PKG_CONFIG_LIBDIR)$(PKG_CONFIG_PATH)$(PKG_CONFIG_SYSROOT_DIR),) + CROSS_ARCH = $(shell $(CC) -dumpmachine) + PKG_CONFIG_LIBDIR := /usr/local/$(CROSS_ARCH)/lib/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/lib/$(CROSS_ARCH)/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/share/pkgconfig/ + PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/share/pkgconfig/ + export PKG_CONFIG_LIBDIR + $(warning Missing PKG_CONFIG_LIBDIR, PKG_CONFIG_PATH and PKG_CONFIG_SYSROOT_DIR for cross compilation,) + $(warning set PKG_CONFIG_LIBDIR for using Multiarch libs.) 
+ endif + endif +endif RM = rm -f LN = ln -f -- cgit From cffe29d3b54aa0437bc35440ea64866bbfc418a3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Jul 2024 09:22:07 +0100 Subject: perf: build: Set Python configuration for cross compilation Python configuration has dedicated folders for different architectures. For example, Python 3.11 has two folders as shown below, one for Arm64 and another for x86_64: /usr/lib/python3.11/config-3.11-aarch64-linux-gnu/ /usr/lib/python3.11/config-3.11-x86_64-linux-gnu/ This commit updates the Python configuration path based on the compiler's machine type, guiding the compiler to find the correct path for Python libraries. It also renames the generated .so file name to match the machine name. Signed-off-by: Leo Yan Tested-by: Ian Rogers Cc: amadio@gentoo.org Cc: James Clark Cc: Thomas Richter Link: https://lore.kernel.org/r/20240717082211.524826-3-leo.yan@arm.com Signed-off-by: Namhyung Kim --- tools/perf/Makefile.config | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index c896babf7a74..524a68032bdb 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -296,6 +296,11 @@ endif ifdef PYTHON_CONFIG PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) $(PYTHON_CONFIG_LDFLAGS) 2>/dev/null) + # Update the python flags for cross compilation + ifdef CROSS_COMPILE + PYTHON_NATIVE := $(shell echo $(PYTHON_EMBED_LDOPTS) | sed 's/\(-L.*\/\)\(.*-linux-gnu\).*/\2/') + PYTHON_EMBED_LDOPTS := $(subst $(PYTHON_NATIVE),$(shell $(CC) -dumpmachine),$(PYTHON_EMBED_LDOPTS)) + endif PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS)) PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --includes 2>/dev/null) @@ -897,6 +902,9 @@ else PYTHON_SETUPTOOLS_INSTALLED := $(shell $(PYTHON) -c 'import setuptools;' 2> /dev/null && echo "yes" || echo "no") ifeq ($(PYTHON_SETUPTOOLS_INSTALLED), yes) PYTHON_EXTENSION_SUFFIX := $(shell $(PYTHON) -c 'from importlib import machinery; print(machinery.EXTENSION_SUFFIXES[0])') + ifdef CROSS_COMPILE + PYTHON_EXTENSION_SUFFIX := $(subst $(PYTHON_NATIVE),$(shell $(CC) -dumpmachine),$(PYTHON_EXTENSION_SUFFIX)) + endif LANG_BINDINGS += $(obj-perf)python/perf$(PYTHON_EXTENSION_SUFFIX) else $(warning Missing python setuptools, the python binding won't be built, please install python3-setuptools or equivalent) -- cgit From 536661da6ea18fe6df5740bc9e9001d097b035ee Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Jul 2024 09:22:08 +0100 Subject: perf: build: Only link libebl.a for old libdw Since libdw version 0.177, elfutils has merged libebl.a into libdw (see the commit "libebl: Don't install libebl.a, libebl.h and remove backends from spec." in the elfutils repository). As a result, libebl.a does not exist on Debian Bullseye and newer releases, causing static perf builds to fail on these distributions. This commit checks the libdw version and only links libebl.a if it detects that the libdw version is older than 0.177. 
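A quick way to see which side of the 0.177 cut-off a system falls on is the same pkg-config query the new Makefile logic performs (the version printed below is only an example):

  $ pkg-config --modversion libdw
  0.186
  $ test 186 -lt 177; echo $?
  1

A non-zero status means this libdw already contains the libebl code, so -lebl is left out; on a pre-0.177 system the test prints 0 and -lebl is still linked.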
Signed-off-by: Leo Yan Tested-by: Ian Rogers Cc: amadio@gentoo.org Cc: James Clark Cc: Thomas Richter Link: https://lore.kernel.org/r/20240717082211.524826-4-leo.yan@arm.com Signed-off-by: Namhyung Kim --- tools/build/feature/Makefile | 12 +++++++++++- tools/perf/Makefile.config | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index dff65d03d30d..08b2be257639 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -170,7 +170,17 @@ $(OUTPUT)test-libopencsd.bin: DWARFLIBS := -ldw ifeq ($(findstring -static,${LDFLAGS}),-static) -DWARFLIBS += -lelf -lebl -lz -llzma -lbz2 + DWARFLIBS += -lelf -lz -llzma -lbz2 + + LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw) + LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION))) + LIBDW_VERSION_2 := $(word 2, $(subst ., ,$(LIBDW_VERSION))) + + # Elfutils merged libebl.a into libdw.a starting from version 0.177, + # Link libebl.a only if libdw is older than this version. + ifeq ($(shell test $(LIBDW_VERSION_2) -lt 177; echo $$?),0) + DWARFLIBS += -lebl + endif endif $(OUTPUT)test-dwarf.bin: diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 524a68032bdb..7cf0a39bc200 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -152,7 +152,17 @@ ifdef LIBDW_DIR endif DWARFLIBS := -ldw ifeq ($(findstring -static,${LDFLAGS}),-static) - DWARFLIBS += -lelf -ldl -lz -llzma -lbz2 + DWARFLIBS += -lelf -ldl -lz -llzma -lbz2 + + LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw) + LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION))) + LIBDW_VERSION_2 := $(word 2, $(subst ., ,$(LIBDW_VERSION))) + + # Elfutils merged libebl.a into libdw.a starting from version 0.177, + # Link libebl.a only if libdw is older than this version. + ifeq ($(shell test $(LIBDW_VERSION_2) -lt 177; echo $$?),0) + DWARFLIBS += -lebl + endif endif FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS) FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) $(DWARFLIBS) -- cgit From f42596c73872b753ff1799bc7fc79b1100226da1 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Jul 2024 09:22:10 +0100 Subject: perf: build: Link lib 'lzma' for static build The libunwind feature test failed with static linkage because the 'lzma' library was missing from the link line, so link it to fix the build failure.
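To confirm on a given system that a static libunwind really does drag in liblzma, one hedged check (the archive path is an example and varies by distribution and architecture) is to list its undefined symbols:

  $ nm -u /usr/lib/aarch64-linux-gnu/libunwind.a | grep -i lzma

lzma_* references appear when libunwind was built with MiniDebugInfo (.gnu_debugdata) support, which is the common distribution configuration; on such systems every static feature test that links libunwind also needs -llzma, which is what the change below adds.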
Signed-off-by: Leo Yan Tested-by: Ian Rogers Cc: amadio@gentoo.org Cc: James Clark Cc: Thomas Richter Link: https://lore.kernel.org/r/20240717082211.524826-5-leo.yan@arm.com Signed-off-by: Namhyung Kim --- tools/build/feature/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 08b2be257639..2d5be5c17d65 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -211,27 +211,27 @@ $(OUTPUT)test-numa_num_possible_cpus.bin: $(BUILD) -lnuma $(OUTPUT)test-libunwind.bin: - $(BUILD) -lelf + $(BUILD) -lelf -llzma $(OUTPUT)test-libunwind-debug-frame.bin: - $(BUILD) -lelf + $(BUILD) -lelf -llzma $(OUTPUT)test-libunwind-x86.bin: - $(BUILD) -lelf -lunwind-x86 + $(BUILD) -lelf -llzma -lunwind-x86 $(OUTPUT)test-libunwind-x86_64.bin: - $(BUILD) -lelf -lunwind-x86_64 + $(BUILD) -lelf -llzma -lunwind-x86_64 $(OUTPUT)test-libunwind-arm.bin: - $(BUILD) -lelf -lunwind-arm + $(BUILD) -lelf -llzma -lunwind-arm $(OUTPUT)test-libunwind-aarch64.bin: - $(BUILD) -lelf -lunwind-aarch64 + $(BUILD) -lelf -llzma -lunwind-aarch64 $(OUTPUT)test-libunwind-debug-frame-arm.bin: - $(BUILD) -lelf -lunwind-arm + $(BUILD) -lelf -llzma -lunwind-arm $(OUTPUT)test-libunwind-debug-frame-aarch64.bin: - $(BUILD) -lelf -lunwind-aarch64 + $(BUILD) -lelf -llzma -lunwind-aarch64 $(OUTPUT)test-libaudit.bin: $(BUILD) -laudit -- cgit From f42596c73872b753ff1799bc7fc79b1100226da1 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Jul 2024 09:22:10 +0100 Subject: perf: build: Link lib 'zstd' for static build When building a static perf, the Makefile reports the error: Makefile.config:480: No libdw DWARF unwind found, Please install elfutils-devel/libdw-dev >= 0.158 and/or set LIBDW_DIR The libdw library is installed on the system, but the build system fails to build the feature-detection binary 'test-libdw-dwarf-unwind'. The failure is caused by not linking the 'zstd' library. Link 'zstd' for the static build so that the DWARF feature can be enabled in the static perf binary.
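As a sanity check (the output shown is only an example; the private library list depends on the elfutils version and build options), pkg-config can report the extra libraries a static link against libdw needs:

  $ pkg-config --static --libs libdw
  -ldw -lelf -lz -llzma -lbz2 -lzstd -ldl -lpthread

Newer elfutils releases can read zstd-compressed ELF sections, which is why -lzstd now has to appear on the feature-test link line alongside the other compression libraries.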
Signed-off-by: Leo Yan Tested-by: Ian Rogers Cc: amadio@gentoo.org Cc: James Clark Cc: Thomas Richter Link: https://lore.kernel.org/r/20240717082211.524826-6-leo.yan@arm.com Signed-off-by: Namhyung Kim --- tools/build/feature/Makefile | 2 +- tools/perf/Makefile.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 2d5be5c17d65..4af6a9582f15 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -170,7 +170,7 @@ $(OUTPUT)test-libopencsd.bin: DWARFLIBS := -ldw ifeq ($(findstring -static,${LDFLAGS}),-static) - DWARFLIBS += -lelf -lz -llzma -lbz2 + DWARFLIBS += -lelf -lz -llzma -lbz2 -lzstd LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw) LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION))) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 7cf0a39bc200..fa679db61f62 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -152,7 +152,7 @@ ifdef LIBDW_DIR endif DWARFLIBS := -ldw ifeq ($(findstring -static,${LDFLAGS}),-static) - DWARFLIBS += -lelf -ldl -lz -llzma -lbz2 + DWARFLIBS += -lelf -ldl -lz -llzma -lbz2 -lzstd LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw) LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION))) -- cgit From d27087c76e3c859ea05b7581ef7ce8aa5a088dd8 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Jul 2024 09:22:11 +0100 Subject: perf docs: Document cross compilation Records the commands for cross compilation with two methods. The first method relies on Multiarch. The second approach is to explicitly specify the PKG_CONFIG variables, which is widely used in build system (like Buildroot, Yocto, etc). Co-developed-by: James Clark Signed-off-by: James Clark Signed-off-by: Leo Yan Tested-by: Ian Rogers Cc: amadio@gentoo.org Cc: Thomas Richter Link: https://lore.kernel.org/r/20240717082211.524826-7-leo.yan@arm.com Signed-off-by: Namhyung Kim --- tools/perf/Documentation/Build.txt | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/perf/Documentation/Build.txt b/tools/perf/Documentation/Build.txt index 3766886c4bca..83dc87c662b6 100644 --- a/tools/perf/Documentation/Build.txt +++ b/tools/perf/Documentation/Build.txt @@ -71,3 +71,31 @@ supported by GCC. UBSan detects undefined behaviors of programs at runtime. $ UBSAN_OPTIONS=print_stacktrace=1 ./perf record -a If UBSan detects any problem at runtime, it outputs a “runtime error:” message. + +4) Cross compilation +==================== +As Multiarch is commonly supported in Linux distributions, we can install +libraries for multiple architectures on the same system and then cross-compile +Linux perf. For example, Aarch64 libraries and toolchains can be installed on +an x86_64 machine, allowing us to compile perf for an Aarch64 target. + +Below is the command for building the perf with dynamic linking. + + $ cd /path/to/Linux + $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/perf + +For static linking, the option `LDFLAGS="-static"` is required. 
+ + $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- \ + LDFLAGS="-static" -C tools/perf + +In the embedded system world, a use case is to explicitly specify the package +configuration paths for cross building: + + $ PKG_CONFIG_SYSROOT_DIR="/path/to/cross/build/sysroot" \ + PKG_CONFIG_LIBDIR="/usr/lib/:/usr/local/lib" \ + make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/perf + +In this case, the variable PKG_CONFIG_SYSROOT_DIR can be used alongside the +variable PKG_CONFIG_LIBDIR or PKG_CONFIG_PATH to prepend the sysroot path to +the library paths for cross compilation. -- cgit From f0e4ed752fda6997b41917c94a5478b340178001 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 22 Jul 2024 22:11:03 +0200 Subject: tools/power turbostat: Add early support for PMT counters Allows users to read Intel PMT (Platform Monitoring Technology) counters, providing interface similar to one used to add MSR and perf counters. Because PMT is exposed as a raw MMIO range, without metadata, user has to supply the necessary information to find and correctly display the requested counter. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 768 +++++++++++++++++++++++++++++++++- 1 file changed, 766 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 86bbea7d8e57..c9fdfb074ce9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -1082,6 +1083,9 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi #define MAX_ADDED_THREAD_COUNTERS 24 #define MAX_ADDED_CORE_COUNTERS 8 #define MAX_ADDED_PACKAGE_COUNTERS 16 +#define PMT_MAX_ADDED_THREAD_COUNTERS 24 +#define PMT_MAX_ADDED_CORE_COUNTERS 8 +#define PMT_MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 #define ZERO_ARRAY(arr) (memset(arr, 0, sizeof(arr)) + __must_be_array(arr)) @@ -1466,6 +1470,100 @@ static struct msr_counter_arch_info msr_counter_arch_infos[] = { }, }; +/* Can be redefined when compiling, useful for testing. */ +#ifndef SYSFS_TELEM_PATH +#define SYSFS_TELEM_PATH "/sys/class/intel_pmt" +#endif + +#define PMT_COUNTER_NAME_SIZE_BYTES 16 +#define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32 + +struct pmt_mmio { + struct pmt_mmio *next; + + unsigned int guid; + unsigned int size; + + /* Base pointer to the mmaped memory. */ + void *mmio_base; + + /* + * Offset to be applied to the mmio_base + * to get the beginning of the PMT counters for given GUID. + */ + unsigned long pmt_offset; +} *pmt_mmios; + +enum pmt_datatype { + PMT_TYPE_RAW, +}; + +struct pmt_domain_info { + /* + * Pointer to the MMIO obtained by applying a counter offset + * to the mmio_base of the mmaped region for the given GUID. + * + * This is where to read the raw value of the counter from. 
+ */ + unsigned long *pcounter; +}; + +struct pmt_counter { + struct pmt_counter *next; + + /* PMT metadata */ + char name[PMT_COUNTER_NAME_SIZE_BYTES]; + enum pmt_datatype type; + enum counter_scope scope; + unsigned int lsb; + unsigned int msb; + + /* BIC-like metadata */ + enum counter_format format; + + unsigned int num_domains; + struct pmt_domain_info *domains; +}; + +unsigned int pmt_counter_get_width(const struct pmt_counter *p) +{ + return (p->msb - p->lsb)+1; +} + +void pmt_counter_resize_(struct pmt_counter *pcounter, unsigned int new_size) +{ + struct pmt_domain_info *new_mem; + + new_mem = (struct pmt_domain_info*) reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains)); + if (!new_mem) { + fprintf(stderr, "%s: failed to allocate memory for PMT counters\n", __func__); + exit(1); + } + + /* Zero initialize just allocated memory. */ + const size_t num_new_domains = new_size - pcounter->num_domains; + + memset(&new_mem[pcounter->num_domains], 0, num_new_domains * sizeof(*pcounter->domains)); + + pcounter->num_domains = new_size; + pcounter->domains = new_mem; +} + +void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size) +{ + /* + * Allocate more memory ahead of time. + * + * Always allocate space for at least 8 elements + * and double the size when growing. + */ + if (new_size < 8) + new_size = 8; + new_size = MAX(new_size, pcounter->num_domains*2); + + pmt_counter_resize_(pcounter, new_size); +} + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1484,6 +1582,7 @@ struct thread_data { bool is_atom; unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_THREAD_COUNTERS]; } *thread_even, *thread_odd; struct core_data { @@ -1498,6 +1597,7 @@ struct core_data { unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1532,6 +1632,7 @@ struct pkg_data { unsigned int uncore_mhz; unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1681,6 +1782,10 @@ struct sys_counters { struct perf_counter_info *perf_tp; struct perf_counter_info *perf_cp; struct perf_counter_info *perf_pp; + + struct pmt_counter *pmt_tp; + struct pmt_counter *pmt_cp; + struct pmt_counter *pmt_pp; } sys; static size_t free_msr_counters_(struct msr_counter **pp) @@ -1981,6 +2086,7 @@ void help(void) " -a, --add add a counter\n" " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" " eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n" + " eg. 
--add pmt,name=XTAL,type=raw,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102\n" " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" @@ -2092,6 +2198,7 @@ void print_header(char *delim) { struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; int printed = 0; if (DO_BIC(BIC_USEC)) @@ -2164,6 +2271,21 @@ void print_header(char *delim) } } + ppmt = sys.pmt_tp; + while (ppmt) { + switch(ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + } + + ppmt = ppmt->next; + } + if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU_c3)) @@ -2219,6 +2341,21 @@ void print_header(char *delim) } } + ppmt = sys.pmt_cp; + while (ppmt) { + switch(ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + } + + ppmt = ppmt->next; + } + if (DO_BIC(BIC_PkgTmp)) outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : "")); @@ -2329,6 +2466,21 @@ void print_header(char *delim) } } + ppmt = sys.pmt_pp; + while (ppmt) { + switch(ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + } + + ppmt = ppmt->next; + } + outp += sprintf(outp, "\n"); } @@ -2450,6 +2602,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; char *delim = "\t"; int printed = 0; @@ -2612,6 +2765,19 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)t->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]); + + break; + } + } + /* C1 */ if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc); @@ -2673,6 +2839,19 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)c->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]); + + break; + } + } + fmt8 = "%s%.2f"; if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) @@ -2842,9 +3021,23 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]); } else if (pp->format == FORMAT_PERCENT) { outp += sprintf(outp, "%s%.2f", (printed++ ? 
delim : ""), 100.0 * p->perf_counter[i] / tsc); - } else if (pp->type == COUNTER_K2M) + } else if (pp->type == COUNTER_K2M) { outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000); + } + } + + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)p->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]); + + break; + } } done: @@ -2901,6 +3094,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; if (DO_BIC(BIC_Totl_c0)) old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0; @@ -2970,6 +3164,13 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; } + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } + return 0; } @@ -2978,6 +3179,7 @@ void delta_core(struct core_data *new, struct core_data *old) int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; old->c3 = new->c3 - old->c3; old->c6 = new->c6 - old->c6; @@ -3001,6 +3203,13 @@ void delta_core(struct core_data *new, struct core_data *old) else old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; } + + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } } int soft_c1_residency_display(int bic) @@ -3019,6 +3228,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; /* we run cpuid just the 1st time, copy the results */ if (DO_BIC(BIC_APIC)) @@ -3105,6 +3315,13 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; } + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } + return 0; } @@ -3211,6 +3428,10 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data memset(&t->perf_counter[0], 0, sizeof(t->perf_counter)); memset(&c->perf_counter[0], 0, sizeof(c->perf_counter)); memset(&p->perf_counter[0], 0, sizeof(p->perf_counter)); + + memset(&t->pmt_counter[0], 0, ARRAY_SIZE(t->pmt_counter)); + memset(&c->pmt_counter[0], 0, ARRAY_SIZE(c->pmt_counter)); + memset(&p->pmt_counter[0], 0, ARRAY_SIZE(p->pmt_counter)); } void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src) @@ -3232,6 +3453,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; /* copy un-changing apic_id's */ if (DO_BIC(BIC_APIC)) @@ -3268,6 +3490,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.threads.perf_counter[i] += t->perf_counter[i]; } + for (i 
= 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + average.threads.pmt_counter[i] += t->pmt_counter[i]; + } + /* sum per-core values only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) return 0; @@ -3294,6 +3520,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.cores.perf_counter[i] += c->perf_counter[i]; } + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + average.cores.pmt_counter[i] += c->pmt_counter[i]; + } + /* sum per-pkg values only for 1st core in pkg */ if (!is_cpu_first_core_in_package(t, c, p)) return 0; @@ -3353,6 +3583,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.perf_counter[i] += p->perf_counter[i]; } + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + average.packages.pmt_counter[i] += p->pmt_counter[i]; + } + return 0; } @@ -3365,6 +3599,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; clear_counters(&average.threads, &average.cores, &average.packages); @@ -3465,6 +3700,16 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data } average.packages.perf_counter[i] /= topo.allowed_packages; } + + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + average.threads.pmt_counter[i] /= topo.allowed_cpus; + } + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + average.cores.pmt_counter[i] /= topo.allowed_cores; + } + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + average.packages.pmt_counter[i] /= topo.allowed_packages; + } } static unsigned long long rdtsc(void) @@ -4120,6 +4365,32 @@ int perf_counter_info_read_values(struct perf_counter_info *pp, int cpu, unsigne return 0; } +unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb) +{ + unsigned long mask; + + if (msb == 63) + mask = 0xffffffffffffffff; + else + mask = ((1<<(msb+1))-1); + + mask -= (1<num_domains); + + const unsigned long *pmmio = ppmt->domains[domain_id].pcounter; + const unsigned long value = pmmio ? *pmmio : 0; + const unsigned long value_mask = pmt_gen_value_mask(ppmt->lsb, ppmt->msb); + const unsigned long value_shift = ppmt->lsb; + + return (value & value_mask) >> value_shift; +} + /* * get_counters(...) 
* migrate to cpu @@ -4130,6 +4401,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) int cpu = t->cpu_id; unsigned long long msr; struct msr_counter *mp; + struct pmt_counter *pp; int i; int status; @@ -4164,6 +4436,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (perf_counter_info_read_values(sys.perf_tp, cpu, t->perf_counter, MAX_ADDED_THREAD_COUNTERS)) return -10; + for (i = 0, pp = sys.pmt_tp; pp; i++, pp = pp->next) + t->pmt_counter[i] = pmt_read_counter(pp, t->cpu_id); + /* collect core counters only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) goto done; @@ -4205,6 +4480,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (perf_counter_info_read_values(sys.perf_cp, cpu, c->perf_counter, MAX_ADDED_CORE_COUNTERS)) return -10; + for (i = 0, pp = sys.pmt_cp; pp; i++, pp = pp->next) + c->pmt_counter[i] = pmt_read_counter(pp, c->core_id); + /* collect package counters only for 1st core in package */ if (!is_cpu_first_core_in_package(t, c, p)) goto done; @@ -4281,6 +4559,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (perf_counter_info_read_values(sys.perf_pp, cpu, p->perf_counter, MAX_ADDED_PACKAGE_COUNTERS)) return -10; + for (i = 0, pp = sys.pmt_pp; pp; i++, pp = pp->next) + p->pmt_counter[i] = pmt_read_counter(pp, p->package_id); + done: gettimeofday(&t->tv_end, (struct timezone *)NULL); @@ -8285,6 +8566,304 @@ void added_perf_counters_init(void) errx(1, "%s: %s", __func__, "package"); } +int parse_telem_info_file(int fd_dir, const char *info_filename, const char *format, unsigned long *output) +{ + int fd_telem_info; + FILE *file_telem_info; + unsigned long value; + + fd_telem_info = openat(fd_dir, info_filename, O_RDONLY); + if (fd_telem_info == -1) + return -1; + + file_telem_info = fdopen(fd_telem_info, "r"); + if (file_telem_info == NULL) { + close(fd_telem_info); + return -1; + } + + if (fscanf(file_telem_info, format, &value) != 1) { + fclose(file_telem_info); + return -1; + } + + fclose(file_telem_info); + + *output = value; + + return 0; +} + +struct pmt_mmio* pmt_mmio_open(unsigned int target_guid) +{ + DIR *dirp; + struct dirent *entry; + struct stat st; + unsigned int telem_idx; + int fd_telem_dir, fd_pmt; + unsigned long guid, size, offset; + size_t mmap_size; + void *mmio; + struct pmt_mmio *ret = NULL; + + if (stat(SYSFS_TELEM_PATH, &st) == -1) + return NULL; + + dirp = opendir(SYSFS_TELEM_PATH); + if (dirp == NULL) + return NULL; + + for (;;) { + entry = readdir(dirp); + + if (entry == NULL) + break; + + if (strcmp(entry->d_name, ".") == 0) + continue; + + if (strcmp(entry->d_name, "..") == 0) + continue; + + if (sscanf(entry->d_name, "telem%u", &telem_idx) != 1) + continue; + + if (fstatat(dirfd(dirp), entry->d_name, &st, 0) == -1) { + break; + } + + if (!S_ISDIR(st.st_mode)) + continue; + + fd_telem_dir = openat(dirfd(dirp), entry->d_name, O_RDONLY); + if (fd_telem_dir == -1) { + break; + } + + if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) { + close(fd_telem_dir); + break; + } + + if (parse_telem_info_file(fd_telem_dir, "size", "%lu", &size)) { + close(fd_telem_dir); + break; + } + + if (guid != target_guid) { + close(fd_telem_dir); + continue; + } + + if (parse_telem_info_file(fd_telem_dir, "offset", "%lu", &offset)) { + close(fd_telem_dir); + break; + } + + assert(offset == 0); + + fd_pmt = openat(fd_telem_dir, "telem", O_RDONLY); + if (fd_pmt == -1) + goto 
loop_cleanup_and_break; + + mmap_size = (size + 0x1000UL) & (~0x1000UL); + mmio = mmap(0, mmap_size, PROT_READ, MAP_SHARED, fd_pmt, 0); + if (mmio != MAP_FAILED) { + + if (debug) + fprintf(stderr, "%s: 0x%lx mmaped at: %p\n", __func__, guid, mmio); + + ret = calloc(1, sizeof(*ret)); + + if (!ret) { + fprintf(stderr, "%s: Failed to allocate pmt_mmio\n", __func__); + exit(1); + } + + ret->guid = guid; + ret->mmio_base = mmio; + ret->pmt_offset = offset; + ret->size = size; + + ret->next = pmt_mmios; + pmt_mmios = ret; + } + +loop_cleanup_and_break: + close(fd_pmt); + close(fd_telem_dir); + break; + } + + closedir(dirp); + + return ret; +} + +struct pmt_mmio* pmt_mmio_find(unsigned int guid) +{ + struct pmt_mmio *pmmio = pmt_mmios; + + while (pmmio) { + if (pmmio->guid == guid) + return pmmio; + + pmmio = pmmio->next; + } + + return NULL; +} + +void* pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset) +{ + char *ret; + + /* Get base of mmaped PMT file. */ + ret = (char*)pmmio->mmio_base; + + /* + * Apply PMT MMIO offset to obtain beginning of the mmaped telemetry data. + * It's not guaranteed that the mmaped memory begins with the telemetry data + * - we might have to apply the offset first. + */ + ret += pmmio->pmt_offset; + + /* Apply the counter offset to get the address to the mmaped counter. */ + ret += counter_offset; + + return ret; +} + +struct pmt_mmio* pmt_add_guid(unsigned int guid) +{ + struct pmt_mmio *ret; + + ret = pmt_mmio_find(guid); + if (!ret) + ret = pmt_mmio_open(guid); + + return ret; +} + +enum pmt_open_mode { + PMT_OPEN_TRY, /* Open failure is not an error. */ + PMT_OPEN_REQUIRED, /* Open failure is a fatal error. */ +}; + +struct pmt_counter* pmt_find_counter(struct pmt_counter *pcounter, const char *name) +{ + while (pcounter) { + if (strcmp(pcounter->name, name) == 0) + break; + + pcounter = pcounter->next; + } + + return pcounter; +} + +struct pmt_counter** pmt_get_scope_root(enum counter_scope scope) +{ + switch(scope) { + case SCOPE_CPU: + return &sys.pmt_tp; + case SCOPE_CORE: + return &sys.pmt_cp; + case SCOPE_PACKAGE: + return &sys.pmt_pp; + } + + __builtin_unreachable(); +} + +void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, unsigned int domain_id) +{ + /* Make sure the new domain fits. 
*/ + if (domain_id >= pcounter->num_domains) + pmt_counter_resize(pcounter, domain_id+1); + + assert(pcounter->domains); + assert(domain_id < pcounter->num_domains); + + pcounter->domains[domain_id].pcounter = pmmio; +} + +int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, + unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope, + enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode) +{ + struct pmt_mmio *mmio; + struct pmt_counter *pcounter; + struct pmt_counter ** const pmt_root = pmt_get_scope_root(scope); + bool new_counter = false; + int conflict = 0; + + if (lsb > msb) { + fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "lsb <= msb", name); + exit(1); + } + + if (msb >= 64) { + fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "msb < 64", name); + exit(1); + } + + mmio = pmt_add_guid(guid); + if (!mmio) { + if (mode != PMT_OPEN_TRY) { + fprintf(stderr, "%s: failed to map PMT MMIO for guid %x\n", __func__, guid); + exit(1); + } + + return 1; + } + + if (offset >= mmio->size) { + if (mode != PMT_OPEN_TRY) { + fprintf(stderr, "%s: offset %u outside of PMT MMIO size %u\n", __func__, offset, mmio->size); + exit(1); + } + + return 1; + } + + pcounter = pmt_find_counter(*pmt_root, name); + if (!pcounter) { + pcounter = calloc(1, sizeof(*pcounter)); + new_counter = true; + } + + if (new_counter) { + strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name)-1); + pcounter->type = type; + pcounter->scope = scope; + pcounter->lsb = lsb; + pcounter->msb = msb; + pcounter->format = format; + } else { + conflict += pcounter->type != type; + conflict += pcounter->scope != scope; + conflict += pcounter->lsb != lsb; + conflict += pcounter->msb != msb; + conflict += pcounter->format != format; + } + + if (conflict) { + fprintf(stderr, "%s: conflicting parameters for the PMT counter with the same name %s\n", + __func__, name); + exit(1); + } + + pmt_counter_add_domain(pcounter, pmt_get_counter_pointer(mmio, offset), domain_id); + + if (new_counter) { + pcounter->next = *pmt_root; + *pmt_root = pcounter; + } + + return 0; +} + void turbostat_init() { setup_all_buffers(true); @@ -8622,7 +9201,7 @@ int add_perf_counter(const char *perf_device, const char *perf_event, const char return 0; } -void parse_add_command(char *add_command) +void parse_add_command_msr(char *add_command) { int msr_num = 0; char *path = NULL; @@ -8747,6 +9326,191 @@ next: } } +bool starts_with(const char *str, const char *prefix) +{ + return strncmp(prefix, str, strlen(prefix)) == 0; +} + +void parse_add_command_pmt(char *add_command) +{ + char *name = NULL; + char *type_name = NULL; + char *format_name = NULL; + unsigned int offset; + unsigned int lsb; + unsigned int msb; + unsigned int guid; + unsigned int domain_id; + enum counter_scope scope = 0; + enum pmt_datatype type = PMT_TYPE_RAW; + enum counter_format format = FORMAT_RAW; + bool has_offset = false; + bool has_lsb = false; + bool has_msb = false; + bool has_format = true; /* Format has a default value. */ + bool has_guid = false; + bool has_scope = false; + bool has_type = true; /* Type has a default value. */ + + /* Consume the "pmt," prefix. 
*/ + add_command = strchr(add_command, ','); + if (!add_command) { + help(); + exit(1); + } + ++add_command; + + while (add_command) { + if (starts_with(add_command, "name=")) { + name = add_command + strlen("name="); + goto next; + } + + if (starts_with(add_command, "type=")) { + type_name = add_command + strlen("type="); + goto next; + } + + if (starts_with(add_command, "domain=")) { + const size_t prefix_len = strlen("domain="); + + if (sscanf(add_command + prefix_len, "cpu%u", &domain_id) == 1) { + scope = SCOPE_CPU; + has_scope = true; + } else if (sscanf(add_command + prefix_len, "core%u", &domain_id) == 1) { + scope = SCOPE_CORE; + has_scope = true; + } else if (sscanf(add_command + prefix_len, "package%u", &domain_id) == 1) { + scope = SCOPE_PACKAGE; + has_scope = true; + } + + if (!has_scope) { + printf("%s: invalid value for scope. Expected cpu%%u, core%%u or package%%u.\n", + __func__); + exit(1); + } + + goto next; + } + + if (starts_with(add_command, "format=")) { + format_name = add_command + strlen("format="); + goto next; + } + + if (sscanf(add_command, "offset=%u", &offset) == 1) { + has_offset = true; + goto next; + } + + if (sscanf(add_command, "lsb=%u", &lsb) == 1) { + has_lsb = true; + goto next; + } + + if (sscanf(add_command, "msb=%u", &msb) == 1) { + has_msb = true; + goto next; + } + + if (sscanf(add_command, "guid=%x", &guid) == 1) { + has_guid = true; + goto next; + } + +next: + add_command = strchr(add_command, ','); + if (add_command) { + *add_command = '\0'; + add_command++; + } + } + + if (!name) { + printf("%s: missing %s\n", __func__, "name"); + exit(1); + } + + if (strlen(name) >= PMT_COUNTER_NAME_SIZE_BYTES) { + printf("%s: name has to be at most %d characters long\n", + __func__, PMT_COUNTER_NAME_SIZE_BYTES); + exit(1); + } + + if (format_name) { + has_format = false; + + if (strcmp("raw", format_name) == 0) { + format = FORMAT_RAW; + has_format = true; + } + + if (strcmp("delta", format_name) == 0) { + format = FORMAT_DELTA; + has_format = true; + } + + if (!has_format) { + fprintf(stderr, "%s: Invalid format %s. 
Expected raw or delta\n", __func__, format_name); + exit(1); + } + } + + if (type_name) { + has_type = false; + + if (strcmp("raw", type_name) == 0) { + type = PMT_TYPE_RAW; + has_type = true; + } + + if (!has_type) { + printf("%s: invalid %s: %s\n", __func__, "type", type_name); + exit(1); + } + } + + if (!has_offset) { + printf("%s : missing %s\n", __func__, "offset"); + exit(1); + } + + if (!has_lsb) { + printf("%s: missing %s\n", __func__, "lsb"); + exit(1); + } + + if (!has_msb) { + printf("%s: missing %s\n", __func__, "msb"); + exit(1); + } + + if (!has_guid) { + printf("%s: missing %s\n", __func__, "guid"); + exit(1); + } + + if (!has_scope) { + printf("%s: missing %s\n", __func__, "scope"); + exit(1); + } + + if (lsb > msb) { + printf("%s: lsb > msb doesn't make sense\n", __func__); + exit(1); + } + + pmt_add_counter(guid, name, type, lsb, msb, offset, scope, format, domain_id, PMT_OPEN_REQUIRED); +} + +void parse_add_command(char *add_command) +{ + if (strncmp(add_command, "pmt", strlen("pmt")) == 0) + return parse_add_command_pmt(add_command); + return parse_add_command_msr(add_command); +} + int is_deferred_add(char *name) { int i; -- cgit From 640540beb88363a825524295664acfdb0f5d5fc2 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 22 Jul 2024 22:12:22 +0200 Subject: tools/power turbostat: Add MTL's PMT DC6 builtin counter Provide a definition for metadata that allows reading DC6 residency counter via PMT and exposes it as a builtin counter. Note that this residency counter is updated and read via entirely different mechanisms vs the MSR-based residency counters. On MTL processors, there are times when Die%c6 will report above 100%. This is still useful, but don't expect 3 digits of precision... Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 70 ++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c9fdfb074ce9..f769e8beabd3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -193,6 +193,7 @@ struct msr_counter bic[] = { { 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Die%c6", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -254,11 +255,12 @@ struct msr_counter bic[] = { #define BIC_SAM_mc6 (1ULL << 55) #define BIC_SAMMHz (1ULL << 56) #define BIC_SAMACTMHz (1ULL << 57) +#define BIC_Diec6 (1ULL << 58) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) #define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6) +#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | 
BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -1475,6 +1477,11 @@ static struct msr_counter_arch_info msr_counter_arch_infos[] = { #define SYSFS_TELEM_PATH "/sys/class/intel_pmt" #endif +#define PMT_COUNTER_MTL_DC6_OFFSET 120 +#define PMT_COUNTER_MTL_DC6_LSB 0 +#define PMT_COUNTER_MTL_DC6_MSB 63 +#define PMT_MTL_DC6_GUID 0x1a067102 + #define PMT_COUNTER_NAME_SIZE_BYTES 16 #define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32 @@ -1496,6 +1503,7 @@ struct pmt_mmio { enum pmt_datatype { PMT_TYPE_RAW, + PMT_TYPE_XTAL_TIME, }; struct pmt_domain_info { @@ -1630,6 +1638,7 @@ struct pkg_data { struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; + unsigned long long die_c6; unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS]; unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS]; @@ -2281,6 +2290,10 @@ void print_header(char *delim) outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; } ppmt = ppmt->next; @@ -2351,6 +2364,10 @@ void print_header(char *delim) outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; } ppmt = ppmt->next; @@ -2400,6 +2417,8 @@ void print_header(char *delim) outp += sprintf(outp, "%sPkg%%pc9", (printed++ ? delim : "")); if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%sPk%%pc10", (printed++ ? delim : "")); + if (DO_BIC(BIC_Diec6)) + outp += sprintf(outp, "%sDie%%c6", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU_LPI)) outp += sprintf(outp, "%sCPU%%LPI", (printed++ ? delim : "")); if (DO_BIC(BIC_SYS_LPI)) @@ -2476,6 +2495,10 @@ void print_header(char *delim) outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; } ppmt = ppmt->next; @@ -2775,6 +2798,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]); break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = t->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + break; } } @@ -2849,6 +2879,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]); break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = c->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + break; } } @@ -2931,6 +2968,9 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); + if (DO_BIC(BIC_Diec6)) + outp += sprintf(outp, "%s%.2f", (printed++ ? 
delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float); + if (DO_BIC(BIC_CPU_LPI)) { if (p->cpu_lpi >= 0) outp += @@ -3037,6 +3077,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]); break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = p->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + break; } } @@ -3115,6 +3162,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->pc8 = new->pc8 - old->pc8; old->pc9 = new->pc9 - old->pc9; old->pc10 = new->pc10 - old->pc10; + old->die_c6 = new->die_c6 - old->die_c6; old->cpu_lpi = new->cpu_lpi - old->cpu_lpi; old->sys_lpi = new->sys_lpi - old->sys_lpi; old->pkg_temp_c = new->pkg_temp_c; @@ -3398,6 +3446,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->pc8 = 0; p->pc9 = 0; p->pc10 = 0; + p->die_c6 = 0; p->cpu_lpi = 0; p->sys_lpi = 0; @@ -3547,6 +3596,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.pc8 += p->pc8; average.packages.pc9 += p->pc9; average.packages.pc10 += p->pc10; + average.packages.die_c6 += p->die_c6; average.packages.cpu_lpi = p->cpu_lpi; average.packages.sys_lpi = p->sys_lpi; @@ -3642,6 +3692,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data average.packages.pc8 /= topo.allowed_packages; average.packages.pc9 /= topo.allowed_packages; average.packages.pc10 /= topo.allowed_packages; + average.packages.die_c6 /= topo.allowed_packages; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -5505,6 +5556,7 @@ void msr_perf_init(void); void rapl_perf_init(void); void cstate_perf_init(void); void added_perf_counters_init(void); +void pmt_init(void); void re_initialize(void) { @@ -5515,6 +5567,7 @@ void re_initialize(void) rapl_perf_init(); cstate_perf_init(); added_perf_counters_init(); + pmt_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -8864,6 +8917,15 @@ int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, return 0; } +void pmt_init(void) +{ + if (BIC_IS_ENABLED(BIC_Diec6)) { + pmt_add_counter(PMT_MTL_DC6_GUID, "Die%c6", PMT_TYPE_XTAL_TIME, PMT_COUNTER_MTL_DC6_LSB, + PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, SCOPE_PACKAGE, FORMAT_DELTA, + 0, PMT_OPEN_TRY); + } +} + void turbostat_init() { setup_all_buffers(true); @@ -8878,6 +8940,7 @@ void turbostat_init() rapl_perf_init(); cstate_perf_init(); added_perf_counters_init(); + pmt_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); @@ -9465,6 +9528,11 @@ next: has_type = true; } + if (strcmp("txtal_time", type_name) == 0) { + type = PMT_TYPE_XTAL_TIME; + has_type = true; + } + if (!has_type) { printf("%s: invalid %s: %s\n", __func__, "type", type_name); exit(1); -- cgit From 944264a2a99cc4dd4b10eaa9798b6aab80adf4be Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 23 Jul 2024 20:12:32 +0200 Subject: tools/power turbostat: Document PMT in turbostat.8 Add a general description of the user interface for adding PMT counters with the new --add pmt,... option. Provide a complete example for requesting two counters. 
Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 65 +++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 44cbc7cb382d..067717bce1d4 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -55,6 +55,39 @@ name as necessary to disambiguate it from others is necessary. Note that option as the column header. .fi .PP +\fB--add pmt,[attr_name=attr_value, ...]\fP add column with a PMT (Intel Platform Monitoring Technology) counter in a similar way to --add option above, but require PMT metadata to be supplied to correctly read and display the counter. The metadata can be found in the Intel PMT XML files, hosted at https://github.com/intel/Intel-PMT. For a complete example see "ADD PMT COUNTER EXAMPLE". +.nf + name="name_string" + For column header. + + type={\fBraw\fP} + 'raw' shows the counter contents in hex. + default: raw + + format={\fBraw\fP | \fBdelta\fP} + 'raw' shows the counter contents in hex. + 'delta' shows the difference in values during the measurement interval. + default: raw + + domain={\fBcpu%u\fP | \fBcore%u\fP | \fBpackage%u\fP} + 'cpu' per cpu/thread counter. + 'core' per core counter. + 'package' per package counter. + '%u' denotes id of the domain that the counter is associated with. For example core4 would mean that the counter is associated with core number 4. + + offset=\fB%u\fP + '%u' offset within the PMT MMIO region. + + lsb=\fB%u\fP + '%u' least significant bit within the 64 bit value read from 'offset'. Together with 'msb', used to form a read mask. + + msb=\fB%u\fP + '%u' most significant bit within the 64 bit value read from 'offset'. Together with 'lsb', used to form a read mask. + + guid=\fB%x\fP + '%x' hex identifier of the PMT MMIO region. +.fi +.PP \fB--cpu cpu-set\fP limit output to system summary plus the specified cpu-set. If cpu-set is the string "core", then the system summary plus the first CPU in each core are printed -- eg. subsequent HT siblings are not printed. Or if cpu-set is the string "package", then the system summary plus the first CPU in each package is printed. Otherwise, the system summary plus the specified set of CPUs are printed. The cpu-set is ordered from low to high, comma delimited with ".." and "-" permitted to denote a range. eg. 1,2,8,14..17,21-44 .PP \fB--hide column\fP do not show the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. @@ -354,6 +387,38 @@ CPU pCPU%c1 CPU%c1 .fi +.SH ADD PMT COUNTER EXAMPLE +Here we limit turbostat to showing just the CPU number 0. +We add two counters, showing crystal clock count and the DC6 residency. +All the parameters passed are based on the metadata found in the PMT XML files. + +For the crystal clock count, we +label it with the column header, "XTAL", +we set the type to 'raw', to read the number of clock ticks in hex, +we set the format to 'delta', to display the difference in ticks during the measurement interval, +we set the domain to 'package0', to collect it and associate it with the whole package number 0, +we set the offset to '0', which is a offset of the counter within the PMT MMIO region, +we set the lsb and msb to cover all 64 bits of the read 64 bit value, +and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value. 
+ +For the DC6 residency counter, we +label it with the column header, "Die%c6", +we set the type to 'txtal_time', to obtain the percent residency value +we set the format to 'delta', to display the difference in ticks during the measurement interval, +we set the domain to 'package0', to collect it and associate it with the whole package number 0, +we set the offset to '0', which is a offset of the counter within the PMT MMIO region, +we set the lsb and msb to cover all 64 bits of the read 64 bit value, +and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value. + +.nf +sudo ./turbostat --quiet --cpu 0 --show CPU --add pmt,name=XTAL,type=raw,format=delta,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102 --add pmt,name=Die%c6,type=txtal_time,format=delta,domain=package0,offset=120,lsb=0,msb=63,guid=0x1a067102 +0.104352 sec +CPU XTAL Die%c6 +- 0x0000006d4d957ca7 0.00 +0 0x0000006d4d957ca7 0.00 +0.102448 sec +.fi + .SH INPUT For interval-mode, turbostat will immediately end the current interval -- cgit From 19d076903b95896ce55c7cc3679f795731591ac6 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 24 Jul 2024 13:17:30 +0200 Subject: tools/power turbostat: Include umask=%x in perf counter's config Some counters, like cpu/cache-misses/, expose and require umask=%x parameter alongside event=%x in the sysfs perf counter's event file. This change make sure we parse and use it when opening user added counters. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 60 +++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f769e8beabd3..ed5f032c6b3f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4017,15 +4017,55 @@ static unsigned int read_perf_type(const char *subsys) return read_perf_counter_info_n(path, format); } -static unsigned int read_rapl_config(const char *subsys, const char *event_name) +static unsigned int read_perf_config(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; - const char *const format = "event=%x"; + FILE *fconfig = NULL; char path[128]; + char config_str[64]; + unsigned int config; + unsigned int umask; + bool has_config = false; + bool has_umask = false; + unsigned int ret = -1; snprintf(path, sizeof(path), path_format, subsys, event_name); - return read_perf_counter_info_n(path, format); + fconfig = fopen(path, "r"); + if (!fconfig) + return -1; + + if (fgets(config_str, ARRAY_SIZE(config_str), fconfig) != config_str) + goto cleanup_and_exit; + + for (char *pconfig_str = &config_str[0]; pconfig_str;) { + if (sscanf(pconfig_str, "event=%x", &config) == 1) { + has_config = true; + goto next; + } + + if (sscanf(pconfig_str, "umask=%x", &umask) == 1) { + has_umask = true; + goto next; + } + + next: + pconfig_str = strchr(pconfig_str, ','); + if (pconfig_str) { + *pconfig_str = '\0'; + ++pconfig_str; + } + } + + if (!has_umask) + umask = 0; + + if (has_config) + ret = (umask << 8) | config; + +cleanup_and_exit: + fclose(fconfig); + return ret; } static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name) @@ -4044,7 +4084,7 @@ static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_na return RAPL_UNIT_INVALID; } -static double 
read_perf_rapl_scale(const char *subsys, const char *event_name) +static double read_perf_scale(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.scale"; const char *const format = "%lf"; @@ -7425,7 +7465,7 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc if (no_perf) return -1; - const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + const double scale = read_perf_scale(cai->perf_subsys, cai->perf_name); if (scale == 0.0) return -1; @@ -7436,7 +7476,7 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc return -1; const unsigned int rapl_type = read_perf_type(cai->perf_subsys); - const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int rapl_energy_pkg_config = read_perf_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); @@ -7598,7 +7638,7 @@ int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const s return -1; const unsigned int type = read_perf_type(cai->perf_subsys); - const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP); @@ -7628,7 +7668,7 @@ int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct return -1; const unsigned int type = read_perf_type(cai->perf_subsys); - const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP); @@ -8571,7 +8611,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo) continue; } - perf_config = read_rapl_config(pinfo->device, pinfo->event); + perf_config = read_perf_config(pinfo->device, pinfo->event); if (perf_config == (unsigned int)-1) { warnx("%s: perf/%s/%s: failed to read %s", __func__, pinfo->device, pinfo->event, "config"); @@ -8579,7 +8619,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo) } /* Scale is not required, some counters just don't have it. */ - perf_scale = read_perf_rapl_scale(pinfo->device, pinfo->event); + perf_scale = read_perf_scale(pinfo->device, pinfo->event); if (perf_scale == 0.0) perf_scale = 1.0; -- cgit From 866d2d36b81d7d0e6d91423b6dd9b1bcfd0510dd Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 26 Jul 2024 14:16:28 -0400 Subject: tools/power turbostat: version 2024.07.26 Release 2024.07.26: Enable turbostat extensions to add both perf and PMT (Intel Platform Monitoring Technology) counters from the cmdline. Demonstrate PMT access with built-in support for Meteor Lake's Die%c6 counter. 
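For reference, the Die%c6 column mentioned above is a PMT_TYPE_XTAL_TIME counter, and the formatting added earlier in this series converts its raw crystal-clock delta into a residency percentage:

	/* From the format_counters() hunks above (PMT_TYPE_XTAL_TIME): */
	const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;

Here value_raw is the per-interval counter delta, crystal_hz the crystal clock frequency in Hz, and interval_float the measurement interval in seconds; the result is a ratio of two independently measured time spans, which is consistent with the note in the MTL DC6 commit above that Die%c6 can occasionally read slightly above 100%.
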
This commit: Clean up white-space nits introduced since version 2024.05.10 Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 105 +++++++++++++++++----------------- 1 file changed, 52 insertions(+), 53 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ed5f032c6b3f..089220aaa5c9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1057,8 +1057,7 @@ void probe_platform_features(unsigned int family, unsigned int model) return; for (i = 0; turbostat_pdata[i].features; i++) { - if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && - VFM_MODEL(turbostat_pdata[i].vfm) == model) { + if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && VFM_MODEL(turbostat_pdata[i].vfm) == model) { platform = turbostat_pdata[i].features; return; } @@ -1448,28 +1447,28 @@ enum msr_arch_info_index { static struct msr_counter_arch_info msr_counter_arch_infos[] = { [MSR_ARCH_INFO_APERF_INDEX] = { - .perf_subsys = "msr", - .perf_name = "aperf", - .msr = MSR_IA32_APERF, - .msr_mask = 0xFFFFFFFFFFFFFFFF, - .rci_index = MSR_RCI_INDEX_APERF, - }, + .perf_subsys = "msr", + .perf_name = "aperf", + .msr = MSR_IA32_APERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_APERF, + }, [MSR_ARCH_INFO_MPERF_INDEX] = { - .perf_subsys = "msr", - .perf_name = "mperf", - .msr = MSR_IA32_MPERF, - .msr_mask = 0xFFFFFFFFFFFFFFFF, - .rci_index = MSR_RCI_INDEX_MPERF, - }, + .perf_subsys = "msr", + .perf_name = "mperf", + .msr = MSR_IA32_MPERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_MPERF, + }, [MSR_ARCH_INFO_SMI_INDEX] = { - .perf_subsys = "msr", - .perf_name = "smi", - .msr = MSR_SMI_COUNT, - .msr_mask = 0xFFFFFFFF, - .rci_index = MSR_RCI_INDEX_SMI, - }, + .perf_subsys = "msr", + .perf_name = "smi", + .msr = MSR_SMI_COUNT, + .msr_mask = 0xFFFFFFFF, + .rci_index = MSR_RCI_INDEX_SMI, + }, }; /* Can be redefined when compiling, useful for testing. */ @@ -1535,14 +1534,14 @@ struct pmt_counter { unsigned int pmt_counter_get_width(const struct pmt_counter *p) { - return (p->msb - p->lsb)+1; + return (p->msb - p->lsb) + 1; } void pmt_counter_resize_(struct pmt_counter *pcounter, unsigned int new_size) { struct pmt_domain_info *new_mem; - new_mem = (struct pmt_domain_info*) reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains)); + new_mem = (struct pmt_domain_info *)reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains)); if (!new_mem) { fprintf(stderr, "%s: failed to allocate memory for PMT counters\n", __func__); exit(1); @@ -1567,7 +1566,7 @@ void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size) */ if (new_size < 8) new_size = 8; - new_size = MAX(new_size, pcounter->num_domains*2); + new_size = MAX(new_size, pcounter->num_domains * 2); pmt_counter_resize_(pcounter, new_size); } @@ -2282,7 +2281,7 @@ void print_header(char *delim) ppmt = sys.pmt_tp; while (ppmt) { - switch(ppmt->type) { + switch (ppmt->type) { case PMT_TYPE_RAW: if (pmt_counter_get_width(ppmt) <= 32) outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); @@ -2356,7 +2355,7 @@ void print_header(char *delim) ppmt = sys.pmt_cp; while (ppmt) { - switch(ppmt->type) { + switch (ppmt->type) { case PMT_TYPE_RAW: if (pmt_counter_get_width(ppmt) <= 32) outp += sprintf(outp, "%s%10.10s", (printed++ ? 
delim : ""), ppmt->name); @@ -2487,7 +2486,7 @@ void print_header(char *delim) ppmt = sys.pmt_pp; while (ppmt) { - switch(ppmt->type) { + switch (ppmt->type) { case PMT_TYPE_RAW: if (pmt_counter_get_width(ppmt) <= 32) outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); @@ -2969,7 +2968,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); if (DO_BIC(BIC_Diec6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float); + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float); if (DO_BIC(BIC_CPU_LPI)) { if (p->cpu_lpi >= 0) @@ -4049,7 +4049,7 @@ static unsigned int read_perf_config(const char *subsys, const char *event_name) goto next; } - next: +next: pconfig_str = strchr(pconfig_str, ','); if (pconfig_str) { *pconfig_str = '\0'; @@ -4463,9 +4463,9 @@ unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb) if (msb == 63) mask = 0xffffffffffffffff; else - mask = ((1<<(msb+1))-1); + mask = ((1 << (msb + 1)) - 1); - mask -= (1<device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); + pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); } pinfo = pinfo->next; @@ -8687,7 +8687,7 @@ int parse_telem_info_file(int fd_dir, const char *info_filename, const char *for return 0; } -struct pmt_mmio* pmt_mmio_open(unsigned int target_guid) +struct pmt_mmio *pmt_mmio_open(unsigned int target_guid) { DIR *dirp; struct dirent *entry; @@ -8793,7 +8793,7 @@ loop_cleanup_and_break: return ret; } -struct pmt_mmio* pmt_mmio_find(unsigned int guid) +struct pmt_mmio *pmt_mmio_find(unsigned int guid) { struct pmt_mmio *pmmio = pmt_mmios; @@ -8802,22 +8802,22 @@ struct pmt_mmio* pmt_mmio_find(unsigned int guid) return pmmio; pmmio = pmmio->next; - } + } return NULL; } -void* pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset) +void *pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset) { char *ret; /* Get base of mmaped PMT file. */ - ret = (char*)pmmio->mmio_base; + ret = (char *)pmmio->mmio_base; /* * Apply PMT MMIO offset to obtain beginning of the mmaped telemetry data. * It's not guaranteed that the mmaped memory begins with the telemetry data - * - we might have to apply the offset first. + * - we might have to apply the offset first. */ ret += pmmio->pmt_offset; @@ -8827,7 +8827,7 @@ void* pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offs return ret; } -struct pmt_mmio* pmt_add_guid(unsigned int guid) +struct pmt_mmio *pmt_add_guid(unsigned int guid) { struct pmt_mmio *ret; @@ -8843,7 +8843,7 @@ enum pmt_open_mode { PMT_OPEN_REQUIRED, /* Open failure is a fatal error. 
*/ }; -struct pmt_counter* pmt_find_counter(struct pmt_counter *pcounter, const char *name) +struct pmt_counter *pmt_find_counter(struct pmt_counter *pcounter, const char *name) { while (pcounter) { if (strcmp(pcounter->name, name) == 0) @@ -8855,9 +8855,9 @@ struct pmt_counter* pmt_find_counter(struct pmt_counter *pcounter, const char *n return pcounter; } -struct pmt_counter** pmt_get_scope_root(enum counter_scope scope) +struct pmt_counter **pmt_get_scope_root(enum counter_scope scope) { - switch(scope) { + switch (scope) { case SCOPE_CPU: return &sys.pmt_tp; case SCOPE_CORE: @@ -8873,7 +8873,7 @@ void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, { /* Make sure the new domain fits. */ if (domain_id >= pcounter->num_domains) - pmt_counter_resize(pcounter, domain_id+1); + pmt_counter_resize(pcounter, domain_id + 1); assert(pcounter->domains); assert(domain_id < pcounter->num_domains); @@ -8882,12 +8882,12 @@ void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, } int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, - unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope, - enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode) + unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope, + enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode) { struct pmt_mmio *mmio; struct pmt_counter *pcounter; - struct pmt_counter ** const pmt_root = pmt_get_scope_root(scope); + struct pmt_counter **const pmt_root = pmt_get_scope_root(scope); bool new_counter = false; int conflict = 0; @@ -8927,7 +8927,7 @@ int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, } if (new_counter) { - strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name)-1); + strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name) - 1); pcounter->type = type; pcounter->scope = scope; pcounter->lsb = lsb; @@ -9071,7 +9071,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.05.10 - Len Brown \n"); + fprintf(outf, "turbostat version 2024.07.26 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 @@ -9299,7 +9299,7 @@ int add_perf_counter(const char *perf_device, const char *perf_event, const char // FIXME: we might not have debug here yet if (debug) fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n", - __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); + __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); return 0; } @@ -9450,10 +9450,10 @@ void parse_add_command_pmt(char *add_command) bool has_offset = false; bool has_lsb = false; bool has_msb = false; - bool has_format = true; /* Format has a default value. */ + bool has_format = true; /* Format has a default value. */ bool has_guid = false; bool has_scope = false; - bool has_type = true; /* Type has a default value. */ + bool has_type = true; /* Type has a default value. */ /* Consume the "pmt," prefix. */ add_command = strchr(add_command, ','); @@ -9490,7 +9490,7 @@ void parse_add_command_pmt(char *add_command) if (!has_scope) { printf("%s: invalid value for scope. 
Expected cpu%%u, core%%u or package%%u.\n", - __func__); + __func__); exit(1); } @@ -9536,8 +9536,7 @@ next: } if (strlen(name) >= PMT_COUNTER_NAME_SIZE_BYTES) { - printf("%s: name has to be at most %d characters long\n", - __func__, PMT_COUNTER_NAME_SIZE_BYTES); + printf("%s: name has to be at most %d characters long\n", __func__, PMT_COUNTER_NAME_SIZE_BYTES); exit(1); } -- cgit From 6dc55268f64b780eb8774de3705f791b689853bb Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 23 Jul 2024 18:09:03 -0500 Subject: dt-bindings: iio: adc: ad7192: Fix 'single-channel' constraints The 'single-channel' property is an uint32, not an array, so 'items' is an incorrect constraint. This didn't matter until dtschema recently changed how properties are decoded. This results in this warning: Documentation/devicetree/bindings/iio/adc/adi,ad7192.example.dtb: adc@0: \ channel@1:single-channel: 1 is not of type 'array' Fixes: caf7b7632b8d ("dt-bindings: iio: adc: ad7192: Add AD7194 support") Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240723230904.1299744-1-robh@kernel.org Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml index a03da9489ed9..190889c7b62a 100644 --- a/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml +++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml @@ -120,9 +120,8 @@ patternProperties: description: Positive input can be connected to pins AIN1 to AIN16 by choosing the appropriate value from 1 to 16. Negative input is connected to AINCOM. - items: - minimum: 1 - maximum: 16 + minimum: 1 + maximum: 16 oneOf: - required: -- cgit From 5bf6f3c595d38e3e92130373ae4c0799da5026ee Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 9 Jul 2024 11:25:10 +0100 Subject: MAINTAINERS: mailmap: update James Clark's email address My new address is james.clark@linaro.org Link: https://lkml.kernel.org/r/20240709102512.31212-2-james.clark@linaro.org Signed-off-by: James Clark Cc: Bjorn Andersson Cc: Conor Dooley Cc: David S. 
Miller Cc: Geliang Tang Cc: Hao Zhang Cc: Jakub Kicinski Cc: Jiri Kosina Cc: Kees Cook Cc: Krzysztof Kozlowski Cc: Mao Jinlong Cc: Matthieu Baerts Cc: Matt Ranostay Cc: Mike Leach Cc: Oleksij Rempel Cc: Rob Herring (Arm) Cc: Suzuki K Poulose Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index cbdf5d63c381..e51d76df75c2 100644 --- a/.mailmap +++ b/.mailmap @@ -260,6 +260,7 @@ Jaegeuk Kim Jakub Kicinski James Bottomley James Bottomley +James Clark James E Wilson James Hogan James Hogan diff --git a/MAINTAINERS b/MAINTAINERS index c0a3d9e93689..5bbcfea5e3fa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2196,7 +2196,7 @@ N: digicolor ARM/CORESIGHT FRAMEWORK AND DRIVERS M: Suzuki K Poulose R: Mike Leach -R: James Clark +R: James Clark L: coresight@lists.linaro.org (moderated for non-subscribers) L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@ -17894,7 +17894,7 @@ F: tools/perf/ PERFORMANCE EVENTS TOOLING ARM64 R: John Garry R: Will Deacon -R: James Clark +R: James Clark R: Mike Leach R: Leo Yan L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -- cgit From 34e526f6182e12b71f6076d43760dc6e0ae175a3 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 9 Jul 2024 11:25:11 +0100 Subject: dt-bindings: arm: update James Clark's email address My new address is james.clark@linaro.org Link: https://lkml.kernel.org/r/20240709102512.31212-3-james.clark@linaro.org Signed-off-by: James Clark Cc: Bjorn Andersson Cc: Conor Dooley Cc: David S. Miller Cc: Geliang Tang Cc: Hao Zhang Cc: Jakub Kicinski Cc: Jiri Kosina Cc: Kees Cook Cc: Krzysztof Kozlowski Cc: Mao Jinlong Cc: Matthieu Baerts Cc: Matt Ranostay Cc: Mike Leach Cc: Oleksij Rempel Cc: Rob Herring (Arm) Cc: Suzuki K Poulose Signed-off-by: Andrew Morton --- Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml | 2 +- Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml index c960c8e0a9a5..08b89b62c505 100644 --- a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml +++ b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml @@ -30,7 +30,7 @@ description: | maintainers: - Mike Leach - Suzuki K Poulose - - James Clark + - James Clark - Mao Jinlong - Hao Zhang diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml index 6745b4cc8f1c..d50a60368e27 100644 --- a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml +++ b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml @@ -29,7 +29,7 @@ description: | maintainers: - Mike Leach - Suzuki K Poulose - - James Clark + - James Clark - Mao Jinlong - Hao Zhang -- cgit From 4cd7ba16a0afb36550eed7690e73d3e7a743fa96 Mon Sep 17 00:00:00 2001 From: Ram Tummala Date: Tue, 9 Jul 2024 18:45:39 -0700 Subject: mm: fix old/young bit handling in the faulting path Commit 3bd786f76de2 ("mm: convert do_set_pte() to set_pte_range()") replaced do_set_pte() with set_pte_range() and that introduced a regression in the following faulting path of non-anonymous vmas which caused the PTE for the faulting address to be marked as old instead of young. 
handle_pte_fault() do_pte_missing() do_fault() do_read_fault() || do_cow_fault() || do_shared_fault() finish_fault() set_pte_range() The polarity of prefault calculation is incorrect. This leads to prefault being incorrectly set for the faulting address. The following check will incorrectly mark the PTE old rather than young. On some architectures this will cause a double fault to mark it young when the access is retried. if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); On a subsequent fault on the same address, the faulting path will see a non NULL vmf->pte and instead of reaching the do_pte_missing() path, PTE will then be correctly marked young in handle_pte_fault() itself. Due to this bug, performance degradation in the fault handling path will be observed due to unnecessary double faulting. Link: https://lkml.kernel.org/r/20240710014539.746200-1-rtummala@nvidia.com Fixes: 3bd786f76de2 ("mm: convert do_set_pte() to set_pte_range()") Signed-off-by: Ram Tummala Reviewed-by: Yin Fengwei Cc: Alistair Popple Cc: Matthew Wilcox (Oracle) Cc: Yin Fengwei Cc: Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 1ff7b6f51ec1..34f8402d2046 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4780,7 +4780,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; - bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE); + bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; flush_icache_pages(vma, page, nr); -- cgit From d9592025000b3cf26c742f3505da7b83aedc26d5 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 12 Jul 2024 08:58:55 -0700 Subject: mm: huge_memory: use !CONFIG_64BIT to relax huge page alignment on 32 bit machines Yves-Alexis Perez reported commit 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit") didn't work for x86_32 [1]. It is because x86_32 uses CONFIG_X86_32 instead of CONFIG_32BIT. !CONFIG_64BIT should cover all 32 bit machines. [1] https://lore.kernel.org/linux-mm/CAHbLzkr1LwH3pcTgM+aGQ31ip2bKqiqEQ8=FQB+t2c3dhNKNHA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20240712155855.1130330-1-yang@os.amperecomputing.com Fixes: 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit") Signed-off-by: Yang Shi Reported-by: Yves-Alexis Perez Tested-by: Yves-Alexis Perez Acked-by: David Hildenbrand Cc: Ben Hutchings Cc: Christoph Lameter Cc: Jiri Slaby Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: Salvatore Bonaccorso Cc: Suren Baghdasaryan Cc: [6.8+] Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9696c94e211..04b72912035d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -877,7 +877,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, loff_t off_align = round_up(off, size); unsigned long len_pad, ret, off_sub; - if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall()) + if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) return 0; if (off_end <= off_align || (off_end - off_align) < size) -- cgit From d659b715e94ac039803d7601505d3473393fc0be Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 15 Jul 2024 10:04:23 +1000 Subject: mm/huge_memory: avoid PMD-size page cache if needed xarray can't support arbitrary page cache size. 
the largest and supported page cache size is defined as MAX_PAGECACHE_ORDER by commit 099d90642a71 ("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray"). However, it's possible to have 512MB page cache in the huge memory's collapsing path on ARM64 system whose base page size is 64KB. 512MB page cache is breaking the limitation and a warning is raised when the xarray entry is split as shown in the following example. [root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize KernelPageSize: 64 kB [root@dhcp-10-26-1-207 ~]# cat /tmp/test.c : int main(int argc, char **argv) { const char *filename = TEST_XFS_FILENAME; int fd = 0; void *buf = (void *)-1, *p; int pgsize = getpagesize(); int ret = 0; if (pgsize != 0x10000) { fprintf(stdout, "System with 64KB base page size is required!\n"); return -EPERM; } system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb"); system("echo 1 > /proc/sys/vm/drop_caches"); /* Open the xfs file */ fd = open(filename, O_RDONLY); assert(fd > 0); /* Create VMA */ buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0); assert(buf != (void *)-1); fprintf(stdout, "mapped buffer at 0x%p\n", buf); /* Populate VMA */ ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE); assert(ret == 0); ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ); assert(ret == 0); /* Collapse VMA */ ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE); assert(ret == 0); ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE); if (ret) { fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno); goto out; } /* Split xarray entry. Write permission is needed */ munmap(buf, TEST_MEM_SIZE); buf = (void *)-1; close(fd); fd = open(filename, O_RDWR); assert(fd > 0); fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, TEST_MEM_SIZE - pgsize, pgsize); out: if (buf != (void *)-1) munmap(buf, TEST_MEM_SIZE); if (fd > 0) close(fd); return ret; } [root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test [root@dhcp-10-26-1-207 ~]# /tmp/test ------------[ cut here ]------------ WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128 Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \ nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \ nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \ ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse \ xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net \ sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9 Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024 pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : xas_split_alloc+0xf8/0x128 lr : split_huge_page_to_list_to_order+0x1c4/0x780 sp : ffff8000ac32f660 x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0 x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000 x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000 x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8 x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40 x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000 Call trace: xas_split_alloc+0xf8/0x128 
split_huge_page_to_list_to_order+0x1c4/0x780 truncate_inode_partial_folio+0xdc/0x160 truncate_inode_pages_range+0x1b4/0x4a8 truncate_pagecache_range+0x84/0xa0 xfs_flush_unmap_range+0x70/0x90 [xfs] xfs_file_fallocate+0xfc/0x4d8 [xfs] vfs_fallocate+0x124/0x2f0 ksys_fallocate+0x4c/0xa0 __arm64_sys_fallocate+0x24/0x38 invoke_syscall.constprop.0+0x7c/0xd8 do_el0_svc+0xb4/0xd0 el0_svc+0x44/0x1d8 el0t_64_sync_handler+0x134/0x150 el0t_64_sync+0x17c/0x180 Fix it by correcting the supported page cache orders, different sets for DAX and other files. With it corrected, 512MB page cache becomes disallowed on all non-DAX files on ARM64 system where the base page size is 64KB. After this patch is applied, the test program fails with error -EINVAL returned from __thp_vma_allowable_orders() and the madvise() system call to collapse the page caches. Link: https://lkml.kernel.org/r/20240715000423.316491-1-gshan@redhat.com Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") Signed-off-by: Gavin Shan Acked-by: David Hildenbrand Reviewed-by: Ryan Roberts Acked-by: Zi Yan Cc: Baolin Wang Cc: Barry Song Cc: Don Dutile Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Ryan Roberts Cc: William Kucharski Cc: [5.17+] Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 12 +++++++++--- mm/huge_memory.c | 12 ++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cff002be83eb..e25d9ebfdf89 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -74,14 +74,20 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1))) /* - * Mask of all large folio orders supported for file THP. + * Mask of all large folio orders supported for file THP. Folios in a DAX + * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to + * it. */ -#define THP_ORDERS_ALL_FILE (BIT(PMD_ORDER) | BIT(PUD_ORDER)) +#define THP_ORDERS_ALL_FILE_DAX \ + (BIT(PMD_ORDER) | BIT(PUD_ORDER)) +#define THP_ORDERS_ALL_FILE_DEFAULT \ + ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) /* * Mask of all large folio orders supported for THP. */ -#define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE) +#define THP_ORDERS_ALL \ + (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT) #define TVA_SMAPS (1 << 0) /* Will be used for procfs */ #define TVA_IN_PF (1 << 1) /* Page fault handler */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 04b72912035d..f4be468e06a4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -89,9 +89,17 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, bool smaps = tva_flags & TVA_SMAPS; bool in_pf = tva_flags & TVA_IN_PF; bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; + unsigned long supported_orders; + /* Check the intersection of requested and supported orders. */ - orders &= vma_is_anonymous(vma) ? 
- THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; + if (vma_is_anonymous(vma)) + supported_orders = THP_ORDERS_ALL_ANON; + else if (vma_is_dax(vma)) + supported_orders = THP_ORDERS_ALL_FILE_DAX; + else + supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; + + orders &= supported_orders; if (!orders) return 0; -- cgit From bf6acd5d16057d7accbbb1bf7dc6d8c56eeb4ecc Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Wed, 17 Jul 2024 17:20:16 +0100 Subject: decompress_bunzip2: fix rare decompression failure The decompression code parses a huffman tree and counts the number of symbols for a given bit length. In rare cases, there may be >= 256 symbols with a given bit length, causing the unsigned char to overflow. This causes a decompression failure later when the code tries and fails to find the bit length for a given symbol. Since the maximum number of symbols is 258, use unsigned short instead. Link: https://lkml.kernel.org/r/20240717162016.1514077-1-ross.lagerwall@citrix.com Fixes: bc22c17e12c1 ("bzip2/lzma: library support for gzip, bzip2 and lzma decompression") Signed-off-by: Ross Lagerwall Cc: Alain Knaff Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton --- lib/decompress_bunzip2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index 3518e7394eca..ca736166f100 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c @@ -232,7 +232,8 @@ static int INIT get_next_block(struct bunzip_data *bd) RUNB) */ symCount = symTotal+2; for (j = 0; j < groupCount; j++) { - unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1]; + unsigned char length[MAX_SYMBOLS]; + unsigned short temp[MAX_HUFCODE_BITS+1]; int minLen, maxLen, pp; /* Read Huffman code lengths for each symbol. They're stored in a way similar to mtf; record a starting -- cgit From b3bebe44306e23827397d0d774d206e3fa374041 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 17 Jul 2024 14:28:44 -0700 Subject: alloc_tag: outline and export free_reserved_page() Outline and export free_reserved_page() because modules use it and it in turn uses page_ext_{get|put} which should not be exported. The same result could be obtained by outlining {get|put}_page_tag_ref() but that would have higher performance impact as these functions are used in more performance critical paths. Link: https://lkml.kernel.org/r/20240717212844.2749975-1-surenb@google.com Fixes: dcfe378c81f7 ("lib: introduce support for page allocation tagging") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407080044.DWMC9N9I-lkp@intel.com/ Suggested-by: Christoph Hellwig Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Kees Cook Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Cc: [6.10] Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +--------------- mm/page_alloc.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4563b560ced7..c4b238a20b76 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3137,21 +3137,7 @@ extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed. 
*/ -static inline void free_reserved_page(struct page *page) -{ - if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); - - if (ref) { - set_codetag_empty(ref); - put_page_tag_ref(ref); - } - } - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - adjust_managed_page_count(page, 1); -} +void free_reserved_page(struct page *page); #define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8337926b89d4..7ac8d61148fe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5815,6 +5815,23 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } +void free_reserved_page(struct page *page) +{ + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + adjust_managed_page_count(page, 1); +} +EXPORT_SYMBOL(free_reserved_page); + static int page_alloc_cpu_dead(unsigned int cpu) { struct zone *zone; -- cgit From f59adcf5933271ab7247c4c8938c67be8905b725 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 23 Jul 2024 17:12:44 +0000 Subject: mm: memcg: add cacheline padding after lruvec in mem_cgroup_per_node Oliver Sand reported a performance regression caused by commit 98c9daf5ae6b ("mm: memcg: guard memcg1-specific members of struct mem_cgroup_per_node"), which puts some fields of the mem_cgroup_per_node structure under the CONFIG_MEMCG_V1 config option. Apparently it causes a false cache sharing between lruvec and lru_zone_size members of the structure. Fix it by adding an explicit padding after the lruvec member. Even though the padding is not required with CONFIG_MEMCG_V1 set, it seems like the introduced memory overhead is not significant enough to warrant another divergence in the mem_cgroup_per_node layout, so the padding is added unconditionally. Link: https://lkml.kernel.org/r/20240723171244.747521-1-roman.gushchin@linux.dev Fixes: 98c9daf5ae6b ("mm: memcg: guard memcg1-specific members of struct mem_cgroup_per_node") Signed-off-by: Roman Gushchin Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202407121335.31a10cb6-oliver.sang@intel.com Tested-by: Oliver Sang Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7e2eb091049a..0e5bf25d324f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -109,6 +109,7 @@ struct mem_cgroup_per_node { /* Fields which get updated often at the end. */ struct lruvec lruvec; + CACHELINE_PADDING(_pad2_); unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; }; -- cgit From 66eca1021a42856d6af2a9802c99e160278aed91 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 23 Jul 2024 14:44:28 +0800 Subject: mm/page_alloc: fix pcp->count race between drain_pages_zone() vs __rmqueue_pcplist() It's expected that no page should be left in pcp_list after calling zone_pcp_disable() in offline_pages(). Previously, it's observed that offline_pages() gets stuck [1] due to some pages remaining in pcp_list. 
Cause: There is a race condition between drain_pages_zone() and __rmqueue_pcplist() involving the pcp->count variable. See below scenario: CPU0 CPU1 ---------------- --------------- spin_lock(&pcp->lock); __rmqueue_pcplist() { zone_pcp_disable() { /* list is empty */ if (list_empty(list)) { /* add pages to pcp_list */ alloced = rmqueue_bulk() mutex_lock(&pcp_batch_high_lock) ... __drain_all_pages() { drain_pages_zone() { /* read pcp->count, it's 0 here */ count = READ_ONCE(pcp->count) /* 0 means nothing to drain */ /* update pcp->count */ pcp->count += alloced << order; ... ... spin_unlock(&pcp->lock); In this case, after calling zone_pcp_disable() though, there are still some pages in pcp_list. And these pages in pcp_list are neither movable nor isolated, offline_pages() gets stuck as a result. Solution: Expand the scope of the pcp->lock to also protect pcp->count in drain_pages_zone(), to ensure no pages are left in the pcp list after zone_pcp_disable() [1] https://lore.kernel.org/linux-mm/6a07125f-e720-404c-b2f9-e55f3f166e85@fujitsu.com/ Link: https://lkml.kernel.org/r/20240723064428.1179519-1-lizhijian@fujitsu.com Fixes: 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock") Signed-off-by: Li Zhijian Reported-by: Yao Xingtao Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ac8d61148fe..28f80daf5c04 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2343,16 +2343,20 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) static void drain_pages_zone(unsigned int cpu, struct zone *zone) { struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - int count = READ_ONCE(pcp->count); - - while (count) { - int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); - count -= to_drain; + int count; + do { spin_lock(&pcp->lock); - free_pcppages_bulk(zone, to_drain, pcp, 0); + count = pcp->count; + if (count) { + int to_drain = min(count, + pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); + + free_pcppages_bulk(zone, to_drain, pcp, 0); + count -= to_drain; + } spin_unlock(&pcp->lock); - } + } while (count); } /* -- cgit From f556acc2facdd240de2c7416cd1da48a1dae70ec Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Thu, 18 Jul 2024 10:55:04 +0530 Subject: selftests/mm: skip test for non-LPA2 and non-LVA systems Post my improvement of the test in e4a4ba415419 ("selftests/mm: va_high_addr_switch: dynamically initialize testcases to enable LPA2 testing"): The test begins to fail on 4k and 16k pages, on non-LPA2 systems. To reduce noise in the CI systems, let us skip the test when higher address space is not implemented. 
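The detection itself is a short probe, added below as high_address_present(): a MAP_FIXED mmap() above the 48-bit boundary only succeeds when the larger virtual address space is implemented, so its failure becomes the skip condition. The core of the probe (restated from the hunk below, not new logic):

	/* 1UL << 50 lies above the default 48-bit VA limit on arm64. */
	void *ptr = mmap((void *)(1UL << 50), 1, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
	if (ptr == MAP_FAILED)
		return 0;	/* no high address space -> skip the test */
	munmap(ptr, 1);
	return 1;
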
Link: https://lkml.kernel.org/r/20240718052504.356517-1-dev.jain@arm.com Fixes: e4a4ba415419 ("selftests/mm: va_high_addr_switch: dynamically initialize testcases to enable LPA2 testing") Signed-off-by: Dev Jain Reviewed-by: Ryan Roberts Cc: Anshuman Khandual Cc: Mark Brown Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index fa7eabfaf841..896b3f73fc53 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -293,6 +293,20 @@ static int run_test(struct testcase *test, int count) return ret; } +#ifdef __aarch64__ +/* Check if userspace VA > 48 bits */ +static int high_address_present(void) +{ + void *ptr = mmap((void *)(1UL << 50), 1, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (ptr == MAP_FAILED) + return 0; + + munmap(ptr, 1); + return 1; +} +#endif + static int supported_arch(void) { #if defined(__powerpc64__) @@ -300,7 +314,7 @@ static int supported_arch(void) #elif defined(__x86_64__) return 1; #elif defined(__aarch64__) - return 1; + return high_address_present(); #else return 0; #endif -- cgit From 4811f7af6090e8f5a398fbdd766f903ef6c0d787 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 25 Jul 2024 14:20:07 +0900 Subject: nilfs2: handle inconsistent state in nilfs_btnode_create_block() Syzbot reported that a buffer state inconsistency was detected in nilfs_btnode_create_block(), triggering a kernel bug. It is not appropriate to treat this inconsistency as a bug; it can occur if the argument block address (the buffer index of the newly created block) is a virtual block number and has been reallocated due to corruption of the bitmap used to manage its allocation state. So, modify nilfs_btnode_create_block() and its callers to treat it as a possible filesystem error, rather than triggering a kernel bug. Link: https://lkml.kernel.org/r/20240725052007.4562-1-konishi.ryusuke@gmail.com Fixes: a60be987d45d ("nilfs2: B-tree node cache") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+89cc4f2324ed37988b60@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=89cc4f2324ed37988b60 Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 25 ++++++++++++++++++++----- fs/nilfs2/btree.c | 4 ++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 0131d83b912d..c034080c334b 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -51,12 +51,21 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); if (unlikely(!bh)) - return NULL; + return ERR_PTR(-ENOMEM); if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || buffer_dirty(bh))) { - brelse(bh); - BUG(); + /* + * The block buffer at the specified new address was already + * in use. This can happen if it is a virtual block number + * and has been reallocated due to corruption of the bitmap + * used to manage its allocation state (if not, the buffer + * clearing of an abandoned b-tree node is missing somewhere). 
+ */ + nilfs_error(inode->i_sb, + "state inconsistency probably due to duplicate use of b-tree node block address %llu (ino=%lu)", + (unsigned long long)blocknr, inode->i_ino); + goto failed; } memset(bh->b_data, 0, i_blocksize(inode)); bh->b_bdev = inode->i_sb->s_bdev; @@ -67,6 +76,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) folio_unlock(bh->b_folio); folio_put(bh->b_folio); return bh; + +failed: + folio_unlock(bh->b_folio); + folio_put(bh->b_folio); + brelse(bh); + return ERR_PTR(-EIO); } int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, @@ -217,8 +232,8 @@ retry: } nbh = nilfs_btnode_create_block(btnc, newkey); - if (!nbh) - return -ENOMEM; + if (IS_ERR(nbh)) + return PTR_ERR(nbh); BUG_ON(nbh == obh); ctxt->newbh = nbh; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index a139970e4804..862bdf23120e 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -63,8 +63,8 @@ static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, struct buffer_head *bh; bh = nilfs_btnode_create_block(btnc, ptr); - if (!bh) - return -ENOMEM; + if (IS_ERR(bh)) + return PTR_ERR(bh); set_buffer_nilfs_volatile(bh); *bhp = bh; -- cgit From e8432ac802a028eaee6b1e86383d7cd8e9fb8431 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 26 Jul 2024 15:09:07 -0700 Subject: minmax: avoid overly complex min()/max() macro arguments in xen We have some very fancy min/max macros that have tons of sanity checking to warn about mixed signedness etc. This is all things that a sane compiler should warn about, but there are no sane compiler interfaces for this, and '-Wsign-compare' is broken [1] and not useful. So then we compensate (some would say over-compensate) by doing the checks manually with some truly horrid macro games. And no, we can't just use __builtin_types_compatible_p(), because the whole question of "does it make sense to compare these two values" is a lot more complicated than that. For example, it makes a ton of sense to compare unsigned values with simple constants like "5", even if that is indeed a signed type. So we have these very strange macros to try to make sensible type checking decisions on the arguments to 'min()' and 'max()'. But that can cause enormous code expansion if the min()/max() macros are used with complicated expressions, and particularly if you nest these things so that you get the first big expansion then expanded again. The xen setup.c file ended up ballooning to over 50MB of preprocessed noise that takes 15s to compile (obviously depending on the build host), largely due to one single line. So let's split that one single line to just be simpler. I think it ends up being more legible to humans too at the same time. Now that single file compiles in under a second. 
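The before/after numbers are straightforward to re-check from a configured tree with Xen enabled, because kbuild accepts single-file targets; the commands below are the standard ones, and the timings and sizes they produce are of course machine- and config-dependent:

  $ time make arch/x86/xen/setup.o     # wall-clock time for the one file
  $ make arch/x86/xen/setup.i
  $ wc -c arch/x86/xen/setup.i         # size of the preprocessed output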
Reported-and-reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/all/c83c17bb-be75-4c67-979d-54eee38774c6@lucifer.local/ Link: https://staticthinking.wordpress.com/2023/07/25/wsign-compare-is-garbage/ [1] Cc: David Laight Signed-off-by: Linus Torvalds --- arch/x86/xen/setup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a0c3e77e3d5b..806ddb2391d9 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -690,6 +690,7 @@ char * __init xen_memory_setup(void) struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; + unsigned long maxmem_pages; int i; int op; @@ -761,8 +762,8 @@ char * __init xen_memory_setup(void) * Make sure we have no memory above max_pages, as this area * isn't handled by the p2m management. */ - extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), - extra_pages, max_pages - max_pfn); + maxmem_pages = EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)); + extra_pages = min3(maxmem_pages, extra_pages, max_pages - max_pfn); i = 0; addr = xen_e820_table.entries[0].addr; size = xen_e820_table.entries[0].size; -- cgit From 3a7e02c040b130b5545e4b115aada7bacd80a2b6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 26 Jul 2024 15:32:27 -0700 Subject: minmax: avoid overly complicated constant expressions in VM code The minmax infrastructure is overkill for simple constants, and can cause huge expansions because those simple constants are then used by other things. For example, 'pageblock_order' is a core VM constant, but because it was implemented using 'min_t()' and all the type-checking that involves, it actually expanded to something like 2.5kB of preprocessor noise. And when that simple constant was then used inside other expansions: #define pageblock_nr_pages (1UL << pageblock_order) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) and we then use that inside a 'max()' macro: case ISOLATE_SUCCESS: update_cached = false; last_migrated_pfn = max(cc->zone->zone_start_pfn, pageblock_start_pfn(cc->migrate_pfn - 1)); the end result was that one statement expanding to 253kB in size. There are probably other cases of this, but this one case certainly stood out. I've added 'MIN_T()' and 'MAX_T()' macros for this kind of "core simple constant with specific type" use. These macros skip the type checking, and as such need to be very sparingly used only for obvious cases that have active issues like this. Reported-by: Lorenzo Stoakes Link: https://lore.kernel.org/all/36aa2cad-1db1-4abf-8dd2-fb20484aabc3@lucifer.local/ Cc: David Laight Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 7 +++++++ include/linux/pageblock-flags.h | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 2ec559284a9f..a7ef65f78933 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -270,4 +270,11 @@ static inline bool in_range32(u32 val, u32 start, u32 len) #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) +/* + * Use these carefully: no type checking, and uses the arguments + * multiple times. Use for obvious constants only. 
+ */ +#define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b)) +#define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b)) + #endif /* _LINUX_MINMAX_H */ diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 547e82cdc89a..fc6b9c87cb0a 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -41,13 +41,13 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. */ -#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER) +#define pageblock_order MIN_T(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ #elif defined(CONFIG_TRANSPARENT_HUGEPAGE) -#define pageblock_order min_t(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER) +#define pageblock_order MIN_T(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER) #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit From 697943657444a7d7123b47bc32019e62533f4863 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 25 Jul 2024 10:03:54 -0700 Subject: fbnic: Change kconfig prompt from S390=n to !S390 In testing the recent kernel I found that the fbnic driver couldn't be enabled on x86_64 builds. A bit of digging showed that the fbnic driver was the only one to check for S390 to be n, all others had checked for !S390. Since it is a boolean and not a tristate I am not sure it will be N. So just update it to use the !S390 flag. A quick check via "make menuconfig" verified that after making this change there was an option to select the fbnic driver. Fixes 0e03c643dc93 ("eth: fbnic: fix s390 build.") Signed-off-by: Alexander Duyck Reviewed-by: Joe Damato Link: https://patch.msgid.link/172192698293.1903337.4255690118685300353.stgit@ahduyck-xeon-server.home.arpa Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/meta/Kconfig b/drivers/net/ethernet/meta/Kconfig index 86034ea4ba5b..c002ede36402 100644 --- a/drivers/net/ethernet/meta/Kconfig +++ b/drivers/net/ethernet/meta/Kconfig @@ -20,7 +20,7 @@ if NET_VENDOR_META config FBNIC tristate "Meta Platforms Host Network Interface" depends on X86_64 || COMPILE_TEST - depends on S390=n + depends on !S390 depends on MAX_SKB_FRAGS < 22 depends on PCI_MSI select PHYLINK -- cgit From 00e3913b0416fe69d28745c0a2a340e2f76c219c Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 26 Jul 2024 01:16:48 +0900 Subject: Revert "firewire: Annotate struct fw_iso_packet with __counted_by()" This reverts commit d3155742db89df3b3c96da383c400e6ff4d23c25. The header_length field is byte unit, thus it can not express the number of elements in header field. It seems that the argument for counted_by attribute can have no arithmetic expression, therefore this commit just reverts the issued commit. Suggested-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20240725161648.130404-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 00abe0e5d602..1cca14cf5652 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -462,9 +462,8 @@ struct fw_iso_packet { /* rx: Sync bit, wait for matching sy */ u32 tag:2; /* tx: Tag in packet header */ u32 sy:4; /* tx: Sy in packet header */ - u32 header_length:8; /* Length of immediate header */ - /* tx: Top of 1394 isoch. 
data_block */ - u32 header[] __counted_by(header_length); + u32 header_length:8; /* Size of immediate header */ + u32 header[]; /* tx: Top of 1394 isoch. data_block */ }; #define FW_ISO_CONTEXT_TRANSMIT 0 -- cgit From c1839501fe3e67d98635f159dba8b170d08f6521 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 26 Jul 2024 00:56:40 +0900 Subject: ALSA: firewire-lib: fix wrong value as length of header for CIP_NO_HEADER case In a commit 1d717123bb1a ("ALSA: firewire-lib: Avoid -Wflex-array-member-not-at-end warning"), DEFINE_FLEX() macro was used to handle variable length of array for header field in struct fw_iso_packet structure. The usage of macro has a side effect that the designated initializer assigns the count of array to the given field. Therefore CIP_HEADER_QUADLETS (=2) is assigned to struct fw_iso_packet.header, while the original designated initializer assigns zero to all fields. With CIP_NO_HEADER flag, the change causes invalid length of header in isochronous packet for 1394 OHCI IT context. This bug affects all of devices supported by ALSA fireface driver; RME Fireface 400, 800, UCX, UFX, and 802. This commit fixes the bug by replacing it with the alternative version of macro which corresponds no initializer. Cc: stable@vger.kernel.org Fixes: 1d717123bb1a ("ALSA: firewire-lib: Avoid -Wflex-array-member-not-at-end warning") Reported-by: Edmund Raile Closes: https://lore.kernel.org/r/rrufondjeynlkx2lniot26ablsltnynfaq2gnqvbiso7ds32il@qk4r6xps7jh2/ Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/20240725155640.128442-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- sound/firewire/amdtp-stream.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c index d35d0a420ee0..1a163bbcabd7 100644 --- a/sound/firewire/amdtp-stream.c +++ b/sound/firewire/amdtp-stream.c @@ -1180,8 +1180,7 @@ static void process_rx_packets(struct fw_iso_context *context, u32 tstamp, size_ (void)fw_card_read_cycle_time(fw_parent_device(s->unit)->card, &curr_cycle_time); for (i = 0; i < packets; ++i) { - DEFINE_FLEX(struct fw_iso_packet, template, header, - header_length, CIP_HEADER_QUADLETS); + DEFINE_RAW_FLEX(struct fw_iso_packet, template, header, CIP_HEADER_QUADLETS); bool sched_irq = false; build_it_pkt_header(s, desc->cycle, template, pkt_header_length, -- cgit From e1c5ae59c0f22f7fe5c07fb5513a29e4aad868c9 Mon Sep 17 00:00:00 2001 From: "Seth Forshee (DigitalOcean)" Date: Wed, 24 Jul 2024 09:53:59 -0500 Subject: fs: don't allow non-init s_user_ns for filesystems without FS_USERNS_MOUNT Christian noticed that it is possible for a privileged user to mount most filesystems with a non-initial user namespace in sb->s_user_ns. When fsopen() is called in a non-init namespace the caller's namespace is recorded in fs_context->user_ns. If the returned file descriptor is then passed to a process priviliged in init_user_ns, that process can call fsconfig(fd_fs, FSCONFIG_CMD_CREATE), creating a new superblock with sb->s_user_ns set to the namespace of the process which called fsopen(). This is problematic. We cannot assume that any filesystem which does not set FS_USERNS_MOUNT has been written with a non-initial s_user_ns in mind, increasing the risk for bugs and security issues. Prevent this by returning EPERM from sget_fc() when FS_USERNS_MOUNT is not set for the filesystem and a non-initial user namespace will be used. 
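From userspace the two halves of that scenario look roughly like the sketch below. It is illustrative only: the fd would travel between the processes via SCM_RIGHTS, error handling is dropped, a real reproducer would also set a source string with FSCONFIG_SET_STRING, "ext4" merely stands in for any filesystem without FS_USERNS_MOUNT, and SYS_fsopen/SYS_fsconfig require reasonably recent headers.

  #define _GNU_SOURCE
  #include <sched.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/mount.h>     /* FSOPEN_CLOEXEC, FSCONFIG_CMD_CREATE */

  /* Unprivileged task: open a filesystem context inside a fresh user
   * namespace, so fc->user_ns ends up being a non-initial namespace. */
  static int open_context(void)
  {
      unshare(CLONE_NEWUSER);
      return syscall(SYS_fsopen, "ext4", FSOPEN_CLOEXEC);
  }

  /* Task privileged in init_user_ns: create the superblock from the fd it
   * received.  Previously the new sb inherited the non-initial s_user_ns;
   * with this change sget_fc() refuses the creation with EPERM. */
  static int create_superblock(int fs_fd)
  {
      return syscall(SYS_fsconfig, fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
  }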
sget() does not need to be updated as it always uses the user namespace of the current context, or the initial user namespace if SB_SUBMOUNT is set. Fixes: cb50b348c71f ("convenience helpers: vfs_get_super() and sget_fc()") Reported-by: Christian Brauner Signed-off-by: Seth Forshee (DigitalOcean) Link: https://lore.kernel.org/r/20240724-s_user_ns-fix-v1-1-895d07c94701@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/super.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/super.c b/fs/super.c index 095ba793e10c..38d72a3cf6fc 100644 --- a/fs/super.c +++ b/fs/super.c @@ -736,6 +736,17 @@ struct super_block *sget_fc(struct fs_context *fc, struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; + /* + * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is + * not set, as the filesystem is likely unprepared to handle it. + * This can happen when fsconfig() is called from init_user_ns with + * an fs_fd opened in another user namespace. + */ + if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { + errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed"); + return ERR_PTR(-EPERM); + } + retry: spin_lock(&sb_lock); if (test) { -- cgit From ef9ca17ca458ac7253ae71b552e601e49311fc48 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Thu, 25 Jul 2024 14:51:30 +0800 Subject: hostfs: fix the host directory parse when mounting. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hostfs not keep the host directory when mounting. When the host directory is none (default), fc->source is used as the host root directory, and this is wrong. Here we use `parse_monolithic` to handle the old mount path for parsing the root directory. For new mount path, The `parse_param` is used for the host directory parse. Reported-and-tested-by: Maciej Żenczykowski Fixes: cd140ce9f611 ("hostfs: convert hostfs to use the new mount API") Link: https://lore.kernel.org/all/CANP3RGceNzwdb7w=vPf5=7BCid5HVQDmz1K5kC9JG42+HVAh_g@mail.gmail.com/ Cc: Christian Brauner Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240725065130.1821964-1-lihongbo22@huawei.com [brauner: minor fixes] Signed-off-by: Christian Brauner --- fs/hostfs/hostfs_kern.c | 65 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2b532670a561..e866b08e448f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "hostfs.h" #include @@ -927,7 +928,6 @@ static const struct inode_operations hostfs_link_iops = { static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct hostfs_fs_info *fsi = sb->s_fs_info; - const char *host_root = fc->source; struct inode *root_inode; int err; @@ -941,15 +941,6 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - /* NULL is printed as '(null)' by printf(): avoid that. 
*/ - if (fc->source == NULL) - host_root = ""; - - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) - return -ENOMEM; - root_inode = hostfs_iget(sb, fsi->host_root_path); if (IS_ERR(root_inode)) return PTR_ERR(root_inode); @@ -975,6 +966,58 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc) return 0; } +enum hostfs_parma { + Opt_hostfs, +}; + +static const struct fs_parameter_spec hostfs_param_specs[] = { + fsparam_string_empty("hostfs", Opt_hostfs), + {} +}; + +static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct hostfs_fs_info *fsi = fc->s_fs_info; + struct fs_parse_result result; + char *host_root; + int opt; + + opt = fs_parse(fc, hostfs_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_hostfs: + host_root = param->string; + if (!*host_root) + host_root = ""; + fsi->host_root_path = + kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); + if (fsi->host_root_path == NULL) + return -ENOMEM; + break; + } + + return 0; +} + +static int hostfs_parse_monolithic(struct fs_context *fc, void *data) +{ + struct hostfs_fs_info *fsi = fc->s_fs_info; + char *host_root = (char *)data; + + /* NULL is printed as '(null)' by printf(): avoid that. */ + if (host_root == NULL) + host_root = ""; + + fsi->host_root_path = + kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); + if (fsi->host_root_path == NULL) + return -ENOMEM; + + return 0; +} + static int hostfs_fc_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, hostfs_fill_super); @@ -992,6 +1035,8 @@ static void hostfs_fc_free(struct fs_context *fc) } static const struct fs_context_operations hostfs_context_ops = { + .parse_monolithic = hostfs_parse_monolithic, + .parse_param = hostfs_parse_param, .get_tree = hostfs_fc_get_tree, .free = hostfs_fc_free, }; -- cgit From d01c14074be79e5f5270498f90530a12583fbf7a Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Fri, 26 Jul 2024 11:00:26 +0200 Subject: kbuild: rpm-pkg: ghost modules.weakdep file In the same way as for other similar files, mark as ghost the new file generated by depmod for configured weak dependencies for modules, modules.weakdep, so that although it is not included in the package, claim the ownership on it. Signed-off-by: Jose Ignacio Tornos Martinez Signed-off-by: Masahiro Yamada --- scripts/package/kernel.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/kernel.spec b/scripts/package/kernel.spec index 74355ff0e106..ac3e5ac01d8a 100644 --- a/scripts/package/kernel.spec +++ b/scripts/package/kernel.spec @@ -74,7 +74,7 @@ ln -fns /usr/src/kernels/%{KERNELRELEASE} %{buildroot}/lib/modules/%{KERNELRELEA echo "/lib/modules/%{KERNELRELEASE}" for x in alias alias.bin builtin.alias.bin builtin.bin dep dep.bin \ - devname softdep symbols symbols.bin; do + devname softdep symbols symbols.bin weakdep; do echo "%ghost /lib/modules/%{KERNELRELEASE}/modules.${x}" done -- cgit From 92a286e90203ce3e6c3a6d945fa36da419c3671f Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sat, 13 Jul 2024 09:35:19 +0200 Subject: ubi: Fix ubi_init() ubiblock_exit() section mismatch Since ubiblock_exit() is now called from an init function, the __exit section no longer makes sense. 
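For reference, the pattern behind the warning can be sketched as a made-up module: an __exit helper reused on an __init error path creates a reference from .init.text into .exit.text, which modpost reports as a section mismatch because exit sections can be dropped (for example when module unloading is disabled). The real ubiblock_exit() lives in a separate file, which is why the robot reliably saw the reference; in a single-file toy like this the compiler may simply inline the call away, so treat it purely as an illustration of the shape, not a guaranteed reproducer.

  #include <linux/init.h>
  #include <linux/module.h>

  static void __exit demo_cleanup(void)      /* placed in .exit.text */
  {
      pr_info("cleanup\n");
  }

  static int __init demo_init(void)          /* placed in .init.text */
  {
      demo_cleanup();     /* error-path reuse: init -> exit reference */
      return -ENODEV;
  }

  module_init(demo_init);
  module_exit(demo_cleanup);
  MODULE_LICENSE("GPL");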
Cc: Ben Hutchings Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407131403.wZJpd8n2-lkp@intel.com/ Signed-off-by: Richard Weinberger Reviewed-by: Zhihao Cheng --- drivers/mtd/ubi/block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index bf7308e8ec2f..60d0155be869 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -670,7 +670,7 @@ err_unreg: return ret; } -void __exit ubiblock_exit(void) +void ubiblock_exit(void) { ubi_unregister_volume_notifier(&ubiblock_notifier); ubiblock_remove_all(); -- cgit From 3415b10a03945b0da4a635e146750dfe5ce0f448 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 26 Jul 2024 11:05:00 -0700 Subject: kbuild: Fix '-S -c' in x86 stack protector scripts After a recent change in clang to stop consuming all instances of '-S' and '-c' [1], the stack protector scripts break due to the kernel's use of -Werror=unused-command-line-argument to catch cases where flags are not being properly consumed by the compiler driver: $ echo | clang -o - -x c - -S -c -Werror=unused-command-line-argument clang: error: argument unused during compilation: '-c' [-Werror,-Wunused-command-line-argument] This results in CONFIG_STACKPROTECTOR getting disabled because CONFIG_CC_HAS_SANE_STACKPROTECTOR is no longer set. '-c' and '-S' both instruct the compiler to stop at different stages of the pipeline ('-S' after compiling, '-c' after assembling), so having them present together in the same command makes little sense. In this case, the test wants to stop before assembling because it is looking at the textual assembly output of the compiler for either '%fs' or '%gs', so remove '-c' from the list of arguments to resolve the error. All versions of GCC continue to work after this change, along with versions of clang that do or do not contain the change mentioned above. 
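With a clang that contains the referenced change, the failure and the fix can be checked directly. The first command is the failing shape quoted above; the second is what the scripts now effectively run, minus the arch and stack-protector flags they pass in practice:

  $ echo | clang -o - -x c - -S -c -Werror=unused-command-line-argument
  clang: error: argument unused during compilation: '-c' [-Werror,-Wunused-command-line-argument]
  $ echo | clang -o - -x c - -S -Werror=unused-command-line-argument     # emits the (empty) assembly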
Cc: stable@vger.kernel.org Fixes: 4f7fd4d7a791 ("[PATCH] Add the -fstack-protector option to the CFLAGS") Fixes: 60a5317ff0f4 ("x86: implement x86_32 stack protector") Link: https://github.com/llvm/llvm-project/commit/6461e537815f7fa68cef06842505353cf5600e9c [1] Signed-off-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- scripts/gcc-x86_32-has-stack-protector.sh | 2 +- scripts/gcc-x86_64-has-stack-protector.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gcc-x86_32-has-stack-protector.sh b/scripts/gcc-x86_32-has-stack-protector.sh index 825c75c5b715..9459ca4f0f11 100755 --- a/scripts/gcc-x86_32-has-stack-protector.sh +++ b/scripts/gcc-x86_32-has-stack-protector.sh @@ -5,4 +5,4 @@ # -mstack-protector-guard-reg, added by # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81708 -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -c -m32 -O0 -fstack-protector -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - -o - 2> /dev/null | grep -q "%fs" +echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m32 -O0 -fstack-protector -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - -o - 2> /dev/null | grep -q "%fs" diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh index 75e4e22b986a..f680bb01aeeb 100755 --- a/scripts/gcc-x86_64-has-stack-protector.sh +++ b/scripts/gcc-x86_64-has-stack-protector.sh @@ -1,4 +1,4 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -c -m64 -O0 -mcmodel=kernel -fno-PIE -fstack-protector - -o - 2> /dev/null | grep -q "%gs" +echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m64 -O0 -mcmodel=kernel -fno-PIE -fstack-protector - -o - 2> /dev/null | grep -q "%gs" -- cgit From 4477b39c32fdc03363affef4b11d48391e6dc9ff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 13:03:48 -0700 Subject: minmax: add a few more MIN_T/MAX_T users Commit 3a7e02c040b1 ("minmax: avoid overly complicated constant expressions in VM code") added the simpler MIN_T/MAX_T macros in order to avoid some excessive expansion from the rather complicated regular min/max macros. The complexity of those macros stems from two issues: (a) trying to use them in situations that require a C constant expression (in static initializers and for array sizes) (b) the type sanity checking and MIN_T/MAX_T avoids both of these issues. Now, in the whole (long) discussion about all this, it was pointed out that the whole type sanity checking is entirely unnecessary for min_t/max_t which get a fixed type that the comparison is done in. But that still leaves min_t/max_t unnecessarily complicated due to worries about the C constant expression case. However, it turns out that there really aren't very many cases that use min_t/max_t for this, and we can just force-convert those. This does exactly that. Which in turn will then allow for much simpler implementations of min_t()/max_t(). All the usual "macros in all upper case will evaluate the arguments multiple times" rules apply. We should do all the same things for the regular min/max() vs MIN/MAX() cases, but that has the added complexity of various drivers defining their own local versions of MIN/MAX, so that needs another level of fixes first. 
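Both properties, being usable where a C constant expression is required plus the repeat-evaluation caveat, show up in a few lines of user-space C. The macro bodies here are local stand-ins that mirror the kernel definitions and exist only so the snippet builds on its own:

  #include <stdio.h>

  /* Stand-ins for the kernel's MIN_T()/MAX_T(): cast, compare, no type
   * checking, and each argument appears more than once in the expansion. */
  #define MAX_T(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))
  #define MIN_T(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

  /* Legal in a constant-expression context, e.g. an array size ... */
  static char buf[MAX_T(size_t, 32, 96)];

  static int calls;
  static int next(void) { return ++calls; }

  int main(void)
  {
      /* ... and in normal expressions, with the documented caveat that
       * next() gets evaluated twice inside the single MAX_T() below. */
      int m = MAX_T(int, next(), 0);

      printf("sizeof(buf)=%zu, max=%d, next() ran %d times\n",
             sizeof(buf), m, calls);
      return 0;
  }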
Link: https://lore.kernel.org/all/b47fad1d0cf8449886ad148f8c013dae@AcuMS.aculab.com/ Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- arch/x86/mm/pgtable.c | 2 +- drivers/edac/sb_edac.c | 4 ++-- drivers/gpu/drm/drm_color_mgmt.c | 2 +- drivers/md/dm-integrity.c | 6 +++--- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- net/ipv4/proc.c | 2 +- net/ipv6/proc.c | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 93e54ba91fbf..f5931499c2d6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -110,7 +110,7 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) #define MAX_UNSHARED_PTRS_PER_PGD \ - max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) + MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index cbc92d3683e6..e5c05a876947 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -109,8 +109,8 @@ static const u32 knl_interleave_list[] = { 0x104, 0x10c, 0x114, 0x11c, /* 20-23 */ }; #define MAX_INTERLEAVE \ - (max_t(unsigned int, ARRAY_SIZE(sbridge_interleave_list), \ - max_t(unsigned int, ARRAY_SIZE(ibridge_interleave_list), \ + (MAX_T(unsigned int, ARRAY_SIZE(sbridge_interleave_list), \ + MAX_T(unsigned int, ARRAY_SIZE(ibridge_interleave_list), \ ARRAY_SIZE(knl_interleave_list)))) struct interleave_pkg { diff --git a/drivers/gpu/drm/drm_color_mgmt.c b/drivers/gpu/drm/drm_color_mgmt.c index d021497841b8..3969dc548cff 100644 --- a/drivers/gpu/drm/drm_color_mgmt.c +++ b/drivers/gpu/drm/drm_color_mgmt.c @@ -532,7 +532,7 @@ int drm_plane_create_color_properties(struct drm_plane *plane, { struct drm_device *dev = plane->dev; struct drm_property *prop; - struct drm_prop_enum_list enum_list[max_t(int, DRM_COLOR_ENCODING_MAX, + struct drm_prop_enum_list enum_list[MAX_T(int, DRM_COLOR_ENCODING_MAX, DRM_COLOR_RANGE_MAX)]; int i, len; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 48ac628f471b..51e6964c1305 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1776,7 +1776,7 @@ static void integrity_metadata(struct work_struct *w) struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; - char checksums_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; sector_t sector; unsigned int sectors_to_process; @@ -2064,7 +2064,7 @@ retry_kmap: } while (++s < ic->sectors_per_block); #ifdef INTERNAL_VERIFY if (ic->internal_hash) { - char checksums_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { @@ -2837,7 +2837,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start unlikely(from_replay) && #endif ic->internal_hash) { - char test_tag[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), (char *)access_journal_data(ic, i, l), test_tag); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 12689774d755..f3a1b179aaea 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2912,7 +2912,7 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv) u32 channels_to_check = tx_channel_count > rx_channel_count ? tx_channel_count : rx_channel_count; u32 chan; - int status[max_t(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)]; + int status[MAX_T(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)]; /* Make sure we never check beyond our status buffer. */ if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status))) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6c4664c681ca..40053a02bae1 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -44,7 +44,7 @@ #include #include -#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX) +#define TCPUDP_MIB_MAX MAX_T(u32, UDP_MIB_MAX, TCP_MIB_MAX) /* * Report socket allocation statistics [mea@utu.fi] diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 6d1d9221649d..752327b10dde 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -27,7 +27,7 @@ #include #define MAX4(a, b, c, d) \ - max_t(u32, max_t(u32, a, b), max_t(u32, c, d)) + MAX_T(u32, MAX_T(u32, a, b), MAX_T(u32, c, d)) #define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \ IPSTATS_MIB_MAX, ICMP_MIB_MAX) -- cgit From 017fa3e89187848fd056af757769c9e66ac3e93d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 13:50:01 -0700 Subject: minmax: simplify and clarify min_t()/max_t() implementation This simplifies the min_t() and max_t() macros by no longer making them work in the context of a C constant expression. That means that you can no longer use them for static initializers or for array sizes in type definitions, but there were only a couple of such uses, and all of them were converted (famous last words) to use MIN_T/MAX_T instead. Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index a7ef65f78933..9c2848abc804 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -45,17 +45,20 @@ #define __cmp(op, x, y) ((x) __cmp_op_##op (y) ? 
(x) : (y)) -#define __cmp_once(op, x, y, unique_x, unique_y) ({ \ - typeof(x) unique_x = (x); \ - typeof(y) unique_y = (y); \ +#define __cmp_once_unique(op, type, x, y, ux, uy) \ + ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); }) + +#define __cmp_once(op, type, x, y) \ + __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) + +#define __careful_cmp_once(op, x, y) ({ \ static_assert(__types_ok(x, y), \ #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ - __cmp(op, unique_x, unique_y); }) + __cmp_once(op, __auto_type, x, y); }) #define __careful_cmp(op, x, y) \ __builtin_choose_expr(__is_constexpr((x) - (y)), \ - __cmp(op, x, y), \ - __cmp_once(op, x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y))) + __cmp(op, x, y), __careful_cmp_once(op, x, y)) #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) @@ -158,7 +161,7 @@ * @x: first value * @y: second value */ -#define min_t(type, x, y) __careful_cmp(min, (type)(x), (type)(y)) +#define min_t(type, x, y) __cmp_once(min, type, x, y) /** * max_t - return maximum of two values, using the specified type @@ -166,7 +169,7 @@ * @x: first value * @y: second value */ -#define max_t(type, x, y) __careful_cmp(max, (type)(x), (type)(y)) +#define max_t(type, x, y) __cmp_once(max, type, x, y) /* * Do not check the array parameter using __must_be_array(). -- cgit From 8400291e289ee6b2bf9779ff1c83a291501f017b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 14:19:55 -0700 Subject: Linux 6.11-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 2c02dafb48ce..8ad55d6e7b60 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 10 +PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Baby Opossum Posse # *DOCUMENTATION* -- cgit From 1a251f52cfdc417c84411a056bc142cbd77baef4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 15:49:18 -0700 Subject: minmax: make generic MIN() and MAX() macros available everywhere This just standardizes the use of MIN() and MAX() macros, with the very traditional semantics. The goal is to use these for C constant expressions and for top-level / static initializers, and so be able to simplify the min()/max() macros. These macro names were used by various kernel code - they are very traditional, after all - and all such users have been fixed up, with a few different approaches: - trivial duplicated macro definitions have been removed Note that 'trivial' here means that it's obviously kernel code that already included all the major kernel headers, and thus gets the new generic MIN/MAX macros automatically. - non-trivial duplicated macro definitions are guarded with #ifndef This is the "yes, they define their own versions, but no, the include situation is not entirely obvious, and maybe they don't get the generic version automatically" case. - strange use case #1 A couple of drivers decided that the way they want to describe their versioning is with #define MAJ 1 #define MIN 2 #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) which adds zero value and I just did my Alexander the Great impersonation, and rewrote that pointless Gordian knot as #define DRV_VERSION "1.2" instead. 
- strange use case #2 A couple of drivers thought that it's a good idea to have a random 'MIN' or 'MAX' define for a value or index into a table, rather than the traditional macro that takes arguments. These values were re-written as C enum's instead. The new function-line macros only expand when followed by an open parenthesis, and thus don't clash with enum use. Happily, there weren't really all that many of these cases, and a lot of users already had the pattern of using '#ifndef' guarding (or in one case just using '#undef MIN') before defining their own private version that does the same thing. I left such cases alone. Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- arch/um/drivers/mconsole_user.c | 2 ++ drivers/edac/skx_common.h | 1 - drivers/gpu/drm/amd/display/dc/core/dc_stream.c | 2 ++ .../gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c | 2 ++ drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppevvmath.h | 14 +++++++++---- drivers/gpu/drm/radeon/evergreen_cs.c | 2 ++ drivers/hwmon/adt7475.c | 24 +++++++++++----------- drivers/media/dvb-frontends/stv0367_priv.h | 3 +++ drivers/net/fjes/fjes_main.c | 4 +--- drivers/nfc/pn544/i2c.c | 2 -- drivers/platform/x86/sony-laptop.c | 1 - drivers/scsi/isci/init.c | 6 +----- .../pci/hive_isp_css_include/math_support.h | 5 ----- include/linux/minmax.h | 2 ++ kernel/trace/preemptirq_delay_test.c | 2 -- lib/btree.c | 1 - lib/decompress_unlzma.c | 2 ++ mm/zsmalloc.c | 2 -- tools/testing/selftests/mm/mremap_test.c | 2 ++ tools/testing/selftests/seccomp/seccomp_bpf.c | 2 ++ 20 files changed, 43 insertions(+), 38 deletions(-) diff --git a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c index e24298a734be..a04cd13c6315 100644 --- a/arch/um/drivers/mconsole_user.c +++ b/arch/um/drivers/mconsole_user.c @@ -71,7 +71,9 @@ static struct mconsole_command *mconsole_parse(struct mc_request *req) return NULL; } +#ifndef MIN #define MIN(a,b) ((a)<(b) ? (a):(b)) +#endif #define STRINGX(x) #x #define STRING(x) STRINGX(x) diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 11faf1db4fa4..473421ba7a18 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -45,7 +45,6 @@ #define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) #define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) #define NUM_CHANNELS MAX(SKX_NUM_CHANNELS, I10NM_NUM_CHANNELS) #define NUM_DIMMS MAX(SKX_NUM_DIMMS, I10NM_NUM_DIMMS) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c index be2638c763d7..9a406d74c0dd 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c @@ -35,8 +35,10 @@ #include "dc_stream_priv.h" #define DC_LOGGER dc->ctx->logger +#ifndef MIN #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) #define MAX(x, y) ((x > y) ? x : y) +#endif /******************************************************************************* * Private functions diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c index 7ecf76aea950..6e064e6ae949 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c @@ -25,7 +25,9 @@ #include "hdcp.h" +#ifndef MIN #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#endif #define HDCP_I2C_ADDR 0x3a /* 0x74 >> 1*/ #define KSV_READ_SIZE 0xf /* 0x6803b - 0x6802c */ #define HDCP_MAX_AUX_TRANSACTION_SIZE 16 diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppevvmath.h b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppevvmath.h index 6f54c410c2f9..409aeec6baa9 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppevvmath.h +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppevvmath.h @@ -22,12 +22,18 @@ */ #include -#define SHIFT_AMOUNT 16 /* We multiply all original integers with 2^SHIFT_AMOUNT to get the fInt representation */ +enum ppevvmath_constants { + /* We multiply all original integers with 2^SHIFT_AMOUNT to get the fInt representation */ + SHIFT_AMOUNT = 16, -#define PRECISION 5 /* Change this value to change the number of decimal places in the final output - 5 is a good default */ + /* Change this value to change the number of decimal places in the final output - 5 is a good default */ + PRECISION = 5, -#define SHIFTED_2 (2 << SHIFT_AMOUNT) -#define MAX (1 << (SHIFT_AMOUNT - 1)) - 1 /* 32767 - Might change in the future */ + SHIFTED_2 = (2 << SHIFT_AMOUNT), + + /* 32767 - Might change in the future */ + MAX = (1 << (SHIFT_AMOUNT - 1)) - 1, +}; /* ------------------------------------------------------------------------------- * NEW TYPE - fINT diff --git a/drivers/gpu/drm/radeon/evergreen_cs.c b/drivers/gpu/drm/radeon/evergreen_cs.c index 1fe6e0d883c7..e5577d2a19ef 100644 --- a/drivers/gpu/drm/radeon/evergreen_cs.c +++ b/drivers/gpu/drm/radeon/evergreen_cs.c @@ -33,8 +33,10 @@ #include "evergreen_reg_safe.h" #include "cayman_reg_safe.h" +#ifndef MIN #define MAX(a, b) (((a) > (b)) ? (a) : (b)) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) +#endif #define REG_SAFE_BM_SIZE ARRAY_SIZE(evergreen_reg_safe_bm) diff --git a/drivers/hwmon/adt7475.c b/drivers/hwmon/adt7475.c index bc186c61a2c0..382a2bb9168a 100644 --- a/drivers/hwmon/adt7475.c +++ b/drivers/hwmon/adt7475.c @@ -22,23 +22,23 @@ #include /* Indexes for the sysfs hooks */ - -#define INPUT 0 -#define MIN 1 -#define MAX 2 -#define CONTROL 3 -#define OFFSET 3 -#define AUTOMIN 4 -#define THERM 5 -#define HYSTERSIS 6 - +enum adt_sysfs_id { + INPUT = 0, + MIN = 1, + MAX = 2, + CONTROL = 3, + OFFSET = 3, // Dup + AUTOMIN = 4, + THERM = 5, + HYSTERSIS = 6, /* * These are unique identifiers for the sysfs functions - unlike the * numbers above, these are not also indexes into an array */ + ALARM = 9, + FAULT = 10, +}; -#define ALARM 9 -#define FAULT 10 /* 7475 Common Registers */ diff --git a/drivers/media/dvb-frontends/stv0367_priv.h b/drivers/media/dvb-frontends/stv0367_priv.h index 617f605947b2..7f056d1cce82 100644 --- a/drivers/media/dvb-frontends/stv0367_priv.h +++ b/drivers/media/dvb-frontends/stv0367_priv.h @@ -25,8 +25,11 @@ #endif /* MACRO definitions */ +#ifndef MIN #define MAX(X, Y) ((X) >= (Y) ? (X) : (Y)) #define MIN(X, Y) ((X) <= (Y) ? (X) : (Y)) +#endif + #define INRANGE(X, Y, Z) \ ((((X) <= (Y)) && ((Y) <= (Z))) || \ (((Z) <= (Y)) && ((Y) <= (X))) ? 1 : 0) diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c index b3ddc9a629d9..fad5b6564464 100644 --- a/drivers/net/fjes/fjes_main.c +++ b/drivers/net/fjes/fjes_main.c @@ -14,9 +14,7 @@ #include "fjes.h" #include "fjes_trace.h" -#define MAJ 1 -#define MIN 2 -#define DRV_VERSION __stringify(MAJ) "." 
__stringify(MIN) +#define DRV_VERSION "1.2" #define DRV_NAME "fjes" char fjes_driver_name[] = DRV_NAME; char fjes_driver_version[] = DRV_VERSION; diff --git a/drivers/nfc/pn544/i2c.c b/drivers/nfc/pn544/i2c.c index 9fe664960b38..e2a6575b9ff7 100644 --- a/drivers/nfc/pn544/i2c.c +++ b/drivers/nfc/pn544/i2c.c @@ -126,8 +126,6 @@ struct pn544_i2c_fw_secure_blob { #define PN544_FW_CMD_RESULT_COMMAND_REJECTED 0xE0 #define PN544_FW_CMD_RESULT_CHUNK_ERROR 0xE6 -#define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) - #define PN544_FW_WRITE_BUFFER_MAX_LEN 0x9f7 #define PN544_FW_I2C_MAX_PAYLOAD PN544_HCI_I2C_LLC_MAX_SIZE #define PN544_FW_I2C_WRITE_FRAME_HEADER_LEN 8 diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 3e94fdd1ea52..3197aaa69da7 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -757,7 +757,6 @@ static union acpi_object *__call_snc_method(acpi_handle handle, char *method, return result; } -#define MIN(a, b) (a > b ? b : a) static int sony_nc_buffer_call(acpi_handle handle, char *name, u64 *value, void *buffer, size_t buflen) { diff --git a/drivers/scsi/isci/init.c b/drivers/scsi/isci/init.c index d31884f82f2a..73085d2f5c43 100644 --- a/drivers/scsi/isci/init.c +++ b/drivers/scsi/isci/init.c @@ -65,11 +65,7 @@ #include "task.h" #include "probe_roms.h" -#define MAJ 1 -#define MIN 2 -#define BUILD 0 -#define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) "." \ - __stringify(BUILD) +#define DRV_VERSION "1.2.0" MODULE_VERSION(DRV_VERSION); diff --git a/drivers/staging/media/atomisp/pci/hive_isp_css_include/math_support.h b/drivers/staging/media/atomisp/pci/hive_isp_css_include/math_support.h index 7349943bba2b..160c496784b7 100644 --- a/drivers/staging/media/atomisp/pci/hive_isp_css_include/math_support.h +++ b/drivers/staging/media/atomisp/pci/hive_isp_css_include/math_support.h @@ -22,11 +22,6 @@ /* force a value to a lower even value */ #define EVEN_FLOOR(x) ((x) & ~1) -/* for preprocessor and array sizing use MIN and MAX - otherwise use min and max */ -#define MAX(a, b) (((a) > (b)) ? (a) : (b)) -#define MIN(a, b) (((a) < (b)) ? (a) : (b)) - #define CEIL_DIV(a, b) (((b) != 0) ? ((a) + (b) - 1) / (b) : 0) #define CEIL_MUL(a, b) (CEIL_DIV(a, b) * (b)) #define CEIL_MUL2(a, b) (((a) + (b) - 1) & ~((b) - 1)) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 9c2848abc804..fc384714da45 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -277,6 +277,8 @@ static inline bool in_range32(u32 val, u32 start, u32 len) * Use these carefully: no type checking, and uses the arguments * multiple times. Use for obvious constants only. */ +#define MIN(a,b) __cmp(min,a,b) +#define MAX(a,b) __cmp(max,a,b) #define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b)) #define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b)) diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index cb0871fbdb07..314ffc143039 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on"); static struct completion done; -#define MIN(x, y) ((x) < (y) ? (x) : (y)) - static void busy_wait(ulong time) { u64 start, end; diff --git a/lib/btree.c b/lib/btree.c index 49420cae3a83..bb81d3393ac5 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -43,7 +43,6 @@ #include #include -#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) #define NODESIZE MAX(L1_CACHE_BYTES, 128) struct btree_geo { diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c index 20a858031f12..9d34d35908da 100644 --- a/lib/decompress_unlzma.c +++ b/lib/decompress_unlzma.c @@ -37,7 +37,9 @@ #include +#ifndef MIN #define MIN(a, b) (((a) < (b)) ? (a) : (b)) +#endif static long long INIT read_int(unsigned char *ptr, int size) { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5d6581ab7c07..2d3163e4da96 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -120,8 +120,6 @@ #define CLASS_BITS 8 #define MAGIC_VAL_BITS 8 -#define MAX(a, b) ((a) >= (b) ? (a) : (b)) - #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL)) /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 1b03bcfaefdf..5a3a9bcba640 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -22,8 +22,10 @@ #define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */ #define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ +#ifndef MIN #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) #define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) +#endif #define SIZE_MB(m) ((size_t)m * (1024 * 1024)) #define SIZE_KB(k) ((size_t)k * 1024) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index e3f97f90d8db..8c3a73461475 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -60,7 +60,9 @@ #define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__) #endif +#ifndef MIN #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#endif #ifndef PR_SET_PTRACER # define PR_SET_PTRACER 0x59616d61 -- cgit From 9f499b8c791d2983c0a31a543c51d1b2f15e8755 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 17:06:20 -0700 Subject: minmax: scsi: fix mis-use of 'clamp()' in sr.c While working on simplifying the minmax functions, and avoiding excessive macro expansion, it turns out that the sr.c use of the 'clamp()' macro has the arguments the wrong way around. The clamp logic is val = clamp(in, low, high); and it returns the input clamped to the low/high limits. But sr.c ddid speed = clamp(0, speed, 0xffff / 177); which clamps the value '0' to the range '[speed, 0xffff / 177]' and ends up being nonsensical. Happily, I don't think anybody ever cared. Fixes: 9fad9d560af5 ("scsi: sr: Fix unintentional arithmetic wraparound") Cc: Justin Stitt Cc: Kees Cook Cc: Martin K. Petersen Signed-off-by: Linus Torvalds --- drivers/scsi/sr_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c index a0d2556a27bb..089653018d32 100644 --- a/drivers/scsi/sr_ioctl.c +++ b/drivers/scsi/sr_ioctl.c @@ -431,7 +431,7 @@ int sr_select_speed(struct cdrom_device_info *cdi, unsigned long speed) struct packet_command cgc; /* avoid exceeding the max speed or overflowing integer bounds */ - speed = clamp(0, speed, 0xffff / 177); + speed = clamp(speed, 0, 0xffff / 177); if (speed == 0) speed = 0xffff; /* set to max */ -- cgit From cb04e8b1d2f24c4c2c92f7b7529031fc35a16fed Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 17:32:05 -0700 Subject: minmax: don't use max() in situations that want a C constant expression We only had a couple of array[] declarations, and changing them to just use 'MAX()' instead of 'max()' fixes the issue. 
This will allow us to simplify our min/max macros enormously, since they can now unconditionally use temporary variables to avoid using the argument values multiple times. Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 2 +- drivers/input/touchscreen/cyttsp4_core.c | 2 +- drivers/irqchip/irq-sun6i-r.c | 2 +- drivers/net/can/usb/etas_es58x/es58x_devlink.c | 2 +- fs/btrfs/tree-checker.c | 2 +- lib/vsprintf.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c index 88eefef05fae..91ad434bcdae 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c @@ -794,7 +794,7 @@ static const char *smu_get_feature_name(struct smu_context *smu, size_t smu_cmn_get_pp_feature_mask(struct smu_context *smu, char *buf) { - int8_t sort_feature[max(SMU_FEATURE_COUNT, SMU_FEATURE_MAX)]; + int8_t sort_feature[MAX(SMU_FEATURE_COUNT, SMU_FEATURE_MAX)]; uint64_t feature_mask; int i, feature_index; uint32_t count = 0; diff --git a/drivers/input/touchscreen/cyttsp4_core.c b/drivers/input/touchscreen/cyttsp4_core.c index 7cb26929dc73..9dc25eb2be44 100644 --- a/drivers/input/touchscreen/cyttsp4_core.c +++ b/drivers/input/touchscreen/cyttsp4_core.c @@ -871,7 +871,7 @@ static void cyttsp4_get_mt_touches(struct cyttsp4_mt_data *md, int num_cur_tch) struct cyttsp4_touch tch; int sig; int i, j, t = 0; - int ids[max(CY_TMA1036_MAX_TCH, CY_TMA4XX_MAX_TCH)]; + int ids[MAX(CY_TMA1036_MAX_TCH, CY_TMA4XX_MAX_TCH)]; memset(ids, 0, si->si_ofs.tch_abs[CY_TCH_T].max * sizeof(int)); for (i = 0; i < num_cur_tch; i++) { diff --git a/drivers/irqchip/irq-sun6i-r.c b/drivers/irqchip/irq-sun6i-r.c index a01e44049415..99958d470d62 100644 --- a/drivers/irqchip/irq-sun6i-r.c +++ b/drivers/irqchip/irq-sun6i-r.c @@ -270,7 +270,7 @@ static const struct irq_domain_ops sun6i_r_intc_domain_ops = { static int sun6i_r_intc_suspend(void) { - u32 buf[BITS_TO_U32(max(SUN6I_NR_TOP_LEVEL_IRQS, SUN6I_NR_MUX_BITS))]; + u32 buf[BITS_TO_U32(MAX(SUN6I_NR_TOP_LEVEL_IRQS, SUN6I_NR_MUX_BITS))]; int i; /* Wake IRQs are enabled during system sleep and shutdown. 
*/ diff --git a/drivers/net/can/usb/etas_es58x/es58x_devlink.c b/drivers/net/can/usb/etas_es58x/es58x_devlink.c index 635edeb8f68c..eee20839d96f 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_devlink.c +++ b/drivers/net/can/usb/etas_es58x/es58x_devlink.c @@ -215,7 +215,7 @@ static int es58x_devlink_info_get(struct devlink *devlink, struct es58x_sw_version *fw_ver = &es58x_dev->firmware_version; struct es58x_sw_version *bl_ver = &es58x_dev->bootloader_version; struct es58x_hw_revision *hw_rev = &es58x_dev->hardware_revision; - char buf[max(sizeof("xx.xx.xx"), sizeof("axxx/xxx"))]; + char buf[MAX(sizeof("xx.xx.xx"), sizeof("axxx/xxx"))]; int ret = 0; if (es58x_sw_version_is_valid(fw_ver)) { diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 6388786fd8b5..89e009a4c3a3 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -634,7 +634,7 @@ static int check_dir_item(struct extent_buffer *leaf, */ if (key->type == BTRFS_DIR_ITEM_KEY || key->type == BTRFS_XATTR_ITEM_KEY) { - char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; + char namebuf[MAX(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; read_extent_buffer(leaf, namebuf, (unsigned long)(di + 1), name_len); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index cdd4e2314bfc..2d71b1115916 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1080,7 +1080,7 @@ char *resource_string(char *buf, char *end, struct resource *res, #define FLAG_BUF_SIZE (2 * sizeof(res->flags)) #define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]") #define RAW_BUF_SIZE sizeof("[mem - flags 0x]") - char sym[max(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE, + char sym[MAX(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE, 2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)]; char *p = sym, *pend = sym + sizeof(sym); -- cgit From dc1c8034e31b14a2e5e212104ec508aec44ce1b9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 20:24:12 -0700 Subject: minmax: simplify min()/max()/clamp() implementation Now that we no longer have any C constant expression contexts (ie array size declarations or static initializers) that use min() or max(), we can simpify the implementation by not having to worry about the result staying as a C constant expression. So now we can unconditionally just use temporary variables of the right type, and get rid of the excessive expansion that used to come from the use of __builtin_choose_expr(__is_constexpr(...), .. to pick the specialized code for constant expressions. Another expansion simplification is to pass the temporary variables (in addition to the original expression) to our __types_ok() macro. That may superficially look like it complicates the macro, but when we only want the type of the expression, expanding the temporary variable names is much simpler and smaller than expanding the potentially complicated original expression. As a result, on my machine, doing a $ time make drivers/staging/media/atomisp/pci/isp/kernels/ynr/ynr_1.0/ia_css_ynr.host.i goes from real 0m16.621s user 0m15.360s sys 0m1.221s to real 0m2.532s user 0m2.091s sys 0m0.452s because the token expansion goes down dramatically. In particular, the longest line expansion (which was line 71 of that 'ia_css_ynr.host.c' file) shrinks from 23,338kB (yes, 23MB for one single line) to "just" 1,444kB (now "only" 1.4MB). And yes, that line is still the line from hell, because it's doing multiple levels of "min()/max()" expansion thanks to some of them being hidden inside the uDIGIT_FITTING() macro. 
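The longest-line figure can be re-measured the same way the timing above was taken: build just the preprocessed file (this needs a config with the atomisp driver enabled) and ask for the longest line; wc -L is the GNU coreutils option for that, and the exact numbers will differ by compiler and config:

  $ make drivers/staging/media/atomisp/pci/isp/kernels/ynr/ynr_1.0/ia_css_ynr.host.i
  $ wc -L drivers/staging/media/atomisp/pci/isp/kernels/ynr/ynr_1.0/ia_css_ynr.host.i   # length of the longest line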
Lorenzo has a nice cleanup patch that makes that driver use inline functions instead of macros for sDIGIT_FITTING() and uDIGIT_FITTING(), which will fix that line once and for all, but the 16-fold reduction in this case does show why we need to simplify these helpers. Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index fc384714da45..e3e4353df983 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -35,10 +35,10 @@ #define __is_noneg_int(x) \ (__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0) -#define __types_ok(x, y) \ - (__is_signed(x) == __is_signed(y) || \ - __is_signed((x) + 0) == __is_signed((y) + 0) || \ - __is_noneg_int(x) || __is_noneg_int(y)) +#define __types_ok(x, y, ux, uy) \ + (__is_signed(ux) == __is_signed(uy) || \ + __is_signed((ux) + 0) == __is_signed((uy) + 0) || \ + __is_noneg_int(x) || __is_noneg_int(y)) #define __cmp_op_min < #define __cmp_op_max > @@ -51,34 +51,31 @@ #define __cmp_once(op, type, x, y) \ __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) -#define __careful_cmp_once(op, x, y) ({ \ - static_assert(__types_ok(x, y), \ +#define __careful_cmp_once(op, x, y, ux, uy) ({ \ + __auto_type ux = (x); __auto_type uy = (y); \ + static_assert(__types_ok(x, y, ux, uy), \ #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ - __cmp_once(op, __auto_type, x, y); }) + __cmp(op, ux, uy); }) -#define __careful_cmp(op, x, y) \ - __builtin_choose_expr(__is_constexpr((x) - (y)), \ - __cmp(op, x, y), __careful_cmp_once(op, x, y)) +#define __careful_cmp(op, x, y) \ + __careful_cmp_once(op, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) -#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({ \ - typeof(val) unique_val = (val); \ - typeof(lo) unique_lo = (lo); \ - typeof(hi) unique_hi = (hi); \ +#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({ \ + __auto_type uval = (val); \ + __auto_type ulo = (lo); \ + __auto_type uhi = (hi); \ static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ (lo) <= (hi), true), \ "clamp() low limit " #lo " greater than high limit " #hi); \ - static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error"); \ - static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error"); \ - __clamp(unique_val, unique_lo, unique_hi); }) - -#define __careful_clamp(val, lo, hi) ({ \ - __builtin_choose_expr(__is_constexpr((val) - (lo) + (hi)), \ - __clamp(val, lo, hi), \ - __clamp_once(val, lo, hi, __UNIQUE_ID(__val), \ - __UNIQUE_ID(__lo), __UNIQUE_ID(__hi))); }) + static_assert(__types_ok(uval, lo, uval, ulo), "clamp() 'lo' signedness error"); \ + static_assert(__types_ok(uval, hi, uval, uhi), "clamp() 'hi' signedness error"); \ + __clamp(uval, ulo, uhi); }) + +#define __careful_clamp(val, lo, hi) \ + __clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) /** * min - return minimum of two values of the same or compatible types -- cgit From 9da49aa80d686582bc3a027112a30484c9be6b6e Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Fri, 26 Jul 2024 06:40:49 +0900 Subject: tun: Add missing bpf_net_ctx_clear() in do_xdp_generic() There are cases where do_xdp_generic returns bpf_net_context without clearing it. 
This causes various memory corruptions, so the missing bpf_net_ctx_clear must be added. Reported-by: syzbot+44623300f057a28baf1e@syzkaller.appspotmail.com Fixes: fecef4cd42c6 ("tun: Assign missing bpf_net_context.") Signed-off-by: Jeongjun Park Acked-by: Jason Wang Reviewed-by: Willem de Bruijn Reported-by: syzbot+3c2b6d5d4bec3b904933@syzkaller.appspotmail.com Reported-by: syzbot+707d98c8649695eaf329@syzkaller.appspotmail.com Reported-by: syzbot+c226757eb784a9da3e8b@syzkaller.appspotmail.com Reported-by: syzbot+61a1cfc2b6632363d319@syzkaller.appspotmail.com Reported-by: syzbot+709e4c85c904bcd62735@syzkaller.appspotmail.com Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index 6ea1d20676fb..751d9b70e6ad 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5150,6 +5150,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) bpf_net_ctx_clear(bpf_net_ctx); return XDP_DROP; } + bpf_net_ctx_clear(bpf_net_ctx); } return XDP_PASS; out_redir: -- cgit From daefd348a5938d2256d304b57a9e787a83bb58d9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Jul 2024 15:23:49 -0700 Subject: eth: bnxt: reject unsupported hash functions In commit under Fixes I split the bnxt_set_rxfh_context() function, and attached the appropriate chunks to new ops. I missed that bnxt_set_rxfh_context() gets called after some initial checks in bnxt_set_rxfh(), namely that the hash function is Toeplitz. Fixes: 5c466b4d4e75 ("eth: bnxt: move from .set_rxfh to .create_rxfh_context and friends") Signed-off-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index d00ef0063820..0425a54eca98 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1863,8 +1863,14 @@ static void bnxt_modify_rss(struct bnxt *bp, struct ethtool_rxfh_context *ctx, } static int bnxt_rxfh_context_check(struct bnxt *bp, + const struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack) { + if (rxfh->hfunc && rxfh->hfunc != ETH_RSS_HASH_TOP) { + NL_SET_ERR_MSG_MOD(extack, "RSS hash function not supported"); + return -EOPNOTSUPP; + } + if (!BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) { NL_SET_ERR_MSG_MOD(extack, "RSS contexts not supported"); return -EOPNOTSUPP; @@ -1888,7 +1894,7 @@ static int bnxt_create_rxfh_context(struct net_device *dev, struct bnxt_vnic_info *vnic; int rc; - rc = bnxt_rxfh_context_check(bp, extack); + rc = bnxt_rxfh_context_check(bp, rxfh, extack); if (rc) return rc; @@ -1953,7 +1959,7 @@ static int bnxt_modify_rxfh_context(struct net_device *dev, struct bnxt_rss_ctx *rss_ctx; int rc; - rc = bnxt_rxfh_context_check(bp, extack); + rc = bnxt_rxfh_context_check(bp, rxfh, extack); if (rc) return rc; -- cgit From 9dbad38336a9c9a6e77df07c6c770ff6cf55c365 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Jul 2024 15:23:50 -0700 Subject: eth: bnxt: populate defaults in the RSS context struct As described in the kdoc for .create_rxfh_context we are responsible for populating the defaults. The core will not call .get_rxfh for non-0 context. The problem can be easily observed since Netlink doesn't currently use the cache. Using netlink ethtool: $ ethtool -x eth0 context 1 [...] 
RSS hash key: 13:60:cd:60:14:d3:55:36:86:df:90:f2:96:14:e2:21:05:57:a8:8f:a5:12:5e:54:62:7f:fd:3c:15:7e:76:05:71:42:a2:9a:73:80:09:9c RSS hash function: toeplitz: on xor: off crc32: off But using IOCTL ethtool shows: $ ./ethtool-old -x eth0 context 1 [...] RSS hash key: 00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00 RSS hash function: Operation not supported Fixes: 7964e7884643 ("net: ethtool: use the tracking array for get_rxfh on custom RSS contexts") Signed-off-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 0425a54eca98..ab8e3f197e7b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1921,8 +1921,12 @@ static int bnxt_create_rxfh_context(struct net_device *dev, if (rc) goto out; + /* Populate defaults in the context */ bnxt_set_dflt_rss_indir_tbl(bp, ctx); + ctx->hfunc = ETH_RSS_HASH_TOP; memcpy(vnic->rss_hash_key, bp->rss_hash_key, HW_HASH_KEY_SIZE); + memcpy(ethtool_rxfh_context_key(ctx), + bp->rss_hash_key, HW_HASH_KEY_SIZE); rc = bnxt_hwrm_vnic_alloc(bp, vnic, 0, bp->rx_nr_rings); if (rc) { -- cgit From 7195f0ef7f5b8c678cf28de7c9b619cb908b482c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Jul 2024 15:23:51 -0700 Subject: ethtool: fix setting key and resetting indir at once The indirection table and the key follow struct ethtool_rxfh in user memory. To reset the indirection table user space calls SET_RXFH with table of size 0 (OTOH to say "no change" it should use -1 / ~0). The logic for calculating the offset where they key sits is incorrect in this case, as kernel would still offset by the full table length, while for the reset there is no indir table and key is immediately after the struct. $ ethtool -X eth0 default hkey 01:02:03... $ ethtool -x eth0 [...] RSS hash key: 00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00 [...] Fixes: 3de0b592394d ("ethtool: Support for configurable RSS hash key") Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- net/ethtool/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 983fee76f5cf..a37ba113610a 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1331,13 +1331,13 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); const struct ethtool_ops *ops = dev->ethtool_ops; u32 dev_indir_size = 0, dev_key_size = 0, i; + u32 user_indir_len = 0, indir_bytes = 0; struct ethtool_rxfh_param rxfh_dev = {}; struct ethtool_rxfh_context *ctx = NULL; struct netlink_ext_ack *extack = NULL; struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; bool locked = false; /* dev->ethtool->rss_lock taken */ - u32 indir_bytes = 0; bool create = false; u8 *rss_config; int ret; @@ -1400,6 +1400,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, */ if (rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { + user_indir_len = indir_bytes; rxfh_dev.indir = (u32 *)rss_config; rxfh_dev.indir_size = dev_indir_size; ret = ethtool_copy_validate_indir(rxfh_dev.indir, @@ -1426,7 +1427,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.key_size = dev_key_size; rxfh_dev.key = rss_config + indir_bytes; if (copy_from_user(rxfh_dev.key, - useraddr + rss_cfg_offset + indir_bytes, + useraddr + rss_cfg_offset + user_indir_len, rxfh.key_size)) { ret = -EFAULT; goto out; -- cgit From dc9755370e1c5965d16dff98c9877f5b1847e367 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Jul 2024 15:23:52 -0700 Subject: ethtool: fix the state of additional contexts with old API We expect drivers implementing the new create/modify/destroy API to populate the defaults in struct ethtool_rxfh_context. In legacy API ctx isn't even passed, and rxfh.indir / rxfh.key are NULL so drivers can't give us defaults even if they want to. Call get_rxfh() to fetch the values. We can reuse rxfh_dev for the get_rxfh(), rxfh stores the input from the user. This fixes IOCTL reporting 0s instead of the default key / indir table for drivers using legacy API. Add a check to try to catch drivers using the new API but not populating the key. Fixes: 7964e7884643 ("net: ethtool: use the tracking array for get_rxfh on custom RSS contexts") Signed-off-by: Jakub Kicinski Reviewed-by: Edward Cree Signed-off-by: David S. 
Miller --- net/ethtool/ioctl.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index a37ba113610a..8ca13208d240 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1382,10 +1382,9 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh.input_xfrm == RXH_XFRM_NO_CHANGE)) return -EINVAL; - if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) - indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]); + indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]); - rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); + rss_config = kzalloc(indir_bytes + dev_key_size, GFP_USER); if (!rss_config) return -ENOMEM; @@ -1475,16 +1474,21 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.input_xfrm = rxfh.input_xfrm; if (rxfh.rss_context && ops->create_rxfh_context) { - if (create) + if (create) { ret = ops->create_rxfh_context(dev, ctx, &rxfh_dev, extack); - else if (rxfh_dev.rss_delete) + /* Make sure driver populates defaults */ + WARN_ON_ONCE(!ret && !rxfh_dev.key && + !memchr_inv(ethtool_rxfh_context_key(ctx), + 0, ctx->key_size)); + } else if (rxfh_dev.rss_delete) { ret = ops->remove_rxfh_context(dev, ctx, rxfh.rss_context, extack); - else + } else { ret = ops->modify_rxfh_context(dev, ctx, &rxfh_dev, extack); + } } else { ret = ops->set_rxfh(dev, &rxfh_dev, extack); } @@ -1523,6 +1527,22 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, kfree(ctx); goto out; } + + /* Fetch the defaults for the old API, in the new API drivers + * should write defaults into ctx themselves. + */ + rxfh_dev.indir = (u32 *)rss_config; + rxfh_dev.indir_size = dev_indir_size; + + rxfh_dev.key = rss_config + indir_bytes; + rxfh_dev.key_size = dev_key_size; + + ret = ops->get_rxfh(dev, &rxfh_dev); + if (WARN_ON(ret)) { + xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context); + kfree(ctx); + goto out; + } } if (rxfh_dev.rss_delete) { WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context) != ctx); @@ -1531,12 +1551,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (rxfh_dev.indir) { for (i = 0; i < dev_indir_size; i++) ethtool_rxfh_context_indir(ctx)[i] = rxfh_dev.indir[i]; - ctx->indir_configured = 1; + ctx->indir_configured = + rxfh.indir_size && + rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE; } if (rxfh_dev.key) { memcpy(ethtool_rxfh_context_key(ctx), rxfh_dev.key, dev_key_size); - ctx->key_configured = 1; + ctx->key_configured = !!rxfh.key_size; } if (rxfh_dev.hfunc != ETH_RSS_HASH_NO_CHANGE) ctx->hfunc = rxfh_dev.hfunc; -- cgit From 0d6ccfe6b319d56da63b7d7cfbcecd92780a680d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Jul 2024 15:23:53 -0700 Subject: selftests: drv-net: rss_ctx: check for all-zero keys We had a handful of bugs relating to key being either all 0 or just reported incorrectly as all 0. Check for this in the tests. Signed-off-by: Jakub Kicinski Reviewed-by: Petr Machata Signed-off-by: David S. 
Miller --- tools/testing/selftests/drivers/net/hw/rss_ctx.py | 37 +++++++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index 931dbc36ca43..011508ca604b 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -19,6 +19,15 @@ def _rss_key_rand(length): return [random.randint(0, 255) for _ in range(length)] +def _rss_key_check(cfg, data=None, context=0): + if data is None: + data = get_rss(cfg, context=context) + if 'rss-hash-key' not in data: + return + non_zero = [x for x in data['rss-hash-key'] if x != 0] + ksft_eq(bool(non_zero), True, comment=f"RSS key is all zero {data['rss-hash-key']}") + + def get_rss(cfg, context=0): return ethtool(f"-x {cfg.ifname} context {context}", json=True)[0] @@ -90,8 +99,9 @@ def _send_traffic_check(cfg, port, name, params): def test_rss_key_indir(cfg): """Test basics like updating the main RSS key and indirection table.""" - if len(_get_rx_cnts(cfg)) < 2: - KsftSkipEx("Device has only one queue (or doesn't support queue stats)") + qcnt = len(_get_rx_cnts(cfg)) + if qcnt < 3: + KsftSkipEx("Device has fewer than 3 queues (or doesn't support queue stats)") data = get_rss(cfg) want_keys = ['rss-hash-key', 'rss-hash-function', 'rss-indirection-table'] @@ -101,6 +111,7 @@ def test_rss_key_indir(cfg): if not data[k]: raise KsftFailEx(f"ethtool results empty for '{k}': {data[k]}") + _rss_key_check(cfg, data=data) key_len = len(data['rss-hash-key']) # Set the key @@ -110,9 +121,26 @@ def test_rss_key_indir(cfg): data = get_rss(cfg) ksft_eq(key, data['rss-hash-key']) + # Set the indirection table and the key together + key = _rss_key_rand(key_len) + ethtool(f"-X {cfg.ifname} equal 3 hkey " + _rss_key_str(key)) + reset_indir = defer(ethtool, f"-X {cfg.ifname} default") + + data = get_rss(cfg) + _rss_key_check(cfg, data=data) + ksft_eq(0, min(data['rss-indirection-table'])) + ksft_eq(2, max(data['rss-indirection-table'])) + + # Reset indirection table and set the key + key = _rss_key_rand(key_len) + ethtool(f"-X {cfg.ifname} default hkey " + _rss_key_str(key)) + data = get_rss(cfg) + _rss_key_check(cfg, data=data) + ksft_eq(0, min(data['rss-indirection-table'])) + ksft_eq(qcnt - 1, max(data['rss-indirection-table'])) + # Set the indirection table ethtool(f"-X {cfg.ifname} equal 2") - reset_indir = defer(ethtool, f"-X {cfg.ifname} default") data = get_rss(cfg) ksft_eq(0, min(data['rss-indirection-table'])) ksft_eq(1, max(data['rss-indirection-table'])) @@ -317,8 +345,11 @@ def test_rss_context(cfg, ctx_cnt=1, create_with_cfg=None): ctx_cnt = i break + _rss_key_check(cfg, context=ctx_id) + if not create_with_cfg: ethtool(f"-X {cfg.ifname} context {ctx_id} {want_cfg}") + _rss_key_check(cfg, context=ctx_id) # Sanity check the context we just created data = get_rss(cfg, ctx_id) -- cgit From 05f76b2d634e65ab34472802d9b142ea9e03f74e Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Fri, 26 Jul 2024 13:41:05 -0700 Subject: tcp: Adjust clamping window for applications specifying SO_RCVBUF tp->scaling_ratio is not updated based on skb->len/skb->truesize once SO_RCVBUF is set leading to the maximum window scaling to be 25% of rcvbuf after commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale") and 50% of rcvbuf after commit 697a6c8cec03 ("tcp: increase the default TCP scaling ratio"). 
50% tries to emulate the behavior of older kernels using sysctl_tcp_adv_win_scale with default value. Systems which were using a different values of sysctl_tcp_adv_win_scale in older kernels ended up seeing reduced download speeds in certain cases as covered in https://lists.openwall.net/netdev/2024/05/15/13 While the sysctl scheme is no longer acceptable, the value of 50% is a bit conservative when the skb->len/skb->truesize ratio is later determined to be ~0.66. Applications not specifying SO_RCVBUF update the window scaling and the receiver buffer every time data is copied to userspace. This computation is now used for applications setting SO_RCVBUF to update the maximum window scaling while ensuring that the receive buffer is within the application specified limit. Fixes: dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale") Signed-off-by: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 454362e359da..e2b9583ed96a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -754,8 +754,7 @@ void tcp_rcv_space_adjust(struct sock *sk) * */ - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)) { u64 rcvwin, grow; int rcvbuf; @@ -771,12 +770,22 @@ void tcp_rcv_space_adjust(struct sock *sk) rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - if (rcvbuf > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + if (rcvbuf > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); - /* Make the window clamp follow along. */ - WRITE_ONCE(tp->window_clamp, - tcp_win_from_space(sk, rcvbuf)); + /* Make the window clamp follow along. */ + WRITE_ONCE(tp->window_clamp, + tcp_win_from_space(sk, rcvbuf)); + } + } else { + /* Make the window clamp follow along while being bounded + * by SO_RCVBUF. + */ + int clamp = tcp_win_from_space(sk, min(rcvbuf, sk->sk_rcvbuf)); + + if (clamp > tp->window_clamp) + WRITE_ONCE(tp->window_clamp, clamp); } } tp->rcvq_space.space = copied; -- cgit From 799a829507506924add8a7620493adc1c3cfda30 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Fri, 26 Jul 2024 15:06:50 +0800 Subject: net: axienet: start napi before enabling Rx/Tx softirq may get lost if an Rx interrupt comes before we call napi_enable. Move napi_enable in front of axienet_setoptions(), which turns on the device, to address the issue. Link: https://lists.gnu.org/archive/html/qemu-devel/2024-07/msg06160.html Fixes: cc37610caaf8 ("net: axienet: implement NAPI and GRO receive") Signed-off-by: Andy Chiu Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index e342f387c3dd..02fdf66e07fa 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2219,9 +2219,9 @@ static void axienet_dma_err_handler(struct work_struct *work) ~(XAE_OPTION_TXEN | XAE_OPTION_RXEN)); axienet_set_mac_address(ndev, NULL); axienet_set_multicast_list(ndev); - axienet_setoptions(ndev, lp->options); napi_enable(&lp->napi_rx); napi_enable(&lp->napi_tx); + axienet_setoptions(ndev, lp->options); } /** -- cgit From 9415d375d8520e0ed55f0c0b058928da9a5b5b3d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 26 Jul 2024 17:19:53 -0700 Subject: rtnetlink: Don't ignore IFLA_TARGET_NETNSID when ifname is specified in rtnl_dellink(). The cited commit accidentally replaced tgt_net with net in rtnl_dellink(). As a result, IFLA_TARGET_NETNSID is ignored if the interface is specified with IFLA_IFNAME or IFLA_ALT_IFNAME. Let's pass tgt_net to rtnl_dev_get(). Fixes: cc6090e985d7 ("net: rtnetlink: introduce helper to get net_device instance by ifname") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 87e67194f240..73fd7f543fd0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3288,7 +3288,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) - dev = rtnl_dev_get(net, tb); + dev = rtnl_dev_get(tgt_net, tb); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else -- cgit From 167b93258d1e2230ee3e8a97669b4db4cc9e90aa Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Jul 2024 11:03:59 +0200 Subject: mptcp: fix user-space PM announced address accounting Currently the per-connection announced address counter is never decreased. When the user-space PM is in use, this just affect the information exposed via diag/sockopt, but it could still foul the PM to wrong decision. Add the missing accounting for the user-space PM's sake. Fixes: 8b1c94da1e48 ("mptcp: only send RM_ADDR in nl_cmd_remove") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. 
Miller --- net/mptcp/pm_netlink.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index ea9e5817b9e9..b399f2b7a369 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1534,16 +1534,25 @@ void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; + int anno_nr = 0; list_for_each_entry(entry, rm_list, list) { - if ((remove_anno_list_by_saddr(msk, &entry->addr) || - lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) && - alist.nr < MPTCP_RM_IDS_MAX) - alist.ids[alist.nr++] = entry->addr.id; + if (alist.nr >= MPTCP_RM_IDS_MAX) + break; + + /* only delete if either announced or matching a subflow */ + if (remove_anno_list_by_saddr(msk, &entry->addr)) + anno_nr++; + else if (!lookup_subflow_by_saddr(&msk->conn_list, + &entry->addr)) + continue; + + alist.ids[alist.nr++] = entry->addr.id; } if (alist.nr) { spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= anno_nr; mptcp_pm_remove_addr(msk, &alist); spin_unlock_bh(&msk->pm.lock); } -- cgit From 4b317e0eb287bd30a1b329513531157c25e8b692 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Jul 2024 11:04:00 +0200 Subject: mptcp: fix NL PM announced address accounting Currently the per connection announced address counter is never decreased. As a consequence, after connection establishment, if the NL PM deletes an endpoint and adds a new/different one, no additional subflow is created for the new endpoint even if the current limits allow that. Address the issue properly updating the signaled address counter every time the NL PM removes such addresses. Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index b399f2b7a369..f65831de5c1a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1401,6 +1401,7 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, ret = remove_anno_list_by_saddr(msk, addr); if (ret || force) { spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= ret; mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); } @@ -1565,17 +1566,18 @@ static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, rm_list, list) { - if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - slist.nr < MPTCP_RM_IDS_MAX) + if (slist.nr < MPTCP_RM_IDS_MAX && + lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) slist.ids[slist.nr++] = entry->addr.id; - if (remove_anno_list_by_saddr(msk, &entry->addr) && - alist.nr < MPTCP_RM_IDS_MAX) + if (alist.nr < MPTCP_RM_IDS_MAX && + remove_anno_list_by_saddr(msk, &entry->addr)) alist.ids[alist.nr++] = entry->addr.id; } if (alist.nr) { spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= alist.nr; mptcp_pm_remove_addr(msk, &alist); spin_unlock_bh(&msk->pm.lock); } -- cgit From b5e2fb832f48bc01d937a053e0550a1465a2f05d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Jul 2024 11:04:01 +0200 Subject: selftests: mptcp: add explicit test case for remove/readd Delete and re-create a signal endpoint and ensure that the PM actually deletes and re-create the subflow. 
Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 29 +++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 108aeeb84ef1..9c091fc267c4 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3526,6 +3526,35 @@ endpoint_tests() chk_mptcp_info subflows 1 subflows 1 mptcp_lib_kill_wait $tests_pid fi + + # remove and re-add + if reset "delete re-add signal" && + mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + pm_nl_set_limits $ns1 1 1 + pm_nl_set_limits $ns2 1 1 + pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal + test_linkfail=4 speed=20 \ + run_tests $ns1 $ns2 10.0.1.1 & + local tests_pid=$! + + wait_mpj $ns2 + pm_nl_check_endpoint "creation" \ + $ns1 10.0.2.1 id 1 flags signal + chk_subflow_nr "before delete" 2 + chk_mptcp_info subflows 1 subflows 1 + + pm_nl_del_endpoint $ns1 1 10.0.2.1 + sleep 0.5 + chk_subflow_nr "after delete" 1 + chk_mptcp_info subflows 0 subflows 0 + + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + wait_mpj $ns2 + chk_subflow_nr "after re-add" 2 + chk_mptcp_info subflows 1 subflows 1 + mptcp_lib_kill_wait $tests_pid + fi + } # [$1: error message] -- cgit From 4a2f48992ddf4b8c2fba846c6754089edae6db5a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Jul 2024 11:04:02 +0200 Subject: selftests: mptcp: fix error path pm_nl_check_endpoint() currently calls an not existing helper to mark the test as failed. Fix the wrong call. Fixes: 03668c65d153 ("selftests: mptcp: join: rework detailed report") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 9c091fc267c4..55d84a1bde15 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -661,7 +661,7 @@ pm_nl_check_endpoint() done if [ -z "${id}" ]; then - test_fail "bad test - missing endpoint id" + fail_test "bad test - missing endpoint id" return fi -- cgit From 7c70bcc2a84cf925f655ea1ac4b8088062b144a3 Mon Sep 17 00:00:00 2001 From: Liu Jing Date: Sat, 27 Jul 2024 11:04:03 +0200 Subject: selftests: mptcp: always close input's FD if opened In main_loop_s function, when the open(cfg_input, O_RDONLY) function is run, the last fd is not closed if the "--cfg_repeat > 0" branch is not taken. Fixes: 05be5e273c84 ("selftests: mptcp: add disconnect tests") Cc: stable@vger.kernel.org Signed-off-by: Liu Jing Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index d2043ec3bf6d..4209b9569039 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1115,11 +1115,11 @@ again: return 1; } - if (--cfg_repeat > 0) { - if (cfg_input) - close(fd); + if (cfg_input) + close(fd); + + if (--cfg_repeat > 0) goto again; - } return 0; } -- cgit From ec145a18687fec8dd97eeb4f30057fa4debef577 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Fri, 26 Jul 2024 20:17:09 +0200 Subject: ice: respect netif readiness in AF_XDP ZC related ndo's Address a scenario in which XSK ZC Tx produces descriptors to XDP Tx ring when link is either not yet fully initialized or process of stopping the netdev has already started. To avoid this, add checks against carrier readiness in ice_xsk_wakeup() and in ice_xmit_zc(). One could argue that bailing out early in ice_xsk_wakeup() would be sufficient but given the fact that we produce Tx descriptors on behalf of NAPI that is triggered for Rx traffic, the latter is also needed. Bringing link up is an asynchronous event executed within ice_service_task so even though interface has been brought up there is still a time frame where link is not yet ok. Without this patch, when AF_XDP ZC Tx is used simultaneously with stack Tx, Tx timeouts occur after going through link flap (admin brings interface down then up again). HW seem to be unable to transmit descriptor to the wire after HW tail register bump which in turn causes bit __QUEUE_STATE_STACK_XOFF to be set forever as netdev_tx_completed_queue() sees no cleaned bytes on the input. Fixes: 126cdfe1007a ("ice: xsk: Improve AF_XDP ZC Tx and use batching API") Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Michal Kubiak Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index a65955eb23c0..72738b8b8a68 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -1048,6 +1048,10 @@ bool ice_xmit_zc(struct ice_tx_ring *xdp_ring) ice_clean_xdp_irq_zc(xdp_ring); + if (!netif_carrier_ok(xdp_ring->vsi->netdev) || + !netif_running(xdp_ring->vsi->netdev)) + return true; + budget = ICE_DESC_UNUSED(xdp_ring); budget = min_t(u16, budget, ICE_RING_QUARTER(xdp_ring)); @@ -1091,7 +1095,7 @@ ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, struct ice_vsi *vsi = np->vsi; struct ice_tx_ring *ring; - if (test_bit(ICE_VSI_DOWN, vsi->state)) + if (test_bit(ICE_VSI_DOWN, vsi->state) || !netif_carrier_ok(netdev)) return -ENETDOWN; if (!ice_is_xdp_ena_vsi(vsi)) -- cgit From 1ff72a2f67791cd4ddad19ed830445f57b30e992 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:10 +0200 Subject: ice: don't busy wait for Rx queue disable in ice_qp_dis() When ice driver is spammed with multiple xdpsock instances and flow control is enabled, there are cases when Rx queue gets stuck and unable to reflect the disable state in QRX_CTRL register. 
Similar issue has previously been addressed in commit 13a6233b033f ("ice: Add support to enable/disable all Rx queues before waiting"). To workaround this, let us simply not wait for a disabled state as later patch will make sure that regardless of the encountered error in the process of disabling a queue pair, the Rx queue will be enabled. Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 72738b8b8a68..3104a5657b83 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -199,10 +199,8 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) if (err) return err; } - err = ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, true); - if (err) - return err; + ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, false); ice_qp_clean_rings(vsi, q_idx); ice_qp_reset_stats(vsi, q_idx); -- cgit From 405d9999aa0b4ae467ef391d1d9c7e0d30ad0841 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:11 +0200 Subject: ice: replace synchronize_rcu with synchronize_net Given that ice_qp_dis() is called under rtnl_lock, synchronize_net() can be called instead of synchronize_rcu() so that XDP rings can finish its job in a faster way. Also let us do this as earlier in XSK queue disable flow. Additionally, turn off regular Tx queue before disabling irqs and NAPI. Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 3104a5657b83..ba50af9a5929 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -52,10 +52,8 @@ static void ice_qp_reset_stats(struct ice_vsi *vsi, u16 q_idx) static void ice_qp_clean_rings(struct ice_vsi *vsi, u16 q_idx) { ice_clean_tx_ring(vsi->tx_rings[q_idx]); - if (ice_is_xdp_ena_vsi(vsi)) { - synchronize_rcu(); + if (ice_is_xdp_ena_vsi(vsi)) ice_clean_tx_ring(vsi->xdp_rings[q_idx]); - } ice_clean_rx_ring(vsi->rx_rings[q_idx]); } @@ -180,11 +178,12 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) usleep_range(1000, 2000); } + synchronize_net(); + netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); + ice_qvec_dis_irq(vsi, rx_ring, q_vector); ice_qvec_toggle_napi(vsi, q_vector, false); - netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); - ice_fill_txq_meta(vsi, tx_ring, &txq_meta); err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, tx_ring, &txq_meta); if (err) -- cgit From d5922717994911e8f0eab736f3ba0d968c158823 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:12 +0200 Subject: ice: modify error handling when setting XSK pool in ndo_bpf Don't bail out right when spotting an error within ice_qp_{dis,ena}() but rather track error and go through whole flow of disabling and enabling queue pair. 
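A minimal sketch of the "latch the first error but keep going" idiom the patch below adopts; step_a()/step_b()/step_c() are hypothetical stand-ins for the individual queue-pair operations, not driver functions:

/* Hypothetical stand-ins; each returns 0 on success or a negative errno. */
static int step_a(void) { return 0; }
static int step_b(void) { return 0; }
static int step_c(void) { return 0; }

static int run_all_steps(void)
{
	int fail = 0;	/* first error seen, 0 if every step succeeded */
	int err;

	err = step_a();
	if (!fail)
		fail = err;

	err = step_b();		/* still runs even if step_a() failed */
	if (!fail)
		fail = err;

	err = step_c();
	if (!fail)
		fail = err;

	return fail;		/* report the earliest failure, if any */
}
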
Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index ba50af9a5929..902096b000f5 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -162,6 +162,7 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) struct ice_tx_ring *tx_ring; struct ice_rx_ring *rx_ring; int timeout = 50; + int fail = 0; int err; if (q_idx >= vsi->num_rxq || q_idx >= vsi->num_txq) @@ -186,8 +187,8 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) ice_fill_txq_meta(vsi, tx_ring, &txq_meta); err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, tx_ring, &txq_meta); - if (err) - return err; + if (!fail) + fail = err; if (ice_is_xdp_ena_vsi(vsi)) { struct ice_tx_ring *xdp_ring = vsi->xdp_rings[q_idx]; @@ -195,15 +196,15 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) ice_fill_txq_meta(vsi, xdp_ring, &txq_meta); err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, xdp_ring, &txq_meta); - if (err) - return err; + if (!fail) + fail = err; } ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, false); ice_qp_clean_rings(vsi, q_idx); ice_qp_reset_stats(vsi, q_idx); - return 0; + return fail; } /** @@ -216,32 +217,33 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) { struct ice_q_vector *q_vector; + int fail = 0; int err; err = ice_vsi_cfg_single_txq(vsi, vsi->tx_rings, q_idx); - if (err) - return err; + if (!fail) + fail = err; if (ice_is_xdp_ena_vsi(vsi)) { struct ice_tx_ring *xdp_ring = vsi->xdp_rings[q_idx]; err = ice_vsi_cfg_single_txq(vsi, vsi->xdp_rings, q_idx); - if (err) - return err; + if (!fail) + fail = err; ice_set_ring_xdp(xdp_ring); ice_tx_xsk_pool(vsi, q_idx); } err = ice_vsi_cfg_single_rxq(vsi, q_idx); - if (err) - return err; + if (!fail) + fail = err; q_vector = vsi->rx_rings[q_idx]->q_vector; ice_qvec_cfg_msix(vsi, q_vector); err = ice_vsi_ctrl_one_rx_ring(vsi, true, q_idx, true); - if (err) - return err; + if (!fail) + fail = err; ice_qvec_toggle_napi(vsi, q_vector, true); ice_qvec_ena_irq(vsi, q_vector); @@ -249,7 +251,7 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); clear_bit(ICE_CFG_BUSY, vsi->state); - return 0; + return fail; } /** -- cgit From 9da75a511c5558fa3da56759984fd1fa859186f0 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:13 +0200 Subject: ice: toggle netif_carrier when setting up XSK pool This so we prevent Tx timeout issues. One of conditions checked on running in the background dev_watchdog() is netif_carrier_ok(), so let us turn it off when we disable the queues that belong to a q_vector where XSK pool is being configured. Turn carrier on in ice_qp_ena() only when ice_get_link_status() tells us that physical link is up. 
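A distilled sketch of the ordering described above, assuming a generic netdev; query_phys_link() is a hypothetical stand-in for the driver's link query (ice_get_link_status() in the diff below):

#include <linux/netdevice.h>

static bool query_phys_link(void)
{
	return true;	/* hypothetical stub; the real driver asks the HW */
}

static void reconfigure_queue_pair(struct net_device *netdev, u16 qid)
{
	netif_carrier_off(netdev);	/* keep dev_watchdog() quiet while queues are down */
	netif_tx_stop_queue(netdev_get_tx_queue(netdev, qid));

	/* ... disable and re-enable the queue pair here ... */

	if (query_phys_link()) {	/* link-up is asynchronous, so re-check */
		netif_tx_start_queue(netdev_get_tx_queue(netdev, qid));
		netif_carrier_on(netdev);
	}
}
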
Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 902096b000f5..3fbe4cfadfbf 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -180,6 +180,7 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) } synchronize_net(); + netif_carrier_off(vsi->netdev); netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); ice_qvec_dis_irq(vsi, rx_ring, q_vector); @@ -218,6 +219,7 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) { struct ice_q_vector *q_vector; int fail = 0; + bool link_up; int err; err = ice_vsi_cfg_single_txq(vsi, vsi->tx_rings, q_idx); @@ -248,7 +250,11 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) ice_qvec_toggle_napi(vsi, q_vector, true); ice_qvec_ena_irq(vsi, q_vector); - netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); + ice_get_link_status(vsi->port_info, &link_up); + if (link_up) { + netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); + netif_carrier_on(vsi->netdev); + } clear_bit(ICE_CFG_BUSY, vsi->state); return fail; -- cgit From ebc33a3f8d0aeddf19fd5827add24b82ae171829 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:14 +0200 Subject: ice: improve updating ice_{t,r}x_ring::xsk_pool xsk_buff_pool pointers that ice ring structs hold are updated via ndo_bpf that is executed in process context while it can be read by remote CPU at the same time within NAPI poll. Use synchronize_net() after pointer update and {READ,WRITE}_ONCE() when working with mentioned pointer. Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 11 ++-- drivers/net/ethernet/intel/ice/ice_base.c | 4 +- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- drivers/net/ethernet/intel/ice/ice_txrx.c | 8 ++- drivers/net/ethernet/intel/ice/ice_xsk.c | 103 +++++++++++++++++++----------- drivers/net/ethernet/intel/ice/ice_xsk.h | 14 ++-- 6 files changed, 87 insertions(+), 55 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 99a75a59078e..caaa10157909 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -765,18 +765,17 @@ static inline struct xsk_buff_pool *ice_get_xp_from_qid(struct ice_vsi *vsi, } /** - * ice_xsk_pool - get XSK buffer pool bound to a ring + * ice_rx_xsk_pool - assign XSK buff pool to Rx ring * @ring: Rx ring to use * - * Returns a pointer to xsk_buff_pool structure if there is a buffer pool - * present, NULL otherwise. + * Sets XSK buff pool pointer on Rx ring. 
*/ -static inline struct xsk_buff_pool *ice_xsk_pool(struct ice_rx_ring *ring) +static inline void ice_rx_xsk_pool(struct ice_rx_ring *ring) { struct ice_vsi *vsi = ring->vsi; u16 qid = ring->q_index; - return ice_get_xp_from_qid(vsi, qid); + WRITE_ONCE(ring->xsk_pool, ice_get_xp_from_qid(vsi, qid)); } /** @@ -801,7 +800,7 @@ static inline void ice_tx_xsk_pool(struct ice_vsi *vsi, u16 qid) if (!ring) return; - ring->xsk_pool = ice_get_xp_from_qid(vsi, qid); + WRITE_ONCE(ring->xsk_pool, ice_get_xp_from_qid(vsi, qid)); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 5d396c1a7731..1facf179a96f 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -536,7 +536,7 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) return err; } - ring->xsk_pool = ice_xsk_pool(ring); + ice_rx_xsk_pool(ring); if (ring->xsk_pool) { xdp_rxq_info_unreg(&ring->xdp_rxq); @@ -597,7 +597,7 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) return 0; } - ok = ice_alloc_rx_bufs_zc(ring, num_bufs); + ok = ice_alloc_rx_bufs_zc(ring, ring->xsk_pool, num_bufs); if (!ok) { u16 pf_q = ring->vsi->rxq_map[ring->q_index]; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index ec636be4d17d..3de020020bc4 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2948,7 +2948,7 @@ static void ice_vsi_rx_napi_schedule(struct ice_vsi *vsi) ice_for_each_rxq(vsi, i) { struct ice_rx_ring *rx_ring = vsi->rx_rings[i]; - if (rx_ring->xsk_pool) + if (READ_ONCE(rx_ring->xsk_pool)) napi_schedule(&rx_ring->q_vector->napi); } } diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 8bb743f78fcb..0f91e9167427 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1521,10 +1521,11 @@ int ice_napi_poll(struct napi_struct *napi, int budget) * budget and be more aggressive about cleaning up the Tx descriptors. */ ice_for_each_tx_ring(tx_ring, q_vector->tx) { + struct xsk_buff_pool *xsk_pool = READ_ONCE(tx_ring->xsk_pool); bool wd; - if (tx_ring->xsk_pool) - wd = ice_xmit_zc(tx_ring); + if (xsk_pool) + wd = ice_xmit_zc(tx_ring, xsk_pool); else if (ice_ring_is_xdp(tx_ring)) wd = true; else @@ -1550,6 +1551,7 @@ int ice_napi_poll(struct napi_struct *napi, int budget) budget_per_ring = budget; ice_for_each_rx_ring(rx_ring, q_vector->rx) { + struct xsk_buff_pool *xsk_pool = READ_ONCE(rx_ring->xsk_pool); int cleaned; /* A dedicated path for zero-copy allows making a single @@ -1557,7 +1559,7 @@ int ice_napi_poll(struct napi_struct *napi, int budget) * ice_clean_rx_irq function and makes the codebase cleaner. */ cleaned = rx_ring->xsk_pool ? 
- ice_clean_rx_irq_zc(rx_ring, budget_per_ring) : + ice_clean_rx_irq_zc(rx_ring, xsk_pool, budget_per_ring) : ice_clean_rx_irq(rx_ring, budget_per_ring); work_done += cleaned; /* if we clean as many as budgeted, we must not be done */ diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 3fbe4cfadfbf..ee084ad80a61 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -250,6 +250,8 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) ice_qvec_toggle_napi(vsi, q_vector, true); ice_qvec_ena_irq(vsi, q_vector); + /* make sure NAPI sees updated ice_{t,x}_ring::xsk_pool */ + synchronize_net(); ice_get_link_status(vsi->port_info, &link_up); if (link_up) { netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); @@ -464,6 +466,7 @@ static u16 ice_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp, /** * __ice_alloc_rx_bufs_zc - allocate a number of Rx buffers * @rx_ring: Rx ring + * @xsk_pool: XSK buffer pool to pick buffers to be filled by HW * @count: The number of buffers to allocate * * Place the @count of descriptors onto Rx ring. Handle the ring wrap @@ -472,7 +475,8 @@ static u16 ice_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp, * * Returns true if all allocations were successful, false if any fail. */ -static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) +static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, u16 count) { u32 nb_buffs_extra = 0, nb_buffs = 0; union ice_32b_rx_flex_desc *rx_desc; @@ -484,8 +488,7 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) xdp = ice_xdp_buf(rx_ring, ntu); if (ntu + count >= rx_ring->count) { - nb_buffs_extra = ice_fill_rx_descs(rx_ring->xsk_pool, xdp, - rx_desc, + nb_buffs_extra = ice_fill_rx_descs(xsk_pool, xdp, rx_desc, rx_ring->count - ntu); if (nb_buffs_extra != rx_ring->count - ntu) { ntu += nb_buffs_extra; @@ -498,7 +501,7 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) ice_release_rx_desc(rx_ring, 0); } - nb_buffs = ice_fill_rx_descs(rx_ring->xsk_pool, xdp, rx_desc, count); + nb_buffs = ice_fill_rx_descs(xsk_pool, xdp, rx_desc, count); ntu += nb_buffs; if (ntu == rx_ring->count) @@ -514,6 +517,7 @@ exit: /** * ice_alloc_rx_bufs_zc - allocate a number of Rx buffers * @rx_ring: Rx ring + * @xsk_pool: XSK buffer pool to pick buffers to be filled by HW * @count: The number of buffers to allocate * * Wrapper for internal allocation routine; figure out how many tail @@ -521,7 +525,8 @@ exit: * * Returns true if all calls to internal alloc routine succeeded */ -bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) +bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, u16 count) { u16 rx_thresh = ICE_RING_QUARTER(rx_ring); u16 leftover, i, tail_bumps; @@ -530,9 +535,9 @@ bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) leftover = count - (tail_bumps * rx_thresh); for (i = 0; i < tail_bumps; i++) - if (!__ice_alloc_rx_bufs_zc(rx_ring, rx_thresh)) + if (!__ice_alloc_rx_bufs_zc(rx_ring, xsk_pool, rx_thresh)) return false; - return __ice_alloc_rx_bufs_zc(rx_ring, leftover); + return __ice_alloc_rx_bufs_zc(rx_ring, xsk_pool, leftover); } /** @@ -601,8 +606,10 @@ out: /** * ice_clean_xdp_irq_zc - produce AF_XDP descriptors to CQ * @xdp_ring: XDP Tx ring + * @xsk_pool: AF_XDP buffer pool pointer */ -static u32 
ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) +static u32 ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool) { u16 ntc = xdp_ring->next_to_clean; struct ice_tx_desc *tx_desc; @@ -653,7 +660,7 @@ skip: if (xdp_ring->next_to_clean >= cnt) xdp_ring->next_to_clean -= cnt; if (xsk_frames) - xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); + xsk_tx_completed(xsk_pool, xsk_frames); return completed_frames; } @@ -662,6 +669,7 @@ skip: * ice_xmit_xdp_tx_zc - AF_XDP ZC handler for XDP_TX * @xdp: XDP buffer to xmit * @xdp_ring: XDP ring to produce descriptor onto + * @xsk_pool: AF_XDP buffer pool pointer * * note that this function works directly on xdp_buff, no need to convert * it to xdp_frame. xdp_buff pointer is stored to ice_tx_buf so that cleaning @@ -671,7 +679,8 @@ skip: * was not enough space on XDP ring */ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp, - struct ice_tx_ring *xdp_ring) + struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool) { struct skb_shared_info *sinfo = NULL; u32 size = xdp->data_end - xdp->data; @@ -685,7 +694,7 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp, free_space = ICE_DESC_UNUSED(xdp_ring); if (free_space < ICE_RING_QUARTER(xdp_ring)) - free_space += ice_clean_xdp_irq_zc(xdp_ring); + free_space += ice_clean_xdp_irq_zc(xdp_ring, xsk_pool); if (unlikely(!free_space)) goto busy; @@ -705,7 +714,7 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp, dma_addr_t dma; dma = xsk_buff_xdp_get_dma(xdp); - xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, size); + xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, size); tx_buf->xdp = xdp; tx_buf->type = ICE_TX_BUF_XSK_TX; @@ -747,12 +756,14 @@ busy: * @xdp: xdp_buff used as input to the XDP program * @xdp_prog: XDP program to run * @xdp_ring: ring to be used for XDP_TX action + * @xsk_pool: AF_XDP buffer pool pointer * * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR} */ static int ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring) + struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool) { int err, result = ICE_XDP_PASS; u32 act; @@ -763,7 +774,7 @@ ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); if (!err) return ICE_XDP_REDIR; - if (xsk_uses_need_wakeup(rx_ring->xsk_pool) && err == -ENOBUFS) + if (xsk_uses_need_wakeup(xsk_pool) && err == -ENOBUFS) result = ICE_XDP_EXIT; else result = ICE_XDP_CONSUMED; @@ -774,7 +785,7 @@ ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, case XDP_PASS: break; case XDP_TX: - result = ice_xmit_xdp_tx_zc(xdp, xdp_ring); + result = ice_xmit_xdp_tx_zc(xdp, xdp_ring, xsk_pool); if (result == ICE_XDP_CONSUMED) goto out_failure; break; @@ -826,14 +837,16 @@ ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first, /** * ice_clean_rx_irq_zc - consumes packets from the hardware ring * @rx_ring: AF_XDP Rx ring + * @xsk_pool: AF_XDP buffer pool pointer * @budget: NAPI budget * * Returns number of processed packets on success, remaining budget on failure. 
*/ -int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) +int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, + int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; - struct xsk_buff_pool *xsk_pool = rx_ring->xsk_pool; u32 ntc = rx_ring->next_to_clean; u32 ntu = rx_ring->next_to_use; struct xdp_buff *first = NULL; @@ -896,7 +909,8 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) if (ice_is_non_eop(rx_ring, rx_desc)) continue; - xdp_res = ice_run_xdp_zc(rx_ring, first, xdp_prog, xdp_ring); + xdp_res = ice_run_xdp_zc(rx_ring, first, xdp_prog, xdp_ring, + xsk_pool); if (likely(xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))) { xdp_xmit |= xdp_res; } else if (xdp_res == ICE_XDP_EXIT) { @@ -945,7 +959,8 @@ construct_skb: rx_ring->next_to_clean = ntc; entries_to_alloc = ICE_RX_DESC_UNUSED(rx_ring); if (entries_to_alloc > ICE_RING_QUARTER(rx_ring)) - failure |= !ice_alloc_rx_bufs_zc(rx_ring, entries_to_alloc); + failure |= !ice_alloc_rx_bufs_zc(rx_ring, xsk_pool, + entries_to_alloc); ice_finalize_xdp_rx(xdp_ring, xdp_xmit, 0); ice_update_rx_ring_stats(rx_ring, total_rx_packets, total_rx_bytes); @@ -968,17 +983,19 @@ construct_skb: /** * ice_xmit_pkt - produce a single HW Tx descriptor out of AF_XDP descriptor * @xdp_ring: XDP ring to produce the HW Tx descriptor on + * @xsk_pool: XSK buffer pool to pick buffers to be consumed by HW * @desc: AF_XDP descriptor to pull the DMA address and length from * @total_bytes: bytes accumulator that will be used for stats update */ -static void ice_xmit_pkt(struct ice_tx_ring *xdp_ring, struct xdp_desc *desc, +static void ice_xmit_pkt(struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool, struct xdp_desc *desc, unsigned int *total_bytes) { struct ice_tx_desc *tx_desc; dma_addr_t dma; - dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr); - xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len); + dma = xsk_buff_raw_get_dma(xsk_pool, desc->addr); + xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, desc->len); tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_to_use++); tx_desc->buf_addr = cpu_to_le64(dma); @@ -991,10 +1008,13 @@ static void ice_xmit_pkt(struct ice_tx_ring *xdp_ring, struct xdp_desc *desc, /** * ice_xmit_pkt_batch - produce a batch of HW Tx descriptors out of AF_XDP descriptors * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * @xsk_pool: XSK buffer pool to pick buffers to be consumed by HW * @descs: AF_XDP descriptors to pull the DMA addresses and lengths from * @total_bytes: bytes accumulator that will be used for stats update */ -static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, struct xdp_desc *descs, +static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool, + struct xdp_desc *descs, unsigned int *total_bytes) { u16 ntu = xdp_ring->next_to_use; @@ -1004,8 +1024,8 @@ static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, struct xdp_desc *de loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { dma_addr_t dma; - dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, descs[i].addr); - xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, descs[i].len); + dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr); + xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, descs[i].len); tx_desc = ICE_TX_DESC(xdp_ring, ntu++); tx_desc->buf_addr = cpu_to_le64(dma); @@ -1021,37 +1041,41 @@ static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, struct xdp_desc *de /** * ice_fill_tx_hw_ring - produce the 
number of Tx descriptors onto ring * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * @xsk_pool: XSK buffer pool to pick buffers to be consumed by HW * @descs: AF_XDP descriptors to pull the DMA addresses and lengths from * @nb_pkts: count of packets to be send * @total_bytes: bytes accumulator that will be used for stats update */ -static void ice_fill_tx_hw_ring(struct ice_tx_ring *xdp_ring, struct xdp_desc *descs, - u32 nb_pkts, unsigned int *total_bytes) +static void ice_fill_tx_hw_ring(struct ice_tx_ring *xdp_ring, + struct xsk_buff_pool *xsk_pool, + struct xdp_desc *descs, u32 nb_pkts, + unsigned int *total_bytes) { u32 batched, leftover, i; batched = ALIGN_DOWN(nb_pkts, PKTS_PER_BATCH); leftover = nb_pkts & (PKTS_PER_BATCH - 1); for (i = 0; i < batched; i += PKTS_PER_BATCH) - ice_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes); + ice_xmit_pkt_batch(xdp_ring, xsk_pool, &descs[i], total_bytes); for (; i < batched + leftover; i++) - ice_xmit_pkt(xdp_ring, &descs[i], total_bytes); + ice_xmit_pkt(xdp_ring, xsk_pool, &descs[i], total_bytes); } /** * ice_xmit_zc - take entries from XSK Tx ring and place them onto HW Tx ring * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * @xsk_pool: AF_XDP buffer pool pointer * * Returns true if there is no more work that needs to be done, false otherwise */ -bool ice_xmit_zc(struct ice_tx_ring *xdp_ring) +bool ice_xmit_zc(struct ice_tx_ring *xdp_ring, struct xsk_buff_pool *xsk_pool) { - struct xdp_desc *descs = xdp_ring->xsk_pool->tx_descs; + struct xdp_desc *descs = xsk_pool->tx_descs; u32 nb_pkts, nb_processed = 0; unsigned int total_bytes = 0; int budget; - ice_clean_xdp_irq_zc(xdp_ring); + ice_clean_xdp_irq_zc(xdp_ring, xsk_pool); if (!netif_carrier_ok(xdp_ring->vsi->netdev) || !netif_running(xdp_ring->vsi->netdev)) @@ -1060,25 +1084,26 @@ bool ice_xmit_zc(struct ice_tx_ring *xdp_ring) budget = ICE_DESC_UNUSED(xdp_ring); budget = min_t(u16, budget, ICE_RING_QUARTER(xdp_ring)); - nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, budget); + nb_pkts = xsk_tx_peek_release_desc_batch(xsk_pool, budget); if (!nb_pkts) return true; if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) { nb_processed = xdp_ring->count - xdp_ring->next_to_use; - ice_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes); + ice_fill_tx_hw_ring(xdp_ring, xsk_pool, descs, nb_processed, + &total_bytes); xdp_ring->next_to_use = 0; } - ice_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed, - &total_bytes); + ice_fill_tx_hw_ring(xdp_ring, xsk_pool, &descs[nb_processed], + nb_pkts - nb_processed, &total_bytes); ice_set_rs_bit(xdp_ring); ice_xdp_ring_update_tail(xdp_ring); ice_update_tx_ring_stats(xdp_ring, nb_pkts, total_bytes); - if (xsk_uses_need_wakeup(xdp_ring->xsk_pool)) - xsk_set_tx_need_wakeup(xdp_ring->xsk_pool); + if (xsk_uses_need_wakeup(xsk_pool)) + xsk_set_tx_need_wakeup(xsk_pool); return nb_pkts < budget; } @@ -1111,7 +1136,7 @@ ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, ring = vsi->rx_rings[queue_id]->xdp_ring; - if (!ring->xsk_pool) + if (!READ_ONCE(ring->xsk_pool)) return -EINVAL; /* The idea here is that if NAPI is running, mark a miss, so diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.h b/drivers/net/ethernet/intel/ice/ice_xsk.h index 6fa181f080ef..45adeb513253 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.h +++ b/drivers/net/ethernet/intel/ice/ice_xsk.h @@ -20,16 +20,20 @@ struct ice_vsi; #ifdef CONFIG_XDP_SOCKETS int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool 
*pool, u16 qid); -int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget); +int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, + int budget); int ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags); -bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count); +bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, u16 count); bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi); void ice_xsk_clean_rx_ring(struct ice_rx_ring *rx_ring); void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring); -bool ice_xmit_zc(struct ice_tx_ring *xdp_ring); +bool ice_xmit_zc(struct ice_tx_ring *xdp_ring, struct xsk_buff_pool *xsk_pool); int ice_realloc_zc_buf(struct ice_vsi *vsi, bool zc); #else -static inline bool ice_xmit_zc(struct ice_tx_ring __always_unused *xdp_ring) +static inline bool ice_xmit_zc(struct ice_tx_ring __always_unused *xdp_ring, + struct xsk_buff_pool __always_unused *xsk_pool) { return false; } @@ -44,6 +48,7 @@ ice_xsk_pool_setup(struct ice_vsi __always_unused *vsi, static inline int ice_clean_rx_irq_zc(struct ice_rx_ring __always_unused *rx_ring, + struct xsk_buff_pool __always_unused *xsk_pool, int __always_unused budget) { return 0; @@ -51,6 +56,7 @@ ice_clean_rx_irq_zc(struct ice_rx_ring __always_unused *rx_ring, static inline bool ice_alloc_rx_bufs_zc(struct ice_rx_ring __always_unused *rx_ring, + struct xsk_buff_pool __always_unused *xsk_pool, u16 __always_unused count) { return false; -- cgit From 6044ca26210ba72b3dcc649fae1cbedd9e6ab018 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:15 +0200 Subject: ice: add missing WRITE_ONCE when clearing ice_rx_ring::xdp_prog It is read by data path and modified from process context on remote cpu so it is needed to use WRITE_ONCE to clear the pointer. Fixes: efc2214b6047 ("ice: Add support for XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 0f91e9167427..8d25b6981269 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -456,7 +456,7 @@ void ice_free_rx_ring(struct ice_rx_ring *rx_ring) if (rx_ring->vsi->type == ICE_VSI_PF) if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) xdp_rxq_info_unreg(&rx_ring->xdp_rxq); - rx_ring->xdp_prog = NULL; + WRITE_ONCE(rx_ring->xdp_prog, NULL); if (rx_ring->xsk_pool) { kfree(rx_ring->xdp_buf); rx_ring->xdp_buf = NULL; -- cgit From 963fb4612295a5c35b1b89c8bff3bdd4f9127af6 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 26 Jul 2024 20:17:16 +0200 Subject: ice: xsk: fix txq interrupt mapping ice_cfg_txq_interrupt() internally handles XDP Tx ring. Do not use ice_for_each_tx_ring() in ice_qvec_cfg_msix() as this causing us to treat XDP ring that belongs to queue vector as Tx ring and therefore misconfiguring the interrupts. 
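A distilled view of the indexing change in the diff below; cfg_txq_irq() is a hypothetical stand-in for ice_cfg_txq_interrupt(), which already programs the XDP ring paired with a given queue id:

#include <linux/types.h>

static void cfg_txq_irq(u16 qid, u16 vector)
{
	/* hypothetical stub standing in for ice_cfg_txq_interrupt() */
}

static void map_vector_tx_irqs(u16 base_qid, u16 num_ring_tx, u16 vector)
{
	u16 qid = base_qid;
	int q;

	/*
	 * Walk absolute queue ids rather than the vector's Tx ring list,
	 * which would also visit the XDP ring and misprogram its interrupt.
	 */
	for (q = 0; q < num_ring_tx; q++, qid++)
		cfg_txq_irq(qid, vector);
}
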
Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Reviewed-by: Shannon Nelson Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index ee084ad80a61..240a7bec242b 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -110,25 +110,29 @@ ice_qvec_dis_irq(struct ice_vsi *vsi, struct ice_rx_ring *rx_ring, * ice_qvec_cfg_msix - Enable IRQ for given queue vector * @vsi: the VSI that contains queue vector * @q_vector: queue vector + * @qid: queue index */ static void -ice_qvec_cfg_msix(struct ice_vsi *vsi, struct ice_q_vector *q_vector) +ice_qvec_cfg_msix(struct ice_vsi *vsi, struct ice_q_vector *q_vector, u16 qid) { u16 reg_idx = q_vector->reg_idx; struct ice_pf *pf = vsi->back; struct ice_hw *hw = &pf->hw; - struct ice_tx_ring *tx_ring; - struct ice_rx_ring *rx_ring; + int q, _qid = qid; ice_cfg_itr(hw, q_vector); - ice_for_each_tx_ring(tx_ring, q_vector->tx) - ice_cfg_txq_interrupt(vsi, tx_ring->reg_idx, reg_idx, - q_vector->tx.itr_idx); + for (q = 0; q < q_vector->num_ring_tx; q++) { + ice_cfg_txq_interrupt(vsi, _qid, reg_idx, q_vector->tx.itr_idx); + _qid++; + } - ice_for_each_rx_ring(rx_ring, q_vector->rx) - ice_cfg_rxq_interrupt(vsi, rx_ring->reg_idx, reg_idx, - q_vector->rx.itr_idx); + _qid = qid; + + for (q = 0; q < q_vector->num_ring_rx; q++) { + ice_cfg_rxq_interrupt(vsi, _qid, reg_idx, q_vector->rx.itr_idx); + _qid++; + } ice_flush(hw); } @@ -241,7 +245,7 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) fail = err; q_vector = vsi->rx_rings[q_idx]->q_vector; - ice_qvec_cfg_msix(vsi, q_vector); + ice_qvec_cfg_msix(vsi, q_vector, q_idx); err = ice_vsi_ctrl_one_rx_ring(vsi, true, q_idx, true); if (!fail) -- cgit From 478574370bef7951fbd9ef5155537d6cbed49472 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 22 Jul 2024 16:49:45 -0700 Subject: btrfs: make cow_file_range_inline() honor locked_page on error The btrfs buffered write path runs through __extent_writepage() which has some tricky return value handling for writepage_delalloc(). Specifically, when that returns 1, we exit, but for other return values we continue and end up calling btrfs_folio_end_all_writers(). If the folio has been unlocked (note that we check the PageLocked bit at the start of __extent_writepage()), this results in an assert panic like this one from syzbot: BTRFS: error (device loop0 state EAL) in free_log_tree:3267: errno=-5 IO failure BTRFS warning (device loop0 state EAL): Skipping commit of aborted transaction. BTRFS: error (device loop0 state EAL) in cleanup_transaction:2018: errno=-5 IO failure assertion failed: folio_test_locked(folio), in fs/btrfs/subpage.c:871 ------------[ cut here ]------------ kernel BUG at fs/btrfs/subpage.c:871! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 PID: 5090 Comm: syz-executor225 Not tainted 6.10.0-syzkaller-05505-gb1bc554e009e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/27/2024 RIP: 0010:btrfs_folio_end_all_writers+0x55b/0x610 fs/btrfs/subpage.c:871 Code: e9 d3 fb ff ff e8 25 22 c2 fd 48 c7 c7 c0 3c 0e 8c 48 c7 c6 80 3d 0e 8c 48 c7 c2 60 3c 0e 8c b9 67 03 00 00 e8 66 47 ad 07 90 <0f> 0b e8 6e 45 b0 07 4c 89 ff be 08 00 00 00 e8 21 12 25 fe 4c 89 RSP: 0018:ffffc900033d72e0 EFLAGS: 00010246 RAX: 0000000000000045 RBX: 00fff0000000402c RCX: 663b7a08c50a0a00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffffc900033d73b0 R08: ffffffff8176b98c R09: 1ffff9200067adfc R10: dffffc0000000000 R11: fffff5200067adfd R12: 0000000000000001 R13: dffffc0000000000 R14: 0000000000000000 R15: ffffea0001cbee80 FS: 0000000000000000(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f5f076012f8 CR3: 000000000e134000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __extent_writepage fs/btrfs/extent_io.c:1597 [inline] extent_write_cache_pages fs/btrfs/extent_io.c:2251 [inline] btrfs_writepages+0x14d7/0x2760 fs/btrfs/extent_io.c:2373 do_writepages+0x359/0x870 mm/page-writeback.c:2656 filemap_fdatawrite_wbc+0x125/0x180 mm/filemap.c:397 __filemap_fdatawrite_range mm/filemap.c:430 [inline] __filemap_fdatawrite mm/filemap.c:436 [inline] filemap_flush+0xdf/0x130 mm/filemap.c:463 btrfs_release_file+0x117/0x130 fs/btrfs/file.c:1547 __fput+0x24a/0x8a0 fs/file_table.c:422 task_work_run+0x24f/0x310 kernel/task_work.c:222 exit_task_work include/linux/task_work.h:40 [inline] do_exit+0xa2f/0x27f0 kernel/exit.c:877 do_group_exit+0x207/0x2c0 kernel/exit.c:1026 __do_sys_exit_group kernel/exit.c:1037 [inline] __se_sys_exit_group kernel/exit.c:1035 [inline] __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1035 x64_sys_call+0x2634/0x2640 arch/x86/include/generated/asm/syscalls_64.h:232 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5f075b70c9 Code: Unable to access opcode bytes at 0x7f5f075b709f. I was hitting the same issue by doing hundreds of accelerated runs of generic/475, which also hits IO errors by design. I instrumented that reproducer with bpftrace and found that the undesirable folio_unlock was coming from the following callstack: folio_unlock+5 __process_pages_contig+475 cow_file_range_inline.constprop.0+230 cow_file_range+803 btrfs_run_delalloc_range+566 writepage_delalloc+332 __extent_writepage # inlined in my stacktrace, but I added it here extent_write_cache_pages+622 Looking at the bisected-to patch in the syzbot report, Josef realized that the logic of the cow_file_range_inline error path subtly changing. In the past, on error, it jumped to out_unlock in cow_file_range(), which honors the locked_page, so when we ultimately call folio_end_all_writers(), the folio of interest is still locked. After the change, we always unlocked ignoring the locked_page, on both success and error. On the success path, this all results in returning 1 to __extent_writepage(), which skips the folio_end_all_writers() call, which makes it OK to have unlocked. Fix the bug by wiring the locked_page into cow_file_range_inline() and only setting locked_page to NULL on success. 
Reported-by: syzbot+a14d8ac9af3a2a4fd0c8@syzkaller.appspotmail.com Fixes: 0586d0a89e77 ("btrfs: move extent bit and page cleanup into cow_file_range_inline") CC: stable@vger.kernel.org # 6.10+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8f38eefc8acd..8ca3878348ff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -714,8 +714,9 @@ out: return ret; } -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 offset, - u64 end, +static noinline int cow_file_range_inline(struct btrfs_inode *inode, + struct page *locked_page, + u64 offset, u64 end, size_t compressed_size, int compress_type, struct folio *compressed_folio, @@ -739,7 +740,10 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 offset, return ret; } - extent_clear_unlock_delalloc(inode, offset, end, NULL, &cached, + if (ret == 0) + locked_page = NULL; + + extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached, clear_flags, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); @@ -1043,10 +1047,10 @@ again: * extent for the subpage case. */ if (total_in < actual_end) - ret = cow_file_range_inline(inode, start, end, 0, + ret = cow_file_range_inline(inode, NULL, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); else - ret = cow_file_range_inline(inode, start, end, total_compressed, + ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, compress_type, folios[0], false); if (ret <= 0) { if (ret < 0) @@ -1359,7 +1363,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, start, end, 0, + ret = cow_file_range_inline(inode, locked_page, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); if (ret <= 0) { /* -- cgit From d89c285d28491d8f10534c262ac9e6bdcbe1b4d2 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 11 Jul 2024 23:50:58 +0900 Subject: btrfs: do not subtract delalloc from avail bytes The block group's avail bytes printed when dumping a space info subtract the delalloc_bytes. However, as shown in btrfs_add_reserved_bytes() and btrfs_free_reserved_bytes(), it is added or subtracted along with "reserved" for the delalloc case, which means the "delalloc_bytes" is a part of the "reserved" bytes. So, excluding it to calculate the avail space counts delalloc_bytes twice, which can lead to an invalid result. 
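A toy calculation (made-up numbers, field names borrowed from struct btrfs_block_group) shows the double count removed by the one-line change below: delalloc_bytes is already contained in reserved, so subtracting it again understates the available space.

#include <stdio.h>

int main(void)
{
	/* Illustrative values only; the 150 bytes of delalloc are part of
	 * the 200 bytes already accounted as reserved. */
	unsigned long long length = 1024, used = 300, pinned = 100;
	unsigned long long reserved = 200, delalloc_bytes = 150;
	unsigned long long bytes_super = 4, zone_unusable = 0;

	printf("old avail: %llu\n",	/* 270: delalloc subtracted twice */
	       length - used - pinned - reserved - delalloc_bytes -
	       bytes_super - zone_unusable);
	printf("new avail: %llu\n",	/* 420 */
	       length - used - pinned - reserved - bytes_super -
	       zone_unusable);
	return 0;
}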
Fixes: e50b122b832b ("btrfs: print available space for a block group when dumping a space info") CC: stable@vger.kernel.org # 6.6+ Signed-off-by: Naohiro Aota Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9ac94d3119e8..c1d9d3664400 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -583,8 +583,7 @@ again: spin_lock(&cache->lock); avail = cache->length - cache->used - cache->pinned - - cache->reserved - cache->delalloc_bytes - - cache->bytes_super - cache->zone_unusable; + cache->reserved - cache->bytes_super - cache->zone_unusable; btrfs_info(fs_info, "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", cache->start, cache->length, cache->used, cache->pinned, -- cgit From 8cd44dd1d17a23d5cc8c443c659ca57aa76e2fa5 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 15 Feb 2023 09:18:02 +0900 Subject: btrfs: zoned: fix zone_unusable accounting on making block group read-write again When btrfs makes a block group read-only, it adds all free regions in the block group to space_info->bytes_readonly. That free space excludes reserved and pinned regions. OTOH, when btrfs makes the block group read-write again, it moves all the unused regions into the block group's zone_unusable. That unused region includes reserved and pinned regions. As a result, it counts too much zone_unusable bytes. Fortunately (or unfortunately), having erroneous zone_unusable does not affect the calculation of space_info->bytes_readonly, because free space (num_bytes in btrfs_dec_block_group_ro) calculation is done based on the erroneous zone_unusable and it reduces the num_bytes just to cancel the error. This behavior can be easily discovered by adding a WARN_ON to check e.g, "bg->pinned > 0" in btrfs_dec_block_group_ro(), and running fstests test case like btrfs/282. Fix it by properly considering pinned and reserved in btrfs_dec_block_group_ro(). Also, add a WARN_ON and introduce btrfs_space_info_update_bytes_zone_unusable() to catch a similar mistake. 
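With made-up numbers, the difference in the zone_unusable value computed when a zoned block group goes back to read-write (the corrected formula is in the btrfs_dec_block_group_ro() hunk below) can be illustrated like this:

#include <stdio.h>

int main(void)
{
	/* Toy state of a zoned block group at the moment it is made
	 * read-write again; values are illustrative only. */
	unsigned long long length = 1024, zone_capacity = 1000;
	unsigned long long alloc_offset = 800, used = 500;
	unsigned long long pinned = 100, reserved = 100;

	/* Old: pinned and reserved regions are wrongly counted as unusable. */
	unsigned long long old_unusable = (alloc_offset - used) +
					  (length - zone_capacity);	/* 324 */
	/* New: only space that is neither used nor still in flight counts. */
	unsigned long long new_unusable = (alloc_offset - used - pinned -
					   reserved) +
					  (length - zone_capacity);	/* 124 */

	printf("old=%llu new=%llu\n", old_unusable, new_unusable);
	return 0;
}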
Fixes: 169e0da91a21 ("btrfs: zoned: track unusable bytes for zones") CC: stable@vger.kernel.org # 5.15+ Signed-off-by: Naohiro Aota Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 13 ++++++++----- fs/btrfs/extent-tree.c | 3 ++- fs/btrfs/free-space-cache.c | 4 +++- fs/btrfs/space-info.c | 2 +- fs/btrfs/space-info.h | 1 + include/trace/events/btrfs.h | 8 ++++++++ 6 files changed, 23 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 498442d0c216..2e49d978f504 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1223,8 +1223,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->space_info->total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); - block_group->space_info->bytes_zone_unusable -= - block_group->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info, + -block_group->zone_unusable); block_group->space_info->disk_total -= block_group->length * factor; spin_unlock(&block_group->space_info->lock); @@ -1396,7 +1396,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes to readonly */ sinfo->bytes_readonly += cache->zone_unusable; - sinfo->bytes_zone_unusable -= cache->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, + -cache->zone_unusable); cache->zone_unusable = 0; } cache->ro++; @@ -3056,9 +3057,11 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes back */ cache->zone_unusable = - (cache->alloc_offset - cache->used) + + (cache->alloc_offset - cache->used - cache->pinned - + cache->reserved) + (cache->length - cache->zone_capacity); - sinfo->bytes_zone_unusable += cache->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, + cache->zone_unusable); sinfo->bytes_readonly -= cache->zone_unusable; } num_bytes = cache->length - cache->reserved - diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d77498e7671c..ff9f0d41987e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2793,7 +2793,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ - space_info->bytes_zone_unusable += len; + btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info, + len); readonly = true; } spin_unlock(&cache->lock); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3f9b7507543a..f5996a43db24 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2723,8 +2723,10 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, * If the block group is read-only, we should account freed space into * bytes_readonly. 
*/ - if (!block_group->ro) + if (!block_group->ro) { block_group->zone_unusable += to_unusable; + WARN_ON(block_group->zone_unusable > block_group->length); + } spin_unlock(&ctl->tree_lock); if (!used) { spin_lock(&block_group->lock); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c1d9d3664400..68e14fd48638 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -316,7 +316,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, found->bytes_used += block_group->used; found->disk_used += block_group->used * factor; found->bytes_readonly += block_group->bytes_super; - found->bytes_zone_unusable += block_group->zone_unusable; + btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable); if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 4db8a0267c16..88b44221ce97 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -249,6 +249,7 @@ btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); +DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable"); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index eeb56975bee7..de55a555d95b 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2383,6 +2383,14 @@ DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned, TP_ARGS(fs_info, sinfo, old, diff) ); +DEFINE_EVENT(btrfs__space_info_update, update_bytes_zone_unusable, + + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, u64 old, s64 diff), + + TP_ARGS(fs_info, sinfo, old, diff) +); + DECLARE_EVENT_CLASS(btrfs_raid56_bio, TP_PROTO(const struct btrfs_raid_bio *rbio, -- cgit From 939b656bc8ab203fdbde26ccac22bcb7f0985be5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 26 Jul 2024 11:12:52 +0100 Subject: btrfs: fix corruption after buffer fault in during direct IO append write During an append (O_APPEND write flag) direct IO write if the input buffer was not previously faulted in, we can corrupt the file in a way that the final size is unexpected and it includes an unexpected hole. 
The problem happens like this: 1) We have an empty file, with size 0, for example; 2) We do an O_APPEND direct IO with a length of 4096 bytes and the input buffer is not currently faulted in; 3) We enter btrfs_direct_write(), lock the inode and call generic_write_checks(), which calls generic_write_checks_count(), and that function sets the iocb position to 0 with the following code: if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); 4) We call btrfs_dio_write() and enter into iomap, which will end up calling btrfs_dio_iomap_begin() and that calls btrfs_get_blocks_direct_write(), where we update the i_size of the inode to 4096 bytes; 5) After btrfs_dio_iomap_begin() returns, iomap will attempt to access the page of the write input buffer (at iomap_dio_bio_iter(), with a call to bio_iov_iter_get_pages()) and fail with -EFAULT, which gets returned to btrfs at btrfs_direct_write() via btrfs_dio_write(); 6) At btrfs_direct_write() we get the -EFAULT error, unlock the inode, fault in the write buffer and then goto to the label 'relock'; 7) We lock again the inode, do all the necessary checks again and call again generic_write_checks(), which calls generic_write_checks_count() again, and there we set the iocb's position to 4K, which is the current i_size of the inode, with the following code pointed above: if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); 8) Then we go again to btrfs_dio_write() and enter iomap and the write succeeds, but it wrote to the file range [4K, 8K), leaving a hole in the [0, 4K) range and an i_size of 8K, which goes against the expectations of having the data written to the range [0, 4K) and get an i_size of 4K. Fix this by not unlocking the inode before faulting in the input buffer, in case we get -EFAULT or an incomplete write, and not jumping to the 'relock' label after faulting in the buffer - instead jump to a location immediately before calling iomap, skipping all the write checks and relocking. This solves this problem and it's fine even in case the input buffer is memory mapped to the same file range, since only holding the range locked in the inode's io tree can cause a deadlock, it's safe to keep the inode lock (VFS lock), as was fixed and described in commit 51bd9563b678 ("btrfs: fix deadlock due to page faults during direct IO reads and writes"). A sample reproducer provided by a reporter is the following: $ cat test.c #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include int main(int argc, char *argv[]) { if (argc < 2) { fprintf(stderr, "Usage: %s \n", argv[0]); return 1; } int fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT | O_APPEND, 0644); if (fd < 0) { perror("creating test file"); return 1; } char *buf = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ssize_t ret = write(fd, buf, 4096); if (ret < 0) { perror("pwritev2"); return 1; } struct stat stbuf; ret = fstat(fd, &stbuf); if (ret < 0) { perror("stat"); return 1; } printf("size: %llu\n", (unsigned long long)stbuf.st_size); return stbuf.st_size == 4096 ? 0 : 1; } A test case for fstests will be sent soon. 
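The #include directives in the reproducer above came through without their header names, and the usage string lost its argument placeholder. A filled-in version, assuming the headers these calls require (fcntl.h, stdio.h, sys/mman.h, sys/stat.h, unistd.h) and a "<file>" placeholder -- both assumptions, not taken from the original:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "Usage: %s <file>\n", argv[0]);
		return 1;
	}

	int fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT | O_APPEND, 0644);

	if (fd < 0) {
		perror("creating test file");
		return 1;
	}

	/* Buffer is not faulted in before the write, which is what triggers
	 * the -EFAULT retry path described above. */
	char *buf = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	ssize_t ret = write(fd, buf, 4096);

	if (ret < 0) {
		perror("pwritev2");
		return 1;
	}

	struct stat stbuf;

	ret = fstat(fd, &stbuf);
	if (ret < 0) {
		perror("stat");
		return 1;
	}

	printf("size: %llu\n", (unsigned long long)stbuf.st_size);
	return stbuf.st_size == 4096 ? 0 : 1;
}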
Reported-by: Hanna Czenczek Link: https://lore.kernel.org/linux-btrfs/0b841d46-12fe-4e64-9abb-871d8d0de271@redhat.com/ Fixes: 8184620ae212 ("btrfs: fix lost file sync on direct IO write with nowait and dsync iocb") CC: stable@vger.kernel.org # 6.1+ Tested-by: Hanna Czenczek Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/direct-io.c | 38 ++++++++++++++++++++++++++++---------- fs/btrfs/file.c | 17 ++++++++++++++--- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c8568b1a61c4..75fa563e4cac 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -459,6 +459,7 @@ struct btrfs_file_private { void *filldir_buf; u64 last_index; struct extent_state *llseek_cached_state; + bool fsync_skip_inode_lock; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index f9fb2db6a1e4..67adbe9d294a 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -856,21 +856,37 @@ relock: * So here we disable page faults in the iov_iter and then retry if we * got -EFAULT, faulting in the pages before the retry. */ +again: from->nofault = true; dio = btrfs_dio_write(iocb, from, written); from->nofault = false; - /* - * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync - * iocb, and that needs to lock the inode. So unlock it before calling - * iomap_dio_complete() to avoid a deadlock. - */ - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - - if (IS_ERR_OR_NULL(dio)) + if (IS_ERR_OR_NULL(dio)) { ret = PTR_ERR_OR_ZERO(dio); - else + } else { + struct btrfs_file_private stack_private = { 0 }; + struct btrfs_file_private *private; + const bool have_private = (file->private_data != NULL); + + if (!have_private) + file->private_data = &stack_private; + + /* + * If we have a synchronous write, we must make sure the fsync + * triggered by the iomap_dio_complete() call below doesn't + * deadlock on the inode lock - we are already holding it and we + * can't call it after unlocking because we may need to complete + * partial writes due to the input buffer (or parts of it) not + * being already faulted in. + */ + private = file->private_data; + private->fsync_skip_inode_lock = true; ret = iomap_dio_complete(dio); + private->fsync_skip_inode_lock = false; + + if (!have_private) + file->private_data = NULL; + } /* No increment (+=) because iomap returns a cumulative value. */ if (ret > 0) @@ -897,10 +913,12 @@ relock: } else { fault_in_iov_iter_readable(from, left); prev_left = left; - goto relock; + goto again; } } + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + /* * If 'ret' is -ENOTBLK or we have not written all data, then it means * we must fallback to buffered IO. diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 21381de906f6..9f10a9f23fcc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1603,6 +1603,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { + struct btrfs_file_private *private = file->private_data; struct dentry *dentry = file_dentry(file); struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_root *root = inode->root; @@ -1612,6 +1613,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; u64 len; bool full_sync; + const bool skip_ilock = (private ? 
private->fsync_skip_inode_lock : false); trace_btrfs_sync_file(file, datasync); @@ -1639,7 +1641,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + if (skip_ilock) + down_write(&inode->i_mmap_lock); + else + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -1663,7 +1668,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -1788,7 +1796,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); -- cgit From 99d3bf5f7377d42f8be60a6b9cb60fb0be34dceb Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 29 Jul 2024 21:51:30 +0900 Subject: Input: MT - limit max slots syzbot is reporting too large allocation at input_mt_init_slots(), for num_slots is supplied from userspace using ioctl(UI_DEV_CREATE). Since nobody knows possible max slots, this patch chose 1024. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=0122fa359a69694395d5 Suggested-by: Dmitry Torokhov Signed-off-by: Tetsuo Handa Signed-off-by: Linus Torvalds --- drivers/input/input-mt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/input-mt.c b/drivers/input/input-mt.c index 14b53dac1253..6b04a674f832 100644 --- a/drivers/input/input-mt.c +++ b/drivers/input/input-mt.c @@ -46,6 +46,9 @@ int input_mt_init_slots(struct input_dev *dev, unsigned int num_slots, return 0; if (mt) return mt->num_slots != num_slots ? -EINVAL : 0; + /* Arbitrary limit for avoiding too large memory allocation. */ + if (num_slots > 1024) + return -EINVAL; mt = kzalloc(struct_size(mt, slots, num_slots), GFP_KERNEL); if (!mt) -- cgit From 7c51f7bbf057f82aeba3390c39ef61b244181c09 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 27 Jul 2024 19:59:57 +0900 Subject: profiling: remove prof_cpu_mask syzbot is reporting uninit-value at profile_hits(), for there is a race window between if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) return -ENOMEM; cpumask_copy(prof_cpu_mask, cpu_possible_mask); in profile_init() and cpumask_available(prof_cpu_mask) && cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) in profile_tick(); prof_cpu_mask remains uninitialzed until cpumask_copy() completes while cpumask_available(prof_cpu_mask) returns true as soon as alloc_cpumask_var(&prof_cpu_mask) completes. We could replace alloc_cpumask_var() with zalloc_cpumask_var() and call cpumask_copy() from create_proc_profile() on only UP kernels, for profile_online_cpu() calls cpumask_set_cpu() as needed via cpuhp_setup_state(CPUHP_AP_ONLINE_DYN) on SMP kernels. But this patch removes prof_cpu_mask because it seems unnecessary. 
The cpumask_test_cpu(smp_processor_id(), prof_cpu_mask) test in profile_tick() is likely always true due to a CPU cannot call profile_tick() if that CPU is offline and cpumask_set_cpu(cpu, prof_cpu_mask) is called when that CPU becomes online and cpumask_clear_cpu(cpu, prof_cpu_mask) is called when that CPU becomes offline . This test could be false during transition between online and offline. But according to include/linux/cpuhotplug.h , CPUHP_PROFILE_PREPARE belongs to PREPARE section, which means that the CPU subjected to profile_dead_cpu() cannot be inside profile_tick() (i.e. no risk of use-after-free bug) because interrupt for that CPU is disabled during PREPARE section. Therefore, this test is guaranteed to be true, and can be removed. (Since profile_hits() checks prof_buffer != NULL, we don't need to check prof_buffer != NULL here unless get_irq_regs() or user_mode() is such slow that we want to avoid when prof_buffer == NULL). do_profile_hits() is called from profile_tick() from timer interrupt only if cpumask_test_cpu(smp_processor_id(), prof_cpu_mask) is true and prof_buffer is not NULL. But syzbot is also reporting that sometimes do_profile_hits() is called while current thread is still doing vzalloc(), where prof_buffer must be NULL at this moment. This indicates that multiple threads concurrently tried to write to /sys/kernel/profiling interface, which caused that somebody else try to re-allocate prof_buffer despite somebody has already allocated prof_buffer. Fix this by using serialization. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=b1a83ab2a9eb9321fbdd Signed-off-by: Tetsuo Handa Tested-by: syzbot Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 7 +++++++ kernel/profile.c | 46 ++++++---------------------------------------- 2 files changed, 13 insertions(+), 40 deletions(-) diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 07fb5987b42b..1bab21b4718f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -92,7 +92,14 @@ static ssize_t profiling_store(struct kobject *kobj, const char *buf, size_t count) { int ret; + static DEFINE_MUTEX(lock); + /* + * We need serialization, for profile_setup() initializes prof_on + * value and profile_init() must not reallocate prof_buffer after + * once allocated. 
+ */ + guard(mutex)(&lock); if (prof_on) return -EEXIST; /* diff --git a/kernel/profile.c b/kernel/profile.c index 2b775cc5c28f..4654c6cd984e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -47,7 +47,6 @@ static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -static cpumask_var_t prof_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -114,11 +113,6 @@ int __ref profile_init(void) buffer_bytes = prof_len*sizeof(atomic_t); - if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; @@ -132,7 +126,6 @@ int __ref profile_init(void) if (prof_buffer) return 0; - free_cpumask_var(prof_cpu_mask); return -ENOMEM; } @@ -267,9 +260,6 @@ static int profile_dead_cpu(unsigned int cpu) struct page *page; int i; - if (cpumask_available(prof_cpu_mask)) - cpumask_clear_cpu(cpu, prof_cpu_mask); - for (i = 0; i < 2; i++) { if (per_cpu(cpu_profile_hits, cpu)[i]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); @@ -302,14 +292,6 @@ static int profile_prepare_cpu(unsigned int cpu) return 0; } -static int profile_online_cpu(unsigned int cpu) -{ - if (cpumask_available(prof_cpu_mask)) - cpumask_set_cpu(cpu, prof_cpu_mask); - - return 0; -} - #else /* !CONFIG_SMP */ #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) @@ -334,8 +316,8 @@ void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); - if (!user_mode(regs) && cpumask_available(prof_cpu_mask) && - cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) + /* This is the old kernel-only legacy profiling */ + if (!user_mode(regs)) profile_hit(type, (void *)profile_pc(regs)); } @@ -418,10 +400,6 @@ static const struct proc_ops profile_proc_ops = { int __ref create_proc_profile(void) { struct proc_dir_entry *entry; -#ifdef CONFIG_SMP - enum cpuhp_state online_state; -#endif - int err = 0; if (!prof_on) @@ -431,26 +409,14 @@ int __ref create_proc_profile(void) profile_prepare_cpu, profile_dead_cpu); if (err) return err; - - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", - profile_online_cpu, NULL); - if (err < 0) - goto err_state_prep; - online_state = err; - err = 0; #endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &profile_proc_ops); - if (!entry) - goto err_state_onl; - proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - - return err; -err_state_onl: + if (entry) + proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); #ifdef CONFIG_SMP - cpuhp_remove_state(online_state); -err_state_prep: - cpuhp_remove_state(CPUHP_PROFILE_PREPARE); + else + cpuhp_remove_state(CPUHP_PROFILE_PREPARE); #endif return err; } -- cgit From 2accfdb7eff65f390c4308b0e9cb7c3fe48ad63c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Jul 2024 10:58:28 -0700 Subject: profiling: attempt to remove per-cpu profile flip buffer This is the really old legacy kernel profiling code, which has long since been obviated by "real profiling" (ie 'prof' and company), and mainly remains as a source of syzbot reports. 
There are anecdotal reports that people still use it for boot-time profiling, but it's unlikely that such use would care about the old NUMA optimizations in this code from 2004 (commit ad02973d42: "profile: 512x Altix timer interrupt livelock fix" in the BK import archive at [1]) So in order to head off future syzbot reports, let's try to simplify this code and get rid of the per-cpu profile buffers that are quite a large portion of the complexity footprint of this thing (including CPU hotplug callbacks etc). It's unlikely anybody will actually notice, or possibly, as Thomas put it: "Only people who indulge in nostalgia will notice :)". That said, if it turns out that this code is actually actively used by somebody, we can always revert this removal. Thus the "attempt" in the summary line. [ Note: in a small nod to "the profiling code can cause NUMA problems", this also removes the "increment the last entry in the profiling array on any unknown hits" logic. That would account any program counter in a module to that single counter location, and might exacerbate any NUMA cacheline bouncing issues ] Link: https://lore.kernel.org/all/CAHk-=wgs52BxT4Zjmjz8aNvHWKxf5_ThBY4bYL1Y6CTaNL2dTw@mail.gmail.com/ Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git [1] Cc: Thomas Gleixner Cc: Tetsuo Handa Signed-off-by: Linus Torvalds --- include/linux/cpuhotplug.h | 1 - kernel/profile.c | 183 +-------------------------------------------- 2 files changed, 2 insertions(+), 182 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 51ba681b915a..affdd890899e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -100,7 +100,6 @@ enum cpuhp_state { CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, - CPUHP_PROFILE_PREPARE, CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, diff --git a/kernel/profile.c b/kernel/profile.c index 4654c6cd984e..2dfb0f4e755c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -129,180 +129,13 @@ int __ref profile_init(void) return -ENOMEM; } -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -/* - * Each cpu has a pair of open-addressed hashtables for pending - * profile hits. read_profile() IPI's all cpus to request them - * to flip buffers and flushes their contents to prof_buffer itself. - * Flip requests are serialized by the profile_flip_mutex. The sole - * use of having a second hashtable is for avoiding cacheline - * contention that would otherwise happen during flushes of pending - * profile hits required for the accuracy of reported profile hits - * and so resurrect the interrupt livelock issue. - * - * The open-addressed hashtables are indexed by profile buffer slot - * and hold the number of pending hits to that profile buffer slot on - * a cpu in an entry. When the hashtable overflows, all pending hits - * are accounted to their corresponding profile buffer slots with - * atomic_add() and the hashtable emptied. As numerous pending hits - * may be accounted to a profile buffer slot in a hashtable entry, - * this amortizes a number of atomic profile buffer increments likely - * to be far larger than the number of entries in the hashtable, - * particularly given that the number of distinct profile buffer - * positions to which hits are accounted during short intervals (e.g. - * several seconds) is usually very small. 
Exclusion from buffer - * flipping is provided by interrupt disablement (note that for - * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from - * process context). - * The hash function is meant to be lightweight as opposed to strong, - * and was vaguely inspired by ppc64 firmware-supported inverted - * pagetable hash functions, but uses a full hashtable full of finite - * collision chains, not just pairs of them. - * - * -- nyc - */ -static void __profile_flip_buffers(void *unused) -{ - int cpu = smp_processor_id(); - - per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); -} - -static void profile_flip_buffers(void) -{ - int i, j, cpu; - - mutex_lock(&profile_flip_mutex); - j = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; - for (i = 0; i < NR_PROFILE_HIT; ++i) { - if (!hits[i].hits) { - if (hits[i].pc) - hits[i].pc = 0; - continue; - } - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].hits = hits[i].pc = 0; - } - } - mutex_unlock(&profile_flip_mutex); -} - -static void profile_discard_flip_buffers(void) -{ - int i, cpu; - - mutex_lock(&profile_flip_mutex); - i = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; - memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); - } - mutex_unlock(&profile_flip_mutex); -} - -static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) -{ - unsigned long primary, secondary, flags, pc = (unsigned long)__pc; - int i, j, cpu; - struct profile_hit *hits; - - pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); - i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - cpu = get_cpu(); - hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; - if (!hits) { - put_cpu(); - return; - } - /* - * We buffer the global profiler buffer into a per-CPU - * queue and thus reduce the number of global (and possibly - * NUMA-alien) accesses. 
The write-queue is self-coalescing: - */ - local_irq_save(flags); - do { - for (j = 0; j < PROFILE_GRPSZ; ++j) { - if (hits[i + j].pc == pc) { - hits[i + j].hits += nr_hits; - goto out; - } else if (!hits[i + j].hits) { - hits[i + j].pc = pc; - hits[i + j].hits = nr_hits; - goto out; - } - } - i = (i + secondary) & (NR_PROFILE_HIT - 1); - } while (i != primary); - - /* - * Add the current hit(s) and flush the write-queue out - * to the global buffer: - */ - atomic_add(nr_hits, &prof_buffer[pc]); - for (i = 0; i < NR_PROFILE_HIT; ++i) { - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].pc = hits[i].hits = 0; - } -out: - local_irq_restore(flags); - put_cpu(); -} - -static int profile_dead_cpu(unsigned int cpu) -{ - struct page *page; - int i; - - for (i = 0; i < 2; i++) { - if (per_cpu(cpu_profile_hits, cpu)[i]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); - per_cpu(cpu_profile_hits, cpu)[i] = NULL; - __free_page(page); - } - } - return 0; -} - -static int profile_prepare_cpu(unsigned int cpu) -{ - int i, node = cpu_to_mem(cpu); - struct page *page; - - per_cpu(cpu_profile_flip, cpu) = 0; - - for (i = 0; i < 2; i++) { - if (per_cpu(cpu_profile_hits, cpu)[i]) - continue; - - page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); - if (!page) { - profile_dead_cpu(cpu); - return -ENOMEM; - } - per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); - - } - return 0; -} - -#else /* !CONFIG_SMP */ -#define profile_flip_buffers() do { } while (0) -#define profile_discard_flip_buffers() do { } while (0) - static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; - atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); + if (pc < prof_len) + atomic_add(nr_hits, &prof_buffer[pc]); } -#endif /* !CONFIG_SMP */ void profile_hits(int type, void *__pc, unsigned int nr_hits) { @@ -340,7 +173,6 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) char *pnt; unsigned long sample_step = 1UL << prof_shift; - profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int)) return 0; if (count > (prof_len+1)*sizeof(unsigned int) - p) @@ -386,7 +218,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf, return -EINVAL; } #endif - profile_discard_flip_buffers(); memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); return count; } @@ -404,20 +235,10 @@ int __ref create_proc_profile(void) if (!prof_on) return 0; -#ifdef CONFIG_SMP - err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", - profile_prepare_cpu, profile_dead_cpu); - if (err) - return err; -#endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &profile_proc_ops); if (entry) proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); -#ifdef CONFIG_SMP - else - cpuhp_remove_state(CPUHP_PROFILE_PREPARE); -#endif return err; } subsys_initcall(create_proc_profile); -- cgit From cec6937dd1aae1b38d147bd190cb895d06cf96d0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Jul 2024 12:05:06 -0700 Subject: task_work: make TWA_NMI_CURRENT handling conditional on IRQ_WORK The TWA_NMI_CURRENT handling very much depends on IRQ_WORK, but that isn't universally enabled everywhere. Maybe the IRQ_WORK infrastructure should just be unconditional - x86 ends up indirectly enabling it through unconditionally enabling PERF_EVENTS, for example. But it also gets enabled by having SMP support, or even if you just have PRINTK enabled. 
But in the meantime TWA_NMI_CURRENT causes tons of build failures on various odd minimal configs. Which did show up in linux-next, but despite that nobody bothered to fix it or even inform me until -rc1 was out. Fixes: 466e4d801cd4 ("task_work: Add TWA_NMI_CURRENT as an additional notify mode") Reported-by: Naresh Kamboju Reported-by: kernelci.org bot Reported-by: Guenter Roeck Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Signed-off-by: Linus Torvalds --- kernel/task_work.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/task_work.c b/kernel/task_work.c index 5c2daa7ad3f9..5d14d639ac71 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -6,12 +6,14 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +#ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); +#endif /** * task_work_add - ask the @task to execute @work->func() @@ -57,6 +59,8 @@ int task_work_add(struct task_struct *task, struct callback_head *work, if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; + if (!IS_ENABLED(CONFIG_IRQ_WORK)) + return -EINVAL; } else { /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); @@ -81,9 +85,11 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; +#ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; +#endif default: WARN_ON_ONCE(1); break; -- cgit From 41c24102af7b6236277a214428b203d51a3462df Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2024 14:40:29 -0700 Subject: selftests/bpf: Filter out _GNU_SOURCE when compiling test_cpp Jakub reports build failures when merging linux/master with net tree: CXX test_cpp In file included from :454: :2:9: error: '_GNU_SOURCE' macro redefined [-Werror,-Wmacro-redefined] 2 | #define _GNU_SOURCE | ^ :445:9: note: previous definition is here 445 | #define _GNU_SOURCE 1 The culprit is commit cc937dad85ae ("selftests: centralize -D_GNU_SOURCE= to CFLAGS in lib.mk") which unconditionally added -D_GNU_SOUCE to CLFAGS. Apparently clang++ also unconditionally adds it for the C++ targets [0] which causes a conflict. Add small change in the selftests makefile to filter it out for test_cpp. Not sure which tree it should go via, targeting bpf for now, but net might be better? 0: https://stackoverflow.com/questions/11670581/why-is-gnu-source-defined-by-default-and-how-to-turn-it-off Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240725214029.1760809-1-sdf@fomichev.me --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dd49c1d23a60..81d4757ecd4c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -713,7 +713,7 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp # Make sure we are able to include and link libbpf against c++. 
$(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg,CXX,,$@) - $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ # Benchmark runner $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) -- cgit From 94ede2a3e9135764736221c080ac7c0ad993dc2d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 Jul 2024 16:34:17 -0700 Subject: profiling: remove stale percpu flip buffer variables For some reason I didn't see this issue on my arm64 or x86-64 builds, but Stephen Rothwell reports that commit 2accfdb7eff6 ("profiling: attempt to remove per-cpu profile flip buffer") left these static variables around, and the powerpc build is unhappy about them: kernel/profile.c:52:28: warning: 'cpu_profile_flip' defined but not used [-Wunused-variable] 52 | static DEFINE_PER_CPU(int, cpu_profile_flip); | ^~~~~~~~~~~~~~~~ .. So remove these stale left-over remnants too. Fixes: 2accfdb7eff6 ("profiling: attempt to remove per-cpu profile flip buffer") Reported-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- kernel/profile.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/profile.c b/kernel/profile.c index 2dfb0f4e755c..ff68d3816182 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -47,12 +47,6 @@ static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); -static DEFINE_PER_CPU(int, cpu_profile_flip); -static DEFINE_MUTEX(profile_flip_mutex); -#endif /* CONFIG_SMP */ - int profile_setup(char *str) { static const char schedstr[] = "schedule"; -- cgit From b6a66e521a2032f7fcba2af5a9bcbaeaa19b7ca3 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:23 +0200 Subject: mptcp: sched: check both directions for backup The 'mptcp_subflow_context' structure has two items related to the backup flags: - 'backup': the subflow has been marked as backup by the other peer - 'request_bkup': the backup flag has been set by the host Before this patch, the scheduler was only looking at the 'backup' flag. That can make sense in some cases, but it looks like that's not what we wanted for the general use, because either the path-manager was setting both of them when sending an MP_PRIO, or the receiver was duplicating the 'backup' flag in the subflow request. Note that the use of these two flags in the path-manager are going to be fixed in the next commits, but this change here is needed not to modify the behaviour. 
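The core of the scheduler change, as a minimal sketch (the actual hunks in mptcp_subflow_get_send() follow below): a subflow counts as backup when either direction flags it.

/* 'backup' is what the peer signalled (received MP_JOIN/MP_PRIO),
 * 'request_bkup' is what this host asked for; the scheduler must honour
 * both when bucketing subflows into send_info[]. */
static bool sched_subflow_is_backup(const struct mptcp_subflow_context *subflow)
{
	return subflow->backup || subflow->request_bkup;
}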
Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- include/trace/events/mptcp.h | 2 +- net/mptcp/protocol.c | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h index 09e72215b9f9..085b749cdd97 100644 --- a/include/trace/events/mptcp.h +++ b/include/trace/events/mptcp.h @@ -34,7 +34,7 @@ TRACE_EVENT(mptcp_subflow_get_send, struct sock *ssk; __entry->active = mptcp_subflow_active(subflow); - __entry->backup = subflow->backup; + __entry->backup = subflow->backup || subflow->request_bkup; if (subflow->tcp_sock && sk_fullsock(subflow->tcp_sock)) __entry->free = sk_stream_memory_free(subflow->tcp_sock); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a26c2c840fd9..a2fc54ed68c0 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1422,13 +1422,15 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } mptcp_for_each_subflow(msk, subflow) { + bool backup = subflow->backup || subflow->request_bkup; + trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue; tout = max(tout, mptcp_timeout_from_subflow(subflow)); - nr_active += !subflow->backup; + nr_active += !backup; pace = subflow->avg_pacing_rate; if (unlikely(!pace)) { /* init pacing rate from socket */ @@ -1439,9 +1441,9 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); - if (linger_time < send_info[subflow->backup].linger_time) { - send_info[subflow->backup].ssk = ssk; - send_info[subflow->backup].linger_time = linger_time; + if (linger_time < send_info[backup].linger_time) { + send_info[backup].ssk = ssk; + send_info[backup].linger_time = linger_time; } } __mptcp_set_timeout(sk, tout); -- cgit From efd340bf3d7779a3a8ec954d8ec0fb8a10f24982 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:24 +0200 Subject: mptcp: distinguish rcv vs sent backup flag in requests When sending an MP_JOIN + SYN + ACK, it is possible to mark the subflow as 'backup' by setting the flag with the same name. Before this patch, the backup was set if the other peer set it in its MP_JOIN + SYN request. It is not correct: the backup flag should be set in the MPJ+SYN+ACK only if the host asks for it, and not mirroring what was done by the other peer. It is then required to have a dedicated bit for each direction, similar to what is done in the subflow context. 
Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/options.c | 2 +- net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8e8dcfbc2993..8a68382a4fe9 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -909,7 +909,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, return true; } else if (subflow_req->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; - opts->backup = subflow_req->backup; + opts->backup = subflow_req->request_bkup; opts->join_id = subflow_req->local_id; opts->thmac = subflow_req->thmac; opts->nonce = subflow_req->local_nonce; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b11a4e50d52b..b8b25124e7de 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -448,6 +448,7 @@ struct mptcp_subflow_request_sock { u16 mp_capable : 1, mp_join : 1, backup : 1, + request_bkup : 1, csum_reqd : 1, allow_join_id0 : 1; u8 local_id; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 39e2cbdf3801..a3778aee4e77 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -2005,6 +2005,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->fully_established = 1; new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; + new_ctx->request_bkup = subflow_req->request_bkup; WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id); new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; -- cgit From 4258b94831bb7ff28ab80e3c8d94db37db930728 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:25 +0200 Subject: mptcp: pm: only set request_bkup flag when sending MP_PRIO The 'backup' flag from mptcp_subflow_context structure is supposed to be set only when the other peer flagged a subflow as backup, not the opposite. Fixes: 067065422fcd ("mptcp: add the outgoing MP_PRIO support") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm_netlink.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index f65831de5c1a..7635fac91539 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -471,7 +471,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con slow = lock_sock_fast(ssk); if (prio) { subflow->send_mp_prio = 1; - subflow->backup = backup; subflow->request_bkup = backup; } -- cgit From 4dde0d72ccec500c60c798e036b852e013d6e124 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:26 +0200 Subject: mptcp: mib: count MPJ with backup flag Without such counters, it is difficult to easily debug issues with MPJ not having the backup flags on production servers. This is not strictly a fix, but it eases to validate the following patches without requiring to take packet traces, to query ongoing connections with Netlink with admin permissions, or to guess by looking at the behaviour of the packet scheduler. Also, the modification is self contained, isolated, well controlled, and the increments are done just after others, there from the beginning. It looks then safe, and helpful to backport this. 
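One way to read the new counters on a live system, assuming the usual /proc/net/netstat layout where MPTCP SNMP counters appear on "MPTcpExt:" lines (nstat reports the same values); a small userspace helper, not part of this series:

#include <stdio.h>
#include <string.h>

/* Print the MPTcpExt header and value lines so counters such as
 * MPJoinSynBackupRx and MPJoinSynAckBackupRx can be checked without
 * packet traces or Netlink queries. */
int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f) {
		perror("/proc/net/netstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "MPTcpExt:", strlen("MPTcpExt:")))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}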
Fixes: 4596a2c1b7f5 ("mptcp: allow creating non-backup subflows") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/mib.c | 2 ++ net/mptcp/mib.h | 2 ++ net/mptcp/subflow.c | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index c30405e76833..7884217f33eb 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -19,7 +19,9 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), + SNMP_MIB_ITEM("MPJoinSynBackupRx", MPTCP_MIB_JOINSYNBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), + SNMP_MIB_ITEM("MPJoinSynAckBackupRx", MPTCP_MIB_JOINSYNACKBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 2704afd0dfe4..66aa67f49d03 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -14,7 +14,9 @@ enum linux_mptcp_mib_field { MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */ + MPTCP_MIB_JOINSYNBACKUPRX, /* Received a SYN + MP_JOIN + backup flag */ MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */ + MPTCP_MIB_JOINSYNACKBACKUPRX, /* Received a SYN/ACK + MP_JOIN + backup flag */ MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a3778aee4e77..be406197b1c4 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -168,6 +168,9 @@ static int subflow_check_req(struct request_sock *req, return 0; } else if (opt_mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); + + if (mp_opt.backup) + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX); } if (opt_mp_capable && listener->request_mptcp) { @@ -577,6 +580,9 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); + if (subflow->backup) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX); + if (subflow_use_different_dport(msk, sk)) { pr_debug("synack inet_dport=%d %d", ntohs(inet_sk(sk)->inet_dport), -- cgit From 935ff5bb8a1cfcdf8e60c8f5c794d0bbbc234437 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:27 +0200 Subject: selftests: mptcp: join: validate backup in MPJ A peer can notify the other one that a subflow has to be treated as "backup" by two different ways: either by sending a dedicated MP_PRIO notification, or by setting the backup flag in the MP_JOIN handshake. The selftests were previously monitoring the former, but not the latter. This is what is now done here by looking at these new MIB counters when validating the 'backup' cases: MPTcpExtMPJoinSynBackupRx MPTcpExtMPJoinSynAckBackupRx The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it will help to validate a new fix for an issue introduced by this commit ID. 
Fixes: 4596a2c1b7f5 ("mptcp: allow creating non-backup subflows") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 42 +++++++++++++++++++------ 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 55d84a1bde15..e6c8d86017f3 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1634,6 +1634,8 @@ chk_prio_nr() { local mp_prio_nr_tx=$1 local mp_prio_nr_rx=$2 + local mpj_syn=$3 + local mpj_syn_ack=$4 local count print_check "ptx" @@ -1655,6 +1657,26 @@ chk_prio_nr() else print_ok fi + + print_check "syn backup" + count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinSynBackupRx") + if [ -z "$count" ]; then + print_skip + elif [ "$count" != "$mpj_syn" ]; then + fail_test "got $count JOIN[s] syn with Backup expected $mpj_syn" + else + print_ok + fi + + print_check "synack backup" + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckBackupRx") + if [ -z "$count" ]; then + print_skip + elif [ "$count" != "$mpj_syn_ack" ]; then + fail_test "got $count JOIN[s] synack with Backup expected $mpj_syn_ack" + else + print_ok + fi } chk_subflow_nr() @@ -2612,7 +2634,7 @@ backup_tests() sflags=nobackup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 - chk_prio_nr 0 1 + chk_prio_nr 0 1 1 0 fi # single address, backup @@ -2625,7 +2647,7 @@ backup_tests() run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 1 + chk_prio_nr 1 1 0 0 fi # single address with port, backup @@ -2638,7 +2660,7 @@ backup_tests() run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 1 + chk_prio_nr 1 1 0 0 fi if reset "mpc backup" && @@ -2647,7 +2669,7 @@ backup_tests() speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_prio_nr 0 1 + chk_prio_nr 0 1 0 0 fi if reset "mpc backup both sides" && @@ -2657,7 +2679,7 @@ backup_tests() speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_prio_nr 1 1 + chk_prio_nr 1 1 0 0 fi if reset "mpc switch to backup" && @@ -2666,7 +2688,7 @@ backup_tests() sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_prio_nr 0 1 + chk_prio_nr 0 1 0 0 fi if reset "mpc switch to backup both sides" && @@ -2676,7 +2698,7 @@ backup_tests() sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_prio_nr 1 1 + chk_prio_nr 1 1 0 0 fi } @@ -3053,7 +3075,7 @@ fullmesh_tests() addr_nr_ns2=1 sflags=backup,fullmesh speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 - chk_prio_nr 0 1 + chk_prio_nr 0 1 1 0 chk_rm_nr 0 1 fi @@ -3066,7 +3088,7 @@ fullmesh_tests() sflags=nobackup,nofullmesh speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 - chk_prio_nr 0 1 + chk_prio_nr 0 1 1 0 chk_rm_nr 0 1 fi } @@ -3318,7 +3340,7 @@ userspace_tests() sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 0 - chk_prio_nr 0 0 + chk_prio_nr 0 0 0 0 fi # userspace pm type prevents rm_addr -- cgit From 6834097fc38c5416701c793da94558cea49c0a1f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:28 +0200 Subject: mptcp: pm: fix backup support in signal endpoints There was a support for signal endpoints, but only when the endpoint's flag was changed during a connection. 
If an endpoint with the signal and backup was already present, the MP_JOIN reply was not containing the backup flag as expected. That's confusing to have this inconsistent behaviour. On the other hand, the infrastructure to set the backup flag in the SYN + ACK + MP_JOIN was already there, it was just never set before. Now when requesting the local ID from the path-manager, the backup status is also requested. Note that when the userspace PM is used, the backup flag can be set if the local address was already used before with a backup flag, e.g. if the address was announced with the 'backup' flag, or a subflow was created with the 'backup' flag. Fixes: 4596a2c1b7f5 ("mptcp: allow creating non-backup subflows") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/507 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/pm.c | 12 ++++++++++++ net/mptcp/pm_netlink.c | 18 ++++++++++++++++++ net/mptcp/pm_userspace.c | 18 ++++++++++++++++++ net/mptcp/protocol.h | 3 +++ net/mptcp/subflow.c | 3 +++ 5 files changed, 54 insertions(+) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 55406720c607..23bb89c94e90 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -426,6 +426,18 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_get_local_id(msk, &skc_local); } +bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc) +{ + struct mptcp_addr_info skc_local; + + mptcp_local_address((struct sock_common *)skc, &skc_local); + + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_is_backup(msk, &skc_local); + + return mptcp_pm_nl_is_backup(msk, &skc_local); +} + int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7635fac91539..37954a0b087d 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1101,6 +1101,24 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc return ret; } +bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + struct mptcp_pm_addr_entry *entry; + bool backup = false; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) { + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + break; + } + } + rcu_read_unlock(); + + return backup; +} + #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index f0a4590506c6..8eaa9fbe3e34 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -165,6 +165,24 @@ int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry, true); } +bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, + struct mptcp_addr_info *skc) +{ + struct mptcp_pm_addr_entry *entry; + bool backup = false; + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (mptcp_addresses_equal(&entry->addr, skc, false)) { + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + break; + } + } + spin_unlock_bh(&msk->pm.lock); + + return backup; +} + int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *token = 
info->attrs[MPTCP_PM_ATTR_TOKEN]; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b8b25124e7de..60c6b073d65f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1109,6 +1109,9 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc); +bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index be406197b1c4..0e4b5bfbeaa1 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -100,6 +100,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) return NULL; } subflow_req->local_id = local_id; + subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req); return msk; } @@ -620,6 +621,8 @@ static int subflow_chk_local_id(struct sock *sk) return err; subflow_set_local_id(subflow, err); + subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk); + return 0; } -- cgit From f833470c27832136d4416d8fc55d658082af0989 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 27 Jul 2024 12:01:29 +0200 Subject: selftests: mptcp: join: check backup support in signal endp Before the previous commit, 'signal' endpoints with the 'backup' flag were ignored when sending the MP_JOIN. The MPTCP Join selftest has then been modified to validate this case: the "single address, backup" test, is now validating the MP_JOIN with a backup flag as it is what we expect it to do with such name. The previous version has been kept, but renamed to "single address, switch to backup" to avoid confusions. The "single address with port, backup" test is also now validating the MPJ with a backup flag, which makes more sense than checking the switch to backup with an MP_PRIO. The "mpc backup both sides" test is now validating that the backup flag is also set in MP_JOIN from and to the addresses used in the initial subflow, using the special ID 0. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. 
Fixes: 4596a2c1b7f5 ("mptcp: allow creating non-backup subflows") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 34 ++++++++++++++++++++----- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index e6c8d86017f3..4df48f1f14ab 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2639,6 +2639,19 @@ backup_tests() # single address, backup if reset "single address, backup" && + continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + pm_nl_set_limits $ns1 0 1 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup + pm_nl_set_limits $ns2 1 1 + sflags=nobackup speed=slow \ + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 1 1 1 + chk_add_nr 1 1 + chk_prio_nr 1 0 0 1 + fi + + # single address, switch to backup + if reset "single address, switch to backup" && continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then pm_nl_set_limits $ns1 0 1 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal @@ -2654,13 +2667,13 @@ backup_tests() if reset "single address with port, backup" && continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then pm_nl_set_limits $ns1 0 1 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup port 10100 pm_nl_set_limits $ns2 1 1 - sflags=backup speed=slow \ + sflags=nobackup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 1 0 0 + chk_prio_nr 1 0 0 1 fi if reset "mpc backup" && @@ -2674,12 +2687,21 @@ backup_tests() if reset "mpc backup both sides" && continue_if mptcp_lib_kallsyms_doesnt_have "T mptcp_subflow_send_ack$"; then - pm_nl_add_endpoint $ns1 10.0.1.1 flags subflow,backup + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 1 2 + pm_nl_add_endpoint $ns1 10.0.1.1 flags signal,backup pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,backup + + # 10.0.2.2 (non-backup) -> 10.0.1.1 (backup) + pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow + # 10.0.1.2 (backup) -> 10.0.2.1 (non-backup) + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + ip -net "$ns2" route add 10.0.2.1 via 10.0.1.1 dev ns2eth1 # force this path + speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - chk_prio_nr 1 1 0 0 + chk_join_nr 2 2 2 + chk_prio_nr 1 1 1 1 fi if reset "mpc switch to backup" && -- cgit From 2fe5273f149cc882c371f9954b5fdbd1bd8c9b5c Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 29 Jul 2024 11:40:15 +0800 Subject: net/smc: prevent UAF in inet_create() Following syzbot repro crashes the kernel: socketpair(0x2, 0x1, 0x100, &(0x7f0000000140)) (fail_nth: 13) Fix this by not calling sk_common_release() from smc_create_clcsk(). Stack trace: socket: no more sockets ------------[ cut here ]------------ refcount_t: underflow; use-after-free. 
WARNING: CPU: 1 PID: 5092 at lib/refcount.c:28 refcount_warn_saturate+0x15a/0x1d0 lib/refcount.c:28 Modules linked in: CPU: 1 PID: 5092 Comm: syz-executor424 Not tainted 6.10.0-syzkaller-04483-g0be9ae5486cd #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/27/2024 RIP: 0010:refcount_warn_saturate+0x15a/0x1d0 lib/refcount.c:28 Code: 80 f3 1f 8c e8 e7 69 a8 fc 90 0f 0b 90 90 eb 99 e8 cb 4f e6 fc c6 05 8a 8d e8 0a 01 90 48 c7 c7 e0 f3 1f 8c e8 c7 69 a8 fc 90 <0f> 0b 90 90 e9 76 ff ff ff e8 a8 4f e6 fc c6 05 64 8d e8 0a 01 90 RSP: 0018:ffffc900034cfcf0 EFLAGS: 00010246 RAX: 3b9fcde1c862f700 RBX: ffff888022918b80 RCX: ffff88807b39bc00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000003 R08: ffffffff815878a2 R09: fffffbfff1c39d94 R10: dffffc0000000000 R11: fffffbfff1c39d94 R12: 00000000ffffffe9 R13: 1ffff11004523165 R14: ffff888022918b28 R15: ffff888022918b00 FS: 00005555870e7380(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000140 CR3: 000000007582e000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: inet_create+0xbaf/0xe70 __sock_create+0x490/0x920 net/socket.c:1571 sock_create net/socket.c:1622 [inline] __sys_socketpair+0x2ca/0x720 net/socket.c:1769 __do_sys_socketpair net/socket.c:1822 [inline] __se_sys_socketpair net/socket.c:1819 [inline] __x64_sys_socketpair+0x9b/0xb0 net/socket.c:1819 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fbcb9259669 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 a1 1a 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fffe931c6d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000035 RAX: ffffffffffffffda RBX: 00007fffe931c6f0 RCX: 00007fbcb9259669 RDX: 0000000000000100 RSI: 0000000000000001 RDI: 0000000000000002 RBP: 0000000000000002 R08: 00007fffe931c476 R09: 00000000000000a0 R10: 0000000020000140 R11: 0000000000000246 R12: 00007fffe931c6ec R13: 431bde82d7b634db R14: 0000000000000001 R15: 0000000000000001 Link: https://lore.kernel.org/r/20240723175809.537291-1-edumazet@google.com/ Fixes: d25a92ccae6b ("net/smc: Introduce IPPROTO_SMC") Reported-by: syzbot Signed-off-by: D. 
Wythe Reviewed-by: Eric Dumazet Reviewed-by: Wenjia Zhang Link: https://patch.msgid.link/1722224415-30999-1-git-send-email-alibuda@linux.alibaba.com Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 73a875573e7a..8e3093938cd2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3319,10 +3319,8 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family) rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &smc->clcsock); - if (rc) { - sk_common_release(sk); + if (rc) return rc; - } /* smc_clcsock_release() does not wait smc->clcsock->sk's * destruction; its sk_state might not be TCP_CLOSE after @@ -3368,6 +3366,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->clcsock = clcsock; else rc = smc_create_clcsk(net, sk, family); + + if (rc) + sk_common_release(sk); out: return rc; } -- cgit From df615907f1bf907260af01ccb904d0e9304b5278 Mon Sep 17 00:00:00 2001 From: Patryk Duda Date: Tue, 30 Jul 2024 10:44:25 +0000 Subject: platform/chrome: cros_ec_proto: Lock device when updating MKBP version The cros_ec_get_host_command_version_mask() function requires that the caller must have ec_dev->lock mutex before calling it. This requirement was not met and as a result it was possible that two commands were sent to the device at the same time. The problem was observed while using UART backend which doesn't use any additional locks, unlike SPI backend which locks the controller until response is received. Fixes: f74c7557ed0d ("platform/chrome: cros_ec_proto: Update version on GET_NEXT_EVENT failure") Cc: stable@vger.kernel.org Signed-off-by: Patryk Duda Link: https://lore.kernel.org/r/20240730104425.607083-1-patrykd@google.com Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec_proto.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index f776fd42244f..73f75958e15c 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -813,9 +813,11 @@ int cros_ec_get_next_event(struct cros_ec_device *ec_dev, if (ret == -ENOPROTOOPT) { dev_dbg(ec_dev->dev, "GET_NEXT_EVENT returned invalid version error.\n"); + mutex_lock(&ec_dev->lock); ret = cros_ec_get_host_command_version_mask(ec_dev, EC_CMD_GET_NEXT_EVENT, &ver_mask); + mutex_unlock(&ec_dev->lock); if (ret < 0 || ver_mask == 0) /* * Do not change the MKBP supported version if we can't -- cgit From f558120cd709682b739207b48cf7479fd9568431 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Mon, 29 Jul 2024 14:28:16 +0200 Subject: net/iucv: fix use after free in iucv_sock_close() iucv_sever_path() is called from process context and from bh context. iucv->path is used as indicator whether somebody else is taking care of severing the path (or it is already removed / never existed). This needs to be done with atomic compare and swap, otherwise there is a small window where iucv_sock_close() will try to work with a path that has already been severed and freed by iucv_callback_connrej() called by iucv_tasklet_fn(). 
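A minimal sketch of the difference (field and helper names taken from the patch below, the teardown body elided); the crash example that follows is what the racy variant can produce:

    /* racy: both process context and the tasklet can observe a
     * non-NULL path and both proceed to sever and free it
     */
    if (iucv->path) {
        iucv->path = NULL;
        /* sever and free the path */
    }

    /* atomic: xchg() clears the pointer and returns the old value in
     * one step, so exactly one context wins and does the teardown
     */
    if (xchg(&iucv->path, NULL)) {
        /* sever and free the path */
    }
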
Example: [452744.123844] Call Trace: [452744.123845] ([<0000001e87f03880>] 0x1e87f03880) [452744.123966] [<00000000d593001e>] iucv_path_sever+0x96/0x138 [452744.124330] [<000003ff801ddbca>] iucv_sever_path+0xc2/0xd0 [af_iucv] [452744.124336] [<000003ff801e01b6>] iucv_sock_close+0xa6/0x310 [af_iucv] [452744.124341] [<000003ff801e08cc>] iucv_sock_release+0x3c/0xd0 [af_iucv] [452744.124345] [<00000000d574794e>] __sock_release+0x5e/0xe8 [452744.124815] [<00000000d5747a0c>] sock_close+0x34/0x48 [452744.124820] [<00000000d5421642>] __fput+0xba/0x268 [452744.124826] [<00000000d51b382c>] task_work_run+0xbc/0xf0 [452744.124832] [<00000000d5145710>] do_notify_resume+0x88/0x90 [452744.124841] [<00000000d5978096>] system_call+0xe2/0x2c8 [452744.125319] Last Breaking-Event-Address: [452744.125321] [<00000000d5930018>] iucv_path_sever+0x90/0x138 [452744.125324] [452744.125325] Kernel panic - not syncing: Fatal exception in interrupt Note that bh_lock_sock() is not serializing the tasklet context against process context, because the check for sock_owned_by_user() and corresponding handling is missing. Ideas for a future clean-up patch: A) Correct usage of bh_lock_sock() in tasklet context, as described in Link: https://lore.kernel.org/netdev/1280155406.2899.407.camel@edumazet-laptop/ Re-enqueue, if needed. This may require adding return values to the tasklet functions and thus changes to all users of iucv. B) Change iucv tasklet into worker and use only lock_sock() in af_iucv. Fixes: 7d316b945352 ("af_iucv: remove IUCV-pathes completely") Reviewed-by: Halil Pasic Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20240729122818.947756-1-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- net/iucv/af_iucv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index c3b0b610b0aa..c00323fa9eb6 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -335,8 +335,8 @@ static void iucv_sever_path(struct sock *sk, int with_user_data) struct iucv_sock *iucv = iucv_sk(sk); struct iucv_path *path = iucv->path; - if (iucv->path) { - iucv->path = NULL; + /* Whoever resets the path pointer, must sever and free it. */ + if (xchg(&iucv->path, NULL)) { if (with_user_data) { low_nmcpy(user_data, iucv->src_name); high_nmcpy(user_data, iucv->dst_name); -- cgit From b8e947e9f64cac9df85a07672b658df5b2bcff07 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 29 Jul 2024 21:59:24 +0200 Subject: btrfs: initialize location to fix -Wmaybe-uninitialized in btrfs_lookup_dentry() Some arch + compiler combinations report a potentially unused variable location in btrfs_lookup_dentry(). This is a false alert as the variable is passed by value and always valid or there's an error. The compilers cannot probably reason about that although btrfs_inode_by_name() is in the same file. > + /kisskb/src/fs/btrfs/inode.c: error: 'location.objectid' may be used +uninitialized in this function [-Werror=maybe-uninitialized]: => 5603:9 > + /kisskb/src/fs/btrfs/inode.c: error: 'location.type' may be used +uninitialized in this function [-Werror=maybe-uninitialized]: => 5674:5 m68k-gcc8/m68k-allmodconfig mips-gcc8/mips-allmodconfig powerpc-gcc5/powerpc-all{mod,yes}config powerpc-gcc5/ppc64_defconfig Initialize it to zero, this should fix the warnings and won't change the behaviour as btrfs_inode_by_name() accepts only a root or inode item types, otherwise returns an error. 
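A generic illustration of the warning and of why the zeroed initializer is safe (hypothetical struct and helper, not the actual btrfs code): the callee fills the struct only on success, older compilers cannot prove that the later read is guarded by the error check, and the '= { 0 }' initializer silences them without changing behaviour:

    struct key { unsigned long long objectid; unsigned char type; };

    extern int lookup_key(struct key *out);  /* fills *out only when it returns 0 */

    static int use_key(void)
    {
        struct key location = { 0 };  /* was: struct key location; */
        int ret = lookup_key(&location);

        if (ret < 0)
            return ret;           /* 'location' is never read on this path */

        return location.type;     /* some compilers warn here without = { 0 } */
    }
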
Reported-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/linux-btrfs/bd4e9928-17b3-9257-8ba7-6b7f9bbb639a@linux-m68k.org/ Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ca3878348ff..1d4e0a65494a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5664,7 +5664,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; - struct btrfs_key location; + struct btrfs_key location = { 0 }; u8 di_type = 0; int ret = 0; -- cgit From 0aa3ca956c46d849775eae1816cef8fe4bc8b50e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 24 Jul 2024 11:06:56 -0500 Subject: net: mvpp2: Don't re-use loop iterator This function has a nested loop. The problem is that both the inside and outside loop use the same variable as an iterator. I found this via static analysis so I'm not sure the impact. It could be that it loops forever or, more likely, the loop exits early. Fixes: 3a616b92a9d1 ("net: mvpp2: Add TX flow control support for jumbo frames") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Link: https://patch.msgid.link/eaa8f403-7779-4d81-973d-a9ecddc0bf6f@stanley.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 8c45ad983abc..0d62a33afa80 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -953,13 +953,13 @@ static void mvpp2_bm_pool_update_fc(struct mvpp2_port *port, static void mvpp2_bm_pool_update_priv_fc(struct mvpp2 *priv, bool en) { struct mvpp2_port *port; - int i; + int i, j; for (i = 0; i < priv->port_count; i++) { port = priv->port_list[i]; if (port->priv->percpu_pools) { - for (i = 0; i < port->nrxqs; i++) - mvpp2_bm_pool_update_fc(port, &port->priv->bm_pools[i], + for (j = 0; j < port->nrxqs; j++) + mvpp2_bm_pool_update_fc(port, &port->priv->bm_pools[j], port->tx_fc & en); } else { mvpp2_bm_pool_update_fc(port, port->pool_long, port->tx_fc & en); -- cgit From 22f5468731491e53356ba7c028f0fdea20b18e2c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 30 Jul 2024 10:36:47 -0700 Subject: minmax: improve macro expansion and type checking This clarifies the rules for min()/max()/clamp() type checking and makes them a much more efficient macro expansion. In particular, we now look at the type and range of the inputs to see whether they work together, generating a mask of acceptable comparisons, and then just verifying that the inputs have a shared case: - an expression with a signed type can be used for (1) signed comparisons (2) unsigned comparisons if it is statically known to have a non-negative value - an expression with an unsigned type can be used for (3) unsigned comparison (4) signed comparisons if the type is smaller than 'int' and thus the C integer promotion rules will make it signed anyway Here rule (1) and (3) are obvious, and rule (2) is important in order to allow obvious trivial constants to be used together with unsigned values. Rule (4) is not necessarily a good idea, but matches what we used to do, and we have extant cases of this situation in the kernel. 
Notably with bcachefs having an expression like min(bch2_bucket_sectors_dirty(a), ca->mi.bucket_size) where bch2_bucket_sectors_dirty() returns an 's64', and 'ca->mi.bucket_size' is of type 'u16'. Technically that bcachefs comparison is clearly sensible on a C type level, because the 'u16' will go through the normal C integer promotion, and become 'int', and then we're comparing two signed values and everything looks sane. However, it's not entirely clear that a 'min(s64,u16)' operation makes a lot of conceptual sense, and it's possible that we will remove rule (4). After all, the _reason_ we have these complicated type checks is exactly that the C type promotion rules are not very intuitive. But at least for now the rule is in place for backwards compatibility. Also note that rule (2) existed before, but is hugely relaxed by this commit. It used to be true only for the simplest compile-time non-negative integer constants. The new macro model will allow cases where the compiler can trivially see that an expression is non-negative even if it isn't necessarily a constant. For example, the amdgpu driver does min_t(size_t, sizeof(fru_info->serial), pia[addr] & 0x3F)); because our old 'min()' macro would see that 'pia[addr] & 0x3F' is of type 'int' and clearly not a C constant expression, so doing a 'min()' with a 'size_t' is a signedness violation. Our new 'min()' macro still sees that 'pia[addr] & 0x3F' is of type 'int', but is smart enough to also see that it is clearly non-negative, and thus would allow that case without any complaints. Cc: Arnd Bergmann Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 9 ++++++ include/linux/minmax.h | 74 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 68 insertions(+), 15 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 2594553bb30b..2df665fa2964 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -296,6 +296,15 @@ static inline void *offset_to_ptr(const int *off) #define is_signed_type(type) (((type)(-1)) < (__force type)1) #define is_unsigned_type(type) (!is_signed_type(type)) +/* + * Useful shorthand for "is this condition known at compile-time?" + * + * Note that the condition may involve non-constant values, + * but the compiler may know enough about the details of the + * values to determine that the condition is statically true. + */ +#define statically_true(x) (__builtin_constant_p(x) && (x)) + /* * This is needed in functions which generate the stack canary, see * arch/x86/kernel/smpboot.c::start_secondary() for an example. diff --git a/include/linux/minmax.h b/include/linux/minmax.h index e3e4353df983..41da6f85a407 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -26,19 +26,63 @@ #define __typecheck(x, y) \ (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) -/* is_signed_type() isn't a constexpr for pointer types */ -#define __is_signed(x) \ - __builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))), \ - is_signed_type(typeof(x)), 0) +/* + * __sign_use for integer expressions: + * bit #0 set if ok for unsigned comparisons + * bit #1 set if ok for signed comparisons + * + * In particular, statically non-negative signed integer + * expressions are ok for both. + * + * NOTE! Unsigned types smaller than 'int' are implicitly + * converted to 'int' in expressions, and are accepted for + * signed conversions for now. This is debatable. 
+ * + * Note that 'x' is the original expression, and 'ux' is + * the unique variable that contains the value. + * + * We use 'ux' for pure type checking, and 'x' for when + * we need to look at the value (but without evaluating + * it for side effects! Careful to only ever evaluate it + * with sizeof() or __builtin_constant_p() etc). + * + * Pointers end up being checked by the normal C type + * rules at the actual comparison, and these expressions + * only need to be careful to not cause warnings for + * pointer use. + */ +#define __signed_type_use(x,ux) (2+__is_nonneg(x,ux)) +#define __unsigned_type_use(x,ux) (1+2*(sizeof(ux)<4)) +#define __sign_use(x,ux) (is_signed_type(typeof(ux))? \ + __signed_type_use(x,ux):__unsigned_type_use(x,ux)) + +/* + * To avoid warnings about casting pointers to integers + * of different sizes, we need that special sign type. + * + * On 64-bit we can just always use 'long', since any + * integer or pointer type can just be cast to that. + * + * This does not work for 128-bit signed integers since + * the cast would truncate them, but we do not use s128 + * types in the kernel (we do use 'u128', but they will + * be handled by the !is_signed_type() case). + * + * NOTE! The cast is there only to avoid any warnings + * from when values that aren't signed integer types. + */ +#ifdef CONFIG_64BIT + #define __signed_type(ux) long +#else + #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux)>4,1LL,1L)) +#endif +#define __is_nonneg(x,ux) statically_true((__signed_type(ux))(x)>=0) -/* True for a non-negative signed int constant */ -#define __is_noneg_int(x) \ - (__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0) +#define __types_ok(x,y,ux,uy) \ + (__sign_use(x,ux) & __sign_use(y,uy)) -#define __types_ok(x, y, ux, uy) \ - (__is_signed(ux) == __is_signed(uy) || \ - __is_signed((ux) + 0) == __is_signed((uy) + 0) || \ - __is_noneg_int(x) || __is_noneg_int(y)) +#define __types_ok3(x,y,z,ux,uy,uz) \ + (__sign_use(x,ux) & __sign_use(y,uy) & __sign_use(z,uz)) #define __cmp_op_min < #define __cmp_op_max > @@ -53,8 +97,8 @@ #define __careful_cmp_once(op, x, y, ux, uy) ({ \ __auto_type ux = (x); __auto_type uy = (y); \ - static_assert(__types_ok(x, y, ux, uy), \ - #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ + BUILD_BUG_ON_MSG(!__types_ok(x,y,ux,uy), \ + #op"("#x", "#y") signedness error"); \ __cmp(op, ux, uy); }) #define __careful_cmp(op, x, y) \ @@ -70,8 +114,8 @@ static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ (lo) <= (hi), true), \ "clamp() low limit " #lo " greater than high limit " #hi); \ - static_assert(__types_ok(uval, lo, uval, ulo), "clamp() 'lo' signedness error"); \ - static_assert(__types_ok(uval, hi, uval, uhi), "clamp() 'hi' signedness error"); \ + BUILD_BUG_ON_MSG(!__types_ok3(val,lo,hi,uval,ulo,uhi), \ + "clamp("#val", "#lo", "#hi") signedness error"); \ __clamp(uval, ulo, uhi); }) #define __careful_clamp(val, lo, hi) \ -- cgit From 7764b9622db4382b2797b54a70f292c8da6ef417 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 26 Jul 2024 20:08:47 +0200 Subject: bpf/selftests: Fix ASSERT_OK condition check in uprobe_syscall test Fixing ASSERT_OK condition check in uprobe_syscall test, otherwise we return from test on pipe success. 
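For readers unfamiliar with the BPF selftest helpers: the ASSERT_*() macros evaluate to true when the check passes, so the unnegated form made the test return early exactly when pipe() succeeded. Before/after in isolation (same change as the diff below):

    if (ASSERT_OK(pipe(go), "pipe"))    /* wrong: true on success, test bails out early */
        return;

    if (!ASSERT_OK(pipe(go), "pipe"))   /* right: return only when pipe() failed */
        return;
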
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240726180847.684584-1-jolsa@kernel.org --- tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index bd8c75b620c2..797de47f8197 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -253,7 +253,7 @@ static void test_uretprobe_syscall_call(void) struct uprobe_syscall_executed *skel; int pid, status, err, go[2], c; - if (ASSERT_OK(pipe(go), "pipe")) + if (!ASSERT_OK(pipe(go), "pipe")) return; skel = uprobe_syscall_executed__open_and_load(); -- cgit From 84383b5ef4cd21b4a67de92afdc05a03b5247db9 Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Thu, 25 Jul 2024 12:41:25 +0530 Subject: net: phy: micrel: Fix the KSZ9131 MDI-X status issue The MDIX status is not accurately reflecting the current state after the link partner has manually altered its MDIX configuration while operating in forced mode. Access information about Auto mdix completion and pair selection from the KSZ9131's Auto/MDI/MDI-X status register Fixes: b64e6a8794d9 ("net: phy: micrel: Add PHY Auto/MDI/MDI-X set driver for KSZ9131") Signed-off-by: Raju Lakkaraju Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20240725071125.13960-1-Raju.Lakkaraju@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index dd519805deee..65b0a3115e14 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1389,6 +1389,8 @@ static int ksz9131_config_init(struct phy_device *phydev) const struct device *dev_walker; int ret; + phydev->mdix_ctrl = ETH_TP_MDI_AUTO; + dev_walker = &phydev->mdio.dev; do { of_node = dev_walker->of_node; @@ -1438,28 +1440,30 @@ static int ksz9131_config_init(struct phy_device *phydev) #define MII_KSZ9131_AUTO_MDIX 0x1C #define MII_KSZ9131_AUTO_MDI_SET BIT(7) #define MII_KSZ9131_AUTO_MDIX_SWAP_OFF BIT(6) +#define MII_KSZ9131_DIG_AXAN_STS 0x14 +#define MII_KSZ9131_DIG_AXAN_STS_LINK_DET BIT(14) +#define MII_KSZ9131_DIG_AXAN_STS_A_SELECT BIT(12) static int ksz9131_mdix_update(struct phy_device *phydev) { int ret; - ret = phy_read(phydev, MII_KSZ9131_AUTO_MDIX); - if (ret < 0) - return ret; - - if (ret & MII_KSZ9131_AUTO_MDIX_SWAP_OFF) { - if (ret & MII_KSZ9131_AUTO_MDI_SET) - phydev->mdix_ctrl = ETH_TP_MDI; - else - phydev->mdix_ctrl = ETH_TP_MDI_X; + if (phydev->mdix_ctrl != ETH_TP_MDI_AUTO) { + phydev->mdix = phydev->mdix_ctrl; } else { - phydev->mdix_ctrl = ETH_TP_MDI_AUTO; - } + ret = phy_read(phydev, MII_KSZ9131_DIG_AXAN_STS); + if (ret < 0) + return ret; - if (ret & MII_KSZ9131_AUTO_MDI_SET) - phydev->mdix = ETH_TP_MDI; - else - phydev->mdix = ETH_TP_MDI_X; + if (ret & MII_KSZ9131_DIG_AXAN_STS_LINK_DET) { + if (ret & MII_KSZ9131_DIG_AXAN_STS_A_SELECT) + phydev->mdix = ETH_TP_MDI; + else + phydev->mdix = ETH_TP_MDI_X; + } else { + phydev->mdix = ETH_TP_MDI_INVALID; + } + } return 0; } -- cgit From a7f3abcf635767b2e19c4c55c4c35756595ebc86 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 29 Jul 2024 17:03:14 +0200 Subject: net: phy: aquantia: only poll GLOBAL_CFG regs on aqr113, aqr113c and aqr115c Commit 708405f3e56e ("net: phy: aquantia: wait 
for the GLOBAL_CFG to start returning real values") introduced a workaround for an issue observed on aqr115c. However there were never any reports of it happening on other models and the workaround has been reported to cause and issue on aqr113c (and it may cause the same on any other model not supporting 10M mode). Let's limit the impact of the workaround to aqr113, aqr113c and aqr115c and poll the 100M GLOBAL_CFG register instead as both models are known to support it correctly. Reported-by: Jon Hunter Closes: https://lore.kernel.org/lkml/7c0140be-4325-4005-9068-7e0fc5ff344d@nvidia.com/ Fixes: 708405f3e56e ("net: phy: aquantia: wait for the GLOBAL_CFG to start returning real values") Tested-by: Jon Hunter Signed-off-by: Bartosz Golaszewski Reviewed-by: Antoine Tenart Link: https://patch.msgid.link/20240729150315.65798-1-brgl@bgdev.pl Signed-off-by: Jakub Kicinski --- drivers/net/phy/aquantia/aquantia_main.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/net/phy/aquantia/aquantia_main.c b/drivers/net/phy/aquantia/aquantia_main.c index d12e35374231..e982e9ce44a5 100644 --- a/drivers/net/phy/aquantia/aquantia_main.c +++ b/drivers/net/phy/aquantia/aquantia_main.c @@ -653,13 +653,7 @@ static int aqr107_fill_interface_modes(struct phy_device *phydev) unsigned long *possible = phydev->possible_interfaces; unsigned int serdes_mode, rate_adapt; phy_interface_t interface; - int i, val, ret; - - ret = phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1, - VEND1_GLOBAL_CFG_10M, val, val != 0, - 1000, 100000, false); - if (ret) - return ret; + int i, val; /* Walk the media-speed configuration registers to determine which * host-side serdes modes may be used by the PHY depending on the @@ -708,6 +702,25 @@ static int aqr107_fill_interface_modes(struct phy_device *phydev) return 0; } +static int aqr113c_fill_interface_modes(struct phy_device *phydev) +{ + int val, ret; + + /* It's been observed on some models that - when coming out of suspend + * - the FW signals that the PHY is ready but the GLOBAL_CFG registers + * continue on returning zeroes for some time. Let's poll the 100M + * register until it returns a real value as both 113c and 115c support + * this mode. + */ + ret = phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1, + VEND1_GLOBAL_CFG_100M, val, val != 0, + 1000, 100000, false); + if (ret) + return ret; + + return aqr107_fill_interface_modes(phydev); +} + static int aqr113c_config_init(struct phy_device *phydev) { int ret; @@ -725,7 +738,7 @@ static int aqr113c_config_init(struct phy_device *phydev) if (ret) return ret; - return aqr107_fill_interface_modes(phydev); + return aqr113c_fill_interface_modes(phydev); } static int aqr107_probe(struct phy_device *phydev) -- cgit From 89add40066f9ed9abe5f7f886fe5789ff7e0c50e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 29 Jul 2024 16:10:12 -0400 Subject: net: drop bad gso csum_start and offset in virtio_net_hdr Tighten csum_start and csum_offset checks in virtio_net_hdr_to_skb for GSO packets. The function already checks that a checksum requested with VIRTIO_NET_HDR_F_NEEDS_CSUM is in skb linear. But for GSO packets this might not hold for segs after segmentation. 
Syzkaller demonstrated to reach this warning in skb_checksum_help offset = skb_checksum_start_offset(skb); ret = -EINVAL; if (WARN_ON_ONCE(offset >= skb_headlen(skb))) By injecting a TSO packet: WARNING: CPU: 1 PID: 3539 at net/core/dev.c:3284 skb_checksum_help+0x3d0/0x5b0 ip_do_fragment+0x209/0x1b20 net/ipv4/ip_output.c:774 ip_finish_output_gso net/ipv4/ip_output.c:279 [inline] __ip_finish_output+0x2bd/0x4b0 net/ipv4/ip_output.c:301 iptunnel_xmit+0x50c/0x930 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x2296/0x2c70 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x759/0xa60 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4850 [inline] netdev_start_xmit include/linux/netdevice.h:4864 [inline] xmit_one net/core/dev.c:3595 [inline] dev_hard_start_xmit+0x261/0x8c0 net/core/dev.c:3611 __dev_queue_xmit+0x1b97/0x3c90 net/core/dev.c:4261 packet_snd net/packet/af_packet.c:3073 [inline] The geometry of the bad input packet at tcp_gso_segment: [ 52.003050][ T8403] skb len=12202 headroom=244 headlen=12093 tailroom=0 [ 52.003050][ T8403] mac=(168,24) mac_len=24 net=(192,52) trans=244 [ 52.003050][ T8403] shinfo(txflags=0 nr_frags=1 gso(size=1552 type=3 segs=0)) [ 52.003050][ T8403] csum(0x60000c7 start=199 offset=1536 ip_summed=3 complete_sw=0 valid=0 level=0) Mitigate with stricter input validation. csum_offset: for GSO packets, deduce the correct value from gso_type. This is already done for USO. Extend it to TSO. Let UFO be: udp[46]_ufo_fragment ignores these fields and always computes the checksum in software. csum_start: finding the real offset requires parsing to the transport header. Do not add a parser, use existing segmentation parsing. Thanks to SKB_GSO_DODGY, that also catches bad packets that are hw offloaded. Again test both TSO and USO. Do not test UFO for the above reason, and do not test UDP tunnel offload. GSO packet are almost always CHECKSUM_PARTIAL. USO packets may be CHECKSUM_NONE since commit 10154dbded6d6 ("udp: Allow GSO transmit from devices with no checksum offload"), but then still these fields are initialized correctly in udp4_hwcsum/udp6_hwcsum_outgoing. So no need to test for ip_summed == CHECKSUM_PARTIAL first. This revises an existing fix mentioned in the Fixes tag, which broke small packets with GSO offload, as detected by kselftests. 
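As a side note on the csum_offset part: for TCP and UDP the only sensible value is the offset of the checksum field inside the transport header, which is exactly what the new TSO check compares against (USO already did the equivalent). A small stand-alone user-space illustration, purely informative:

    #include <stdio.h>
    #include <stddef.h>
    #include <linux/tcp.h>
    #include <linux/udp.h>

    int main(void)
    {
        /* TSO: csum_offset must equal offsetof(struct tcphdr, check), i.e. 16 */
        printf("tcp check offset: %zu\n", offsetof(struct tcphdr, check));
        /* USO: csum_offset must equal offsetof(struct udphdr, check), i.e. 6 */
        printf("udp check offset: %zu\n", offsetof(struct udphdr, check));
        return 0;
    }
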
Link: https://syzkaller.appspot.com/bug?extid=e1db31216c789f552871 Link: https://lore.kernel.org/netdev/20240723223109.2196886-1-kuba@kernel.org Fixes: e269d79c7d35 ("net: missing check virtio") Cc: stable@vger.kernel.org Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20240729201108.1615114-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 16 +++++----------- net/ipv4/tcp_offload.c | 3 +++ net/ipv4/udp_offload.c | 4 ++++ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index d1d7825318c3..6c395a2600e8 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -56,7 +56,6 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, unsigned int thlen = 0; unsigned int p_off = 0; unsigned int ip_proto; - u64 ret, remainder, gso_size; if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { @@ -99,16 +98,6 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset); u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16)); - if (hdr->gso_size) { - gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); - ret = div64_u64_rem(skb->len, gso_size, &remainder); - if (!(ret && (hdr->gso_size > needed) && - ((remainder > needed) || (remainder == 0)))) { - return -EINVAL; - } - skb_shinfo(skb)->tx_flags |= SKBFL_SHARED_FRAG; - } - if (!pskb_may_pull(skb, needed)) return -EINVAL; @@ -182,6 +171,11 @@ retry: if (gso_type != SKB_GSO_UDP_L4) return -EINVAL; break; + case SKB_GSO_TCPV4: + case SKB_GSO_TCPV6: + if (skb->csum_offset != offsetof(struct tcphdr, check)) + return -EINVAL; + break; } /* Kernel has a special handling for GSO_BY_FRAGS. */ diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 4b791e74529e..e4ad3311e148 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -140,6 +140,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, if (thlen < sizeof(*th)) goto out; + if (unlikely(skb_checksum_start(skb) != skb_transport_header(skb))) + goto out; + if (!pskb_may_pull(skb, thlen)) goto out; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index aa2e0a28ca61..bc8a9da750fe 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -278,6 +278,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (gso_skb->len <= sizeof(*uh) + mss) return ERR_PTR(-EINVAL); + if (unlikely(skb_checksum_start(gso_skb) != + skb_transport_header(gso_skb))) + return ERR_PTR(-EINVAL); + if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh), -- cgit From 21b136cc63d2a9ddd60d4699552b69c214b32964 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 30 Jul 2024 15:44:16 -0700 Subject: minmax: fix up min3() and max3() too David Laight pointed out that we should deal with the min3() and max3() mess too, which still does excessive expansion. And our current macros are actually rather broken. In particular, the macros did this: #define min3(x, y, z) min((typeof(x))min(x, y), z) #define max3(x, y, z) max((typeof(x))max(x, y), z) and that not only is a nested expansion of possibly very complex arguments with all that involves, the typing with that "typeof()" cast is completely wrong. 
For example, imagine what happens in max3() if 'x' happens to be a 'unsigned char', but 'y' and 'z' are 'unsigned long'. The types are compatible, and there's no warning - but the result is just random garbage. No, I don't think we've ever hit that issue in practice, but since we now have sane infrastructure for doing this right, let's just use it. It fixes any excessive expansion, and also avoids these kinds of broken type issues. Requested-by: David Laight Acked-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 41da6f85a407..98008dd92153 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -152,13 +152,20 @@ #define umax(x, y) \ __careful_cmp(max, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull) +#define __careful_op3(op, x, y, z, ux, uy, uz) ({ \ + __auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\ + BUILD_BUG_ON_MSG(!__types_ok3(x,y,z,ux,uy,uz), \ + #op"3("#x", "#y", "#z") signedness error"); \ + __cmp(op, ux, __cmp(op, uy, uz)); }) + /** * min3 - return minimum of three values * @x: first value * @y: second value * @z: third value */ -#define min3(x, y, z) min((typeof(x))min(x, y), z) +#define min3(x, y, z) \ + __careful_op3(min, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_)) /** * max3 - return maximum of three values @@ -166,7 +173,8 @@ * @y: second value * @z: third value */ -#define max3(x, y, z) max((typeof(x))max(x, y), z) +#define max3(x, y, z) \ + __careful_op3(max, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_)) /** * min_not_zero - return the minimum that is _not_ zero, unless both are zero -- cgit From 5830aa863981d43560748aa93589c0695191d95d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 25 Jul 2024 12:28:20 -0700 Subject: netfilter: iptables: Fix null-ptr-deref in iptable_nat_table_init(). We had a report that iptables-restore sometimes triggered null-ptr-deref at boot time. [0] The problem is that iptable_nat_table_init() is exposed to user space before the kernel fully initialises netns. In the small race window, a user could call iptable_nat_table_init() that accesses net_generic(net, iptable_nat_net_id), which is available only after registering iptable_nat_net_ops. Let's call register_pernet_subsys() before xt_register_template(). 
[0]: bpfilter: Loaded bpfilter_umh pid 11702 Started bpfilter BUG: kernel NULL pointer dereference, address: 0000000000000013 PF: supervisor write access in kernel mode PF: error_code(0x0002) - not-present page PGD 0 P4D 0 PREEMPT SMP NOPTI CPU: 2 PID: 11879 Comm: iptables-restor Not tainted 6.1.92-99.174.amzn2023.x86_64 #1 Hardware name: Amazon EC2 c6i.4xlarge/, BIOS 1.0 10/16/2017 RIP: 0010:iptable_nat_table_init (net/ipv4/netfilter/iptable_nat.c:87 net/ipv4/netfilter/iptable_nat.c:121) iptable_nat Code: 10 4c 89 f6 48 89 ef e8 0b 19 bb ff 41 89 c4 85 c0 75 38 41 83 c7 01 49 83 c6 28 41 83 ff 04 75 dc 48 8b 44 24 08 48 8b 0c 24 <48> 89 08 4c 89 ef e8 a2 3b a2 cf 48 83 c4 10 44 89 e0 5b 5d 41 5c RSP: 0018:ffffbef902843cd0 EFLAGS: 00010246 RAX: 0000000000000013 RBX: ffff9f4b052caa20 RCX: ffff9f4b20988d80 RDX: 0000000000000000 RSI: 0000000000000064 RDI: ffffffffc04201c0 RBP: ffff9f4b29394000 R08: ffff9f4b07f77258 R09: ffff9f4b07f77240 R10: 0000000000000000 R11: ffff9f4b09635388 R12: 0000000000000000 R13: ffff9f4b1a3c6c00 R14: ffff9f4b20988e20 R15: 0000000000000004 FS: 00007f6284340000(0000) GS:ffff9f51fe280000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000013 CR3: 00000001d10a6005 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? show_trace_log_lvl (arch/x86/kernel/dumpstack.c:259) ? show_trace_log_lvl (arch/x86/kernel/dumpstack.c:259) ? xt_find_table_lock (net/netfilter/x_tables.c:1259) ? __die_body.cold (arch/x86/kernel/dumpstack.c:478 arch/x86/kernel/dumpstack.c:420) ? page_fault_oops (arch/x86/mm/fault.c:727) ? exc_page_fault (./arch/x86/include/asm/irqflags.h:40 ./arch/x86/include/asm/irqflags.h:75 arch/x86/mm/fault.c:1470 arch/x86/mm/fault.c:1518) ? asm_exc_page_fault (./arch/x86/include/asm/idtentry.h:570) ? iptable_nat_table_init (net/ipv4/netfilter/iptable_nat.c:87 net/ipv4/netfilter/iptable_nat.c:121) iptable_nat xt_find_table_lock (net/netfilter/x_tables.c:1259) xt_request_find_table_lock (net/netfilter/x_tables.c:1287) get_info (net/ipv4/netfilter/ip_tables.c:965) ? security_capable (security/security.c:809 (discriminator 13)) ? ns_capable (kernel/capability.c:376 kernel/capability.c:397) ? do_ipt_get_ctl (net/ipv4/netfilter/ip_tables.c:1656) ? 
bpfilter_send_req (net/bpfilter/bpfilter_kern.c:52) bpfilter nf_getsockopt (net/netfilter/nf_sockopt.c:116) ip_getsockopt (net/ipv4/ip_sockglue.c:1827) __sys_getsockopt (net/socket.c:2327) __x64_sys_getsockopt (net/socket.c:2342 net/socket.c:2339 net/socket.c:2339) do_syscall_64 (arch/x86/entry/common.c:51 arch/x86/entry/common.c:81) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:121) RIP: 0033:0x7f62844685ee Code: 48 8b 0d 45 28 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 37 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 09 RSP: 002b:00007ffd1f83d638 EFLAGS: 00000246 ORIG_RAX: 0000000000000037 RAX: ffffffffffffffda RBX: 00007ffd1f83d680 RCX: 00007f62844685ee RDX: 0000000000000040 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 0000000000000004 R08: 00007ffd1f83d670 R09: 0000558798ffa2a0 R10: 00007ffd1f83d680 R11: 0000000000000246 R12: 00007ffd1f83e3b2 R13: 00007f628455baa0 R14: 00007ffd1f83d7b0 R15: 00007f628457a008 Modules linked in: iptable_nat(+) bpfilter rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache veth xt_state xt_connmark xt_nat xt_statistic xt_MASQUERADE xt_mark xt_addrtype ipt_REJECT nf_reject_ipv4 nft_chain_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 xt_comment nft_compat nf_tables nfnetlink overlay nls_ascii nls_cp437 vfat fat ghash_clmulni_intel aesni_intel ena crypto_simd ptp cryptd i8042 pps_core serio button sunrpc sch_fq_codel configfs loop dm_mod fuse dax dmi_sysfs crc32_pclmul crc32c_intel efivarfs CR2: 0000000000000013 Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Reported-by: Takahiro Kawahara Signed-off-by: Kuniyuki Iwashima Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/iptable_nat.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 4d42d0756fd7..a5db7c67d61b 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -145,25 +145,27 @@ static struct pernet_operations iptable_nat_net_ops = { static int __init iptable_nat_init(void) { - int ret = xt_register_template(&nf_nat_ipv4_table, - iptable_nat_table_init); + int ret; + /* net->gen->ptr[iptable_nat_net_id] must be allocated + * before calling iptable_nat_table_init(). + */ + ret = register_pernet_subsys(&iptable_nat_net_ops); if (ret < 0) return ret; - ret = register_pernet_subsys(&iptable_nat_net_ops); - if (ret < 0) { - xt_unregister_template(&nf_nat_ipv4_table); - return ret; - } + ret = xt_register_template(&nf_nat_ipv4_table, + iptable_nat_table_init); + if (ret < 0) + unregister_pernet_subsys(&iptable_nat_net_ops); return ret; } static void __exit iptable_nat_exit(void) { - unregister_pernet_subsys(&iptable_nat_net_ops); xt_unregister_template(&nf_nat_ipv4_table); + unregister_pernet_subsys(&iptable_nat_net_ops); } module_init(iptable_nat_init); -- cgit From c22921df777de5606f1047b1345b8d22ef1c0b34 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 25 Jul 2024 12:28:21 -0700 Subject: netfilter: iptables: Fix potential null-ptr-deref in ip6table_nat_table_init(). ip6table_nat_table_init() accesses net->gen->ptr[ip6table_nat_net_ops.id], but the function is exposed to user space before the entry is allocated via register_pernet_subsys(). Let's call register_pernet_subsys() before xt_register_template(). 
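The shape of the fix is the same for the IPv4 and IPv6 tables: register the pernet subsystem first so the per-netns storage exists before the template callback can be reached from user space, unwind it if template registration fails, and tear down in reverse order on exit. Condensed sketch (identifiers shortened from the actual diffs):

    static int __init table_nat_init(void)
    {
        int ret;

        /* per-netns data must be allocated before the table_init
         * callback becomes reachable from user space
         */
        ret = register_pernet_subsys(&nat_net_ops);
        if (ret < 0)
            return ret;

        ret = xt_register_template(&nat_table, nat_table_init);
        if (ret < 0)
            unregister_pernet_subsys(&nat_net_ops);
        return ret;
    }

    static void __exit table_nat_exit(void)
    {
        xt_unregister_template(&nat_table);
        unregister_pernet_subsys(&nat_net_ops);
    }
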
Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6table_nat.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 52cf104e3478..e119d4f090cc 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -147,23 +147,27 @@ static struct pernet_operations ip6table_nat_net_ops = { static int __init ip6table_nat_init(void) { - int ret = xt_register_template(&nf_nat_ipv6_table, - ip6table_nat_table_init); + int ret; + /* net->gen->ptr[ip6table_nat_net_id] must be allocated + * before calling ip6t_nat_register_lookups(). + */ + ret = register_pernet_subsys(&ip6table_nat_net_ops); if (ret < 0) return ret; - ret = register_pernet_subsys(&ip6table_nat_net_ops); + ret = xt_register_template(&nf_nat_ipv6_table, + ip6table_nat_table_init); if (ret) - xt_unregister_template(&nf_nat_ipv6_table); + unregister_pernet_subsys(&ip6table_nat_net_ops); return ret; } static void __exit ip6table_nat_exit(void) { - unregister_pernet_subsys(&ip6table_nat_net_ops); xt_unregister_template(&nf_nat_ipv6_table); + unregister_pernet_subsys(&ip6table_nat_net_ops); } module_init(ip6table_nat_init); -- cgit From d516b187a9cc2e842030dd005be2735db3e8f395 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 30 Jul 2024 21:51:52 +0200 Subject: r8169: don't increment tx_dropped in case of NETDEV_TX_BUSY The skb isn't consumed in case of NETDEV_TX_BUSY, therefore don't increment the tx_dropped counter. Fixes: 188f4af04618 ("r8169: use NETDEV_TX_{BUSY/OK}") Cc: stable@vger.kernel.org Suggested-by: Jakub Kicinski Signed-off-by: Heiner Kallweit Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/bbba9c48-8bac-4932-9aa1-d2ed63bc9433@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 714d2e804694..3507c2e28110 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4349,7 +4349,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb, if (unlikely(!rtl_tx_slots_avail(tp))) { if (net_ratelimit()) netdev_err(dev, "BUG! Tx Ring full when queue awake!\n"); - goto err_stop_0; + netif_stop_queue(dev); + return NETDEV_TX_BUSY; } opts[1] = rtl8169_tx_vlan_tag(skb); @@ -4405,11 +4406,6 @@ err_dma_0: dev_kfree_skb_any(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; - -err_stop_0: - netif_stop_queue(dev); - dev->stats.tx_dropped++; - return NETDEV_TX_BUSY; } static unsigned int rtl_last_frag_len(struct sk_buff *skb) -- cgit From 8f73ef82985890e484efaed816b172fdf35c87aa Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 30 Jul 2024 09:14:03 -0700 Subject: net: Add skbuff.h to MAINTAINERS The network maintainers need to be copied if the skbuff.h is touched. This also helps git-send-email to figure out the proper maintainers when touching the file. 
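In .ndo_start_xmit() terms: NETDEV_TX_BUSY means the skb was not consumed and the core will requeue it, so it is not a drop; tx_dropped belongs only on paths that actually free the skb and return NETDEV_TX_OK. Condensed sketch of the intended pattern (conditions are placeholders, the structure mirrors the diff below):

    if (unlikely(!tx_slots_available)) {
        netif_stop_queue(dev);
        return NETDEV_TX_BUSY;       /* skb not consumed: no counter update */
    }

    if (mapping_failed) {
        dev_kfree_skb_any(skb);      /* skb really dropped here ...        */
        dev->stats.tx_dropped++;     /* ... so account for it ...          */
        return NETDEV_TX_OK;         /* ... and report it as consumed      */
    }
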
Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240730161404.2028175-1-leitao@debian.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index c0a3d9e93689..1ca8e36e49bd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15934,6 +15934,7 @@ F: include/linux/in.h F: include/linux/indirect_call_wrapper.h F: include/linux/net.h F: include/linux/netdevice.h +F: include/linux/skbuff.h F: include/net/ F: include/uapi/linux/in.h F: include/uapi/linux/net.h -- cgit From 1b75da22ed1e6171e261bc9265370162553d5393 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 30 Jul 2024 09:16:30 +0300 Subject: net/mlx5: Always drain health in shutdown callback There is no point in recovery during device shutdown. if health work started need to wait for it to avoid races and NULL pointer access. Hence, drain health WQ on shutdown callback. Fixes: 1958fc2f0712 ("net/mlx5: SF, Add auxiliary device driver") Fixes: d2aa060d40fa ("net/mlx5: Cancel health poll before sending panic teardown command") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 527da58c7953..5b7e6f4b5c7e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -2142,7 +2142,6 @@ static int mlx5_try_fast_unload(struct mlx5_core_dev *dev) /* Panic tear down fw command will stop the PCI bus communication * with the HCA, so the health poll is no longer needed. */ - mlx5_drain_health_wq(dev); mlx5_stop_health_poll(dev, false); ret = mlx5_cmd_fast_teardown_hca(dev); @@ -2177,6 +2176,7 @@ static void shutdown(struct pci_dev *pdev) mlx5_core_info(dev, "Shutdown was called\n"); set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state); + mlx5_drain_health_wq(dev); err = mlx5_try_fast_unload(dev); if (err) mlx5_unload_one(dev, false); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index b2986175d9af..b706f1486504 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -112,6 +112,7 @@ static void mlx5_sf_dev_shutdown(struct auxiliary_device *adev) struct mlx5_core_dev *mdev = sf_dev->mdev; set_bit(MLX5_BREAK_FW_WAIT, &mdev->intf_state); + mlx5_drain_health_wq(mdev); mlx5_unload_one(mdev, false); } -- cgit From a4557b0b57c40871ff00da4f623cf79211e052f3 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 30 Jul 2024 09:16:31 +0300 Subject: net/mlx5: Fix error handling in irq_pool_request_irq In case mlx5_irq_alloc fails, the previously allocated index remains in the XArray, which could lead to inconsistencies. Fix it by adding error handling that erases the allocated index from the XArray if mlx5_irq_alloc returns an error. 
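The underlying pattern: an index is reserved in an XArray first, and if the later allocation step fails that reservation must be erased, otherwise a stale entry stays behind. Condensed sketch (the xa_alloc() line is schematic, only the error handling mirrors the patch):

    /* reserve an index, then attempt the actual IRQ allocation */
    err = xa_alloc(&pool->irqs, &irq_index, NULL, xa_limit_32b, GFP_KERNEL);
    if (err)
        return ERR_PTR(err);

    irq = mlx5_irq_alloc(pool, irq_index, af_desc, NULL);
    if (IS_ERR(irq))
        xa_erase(&pool->irqs, irq_index);   /* undo the reservation on failure */

    return irq;
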
Fixes: c36326d38d93 ("net/mlx5: Round-Robin EQs over IRQs") Signed-off-by: Shay Drory Reviewed-by: Maher Sanalla Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c index f7b01b3f0cba..1477db7f5307 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c @@ -48,6 +48,7 @@ static struct mlx5_irq * irq_pool_request_irq(struct mlx5_irq_pool *pool, struct irq_affinity_desc *af_desc) { struct irq_affinity_desc auto_desc = {}; + struct mlx5_irq *irq; u32 irq_index; int err; @@ -64,9 +65,12 @@ irq_pool_request_irq(struct mlx5_irq_pool *pool, struct irq_affinity_desc *af_de else cpu_get(pool, cpumask_first(&af_desc->mask)); } - return mlx5_irq_alloc(pool, irq_index, - cpumask_empty(&auto_desc.mask) ? af_desc : &auto_desc, - NULL); + irq = mlx5_irq_alloc(pool, irq_index, + cpumask_empty(&auto_desc.mask) ? af_desc : &auto_desc, + NULL); + if (IS_ERR(irq)) + xa_erase(&pool->irqs, irq_index); + return irq; } /* Looking for the IRQ with the smallest refcount that fits req_mask. -- cgit From 94a3ad6c081381fa9ee523781789802b4ed00faf Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 30 Jul 2024 09:16:32 +0300 Subject: net/mlx5: DR, Fix 'stack guard page was hit' error in dr_rule This patch reduces the size of the hw_ste_arr_optimized array that is allocated on the stack from 640 bytes (5 match STEs + 5 action STEs) to 448 bytes (2 match STEs + 5 action STEs). This fixes the 'stack guard page was hit' issue, while still fitting the majority of use cases (up to 2 match STEs). Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c index 042ca0349124..d1db04baa1fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c @@ -7,7 +7,7 @@ /* don't try to optimize STE allocation if the stack is too constaraining */ #define DR_RULE_MAX_STES_OPTIMIZED 0 #else -#define DR_RULE_MAX_STES_OPTIMIZED 5 +#define DR_RULE_MAX_STES_OPTIMIZED 2 #endif #define DR_RULE_MAX_STE_CHAIN_OPTIMIZED (DR_RULE_MAX_STES_OPTIMIZED + DR_ACTION_MAX_STES) -- cgit From 3fda84dc090390573cfbd0b1d70372663315de21 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 30 Jul 2024 09:16:33 +0300 Subject: net/mlx5: Lag, don't use the hardcoded value of the first port The cited commit didn't change the body of the loop as it should. It shouldn't be using MLX5_LAG_P1.
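The bug class is easier to see in reduced form: the loop walks every port, but the comparison keeps indexing the first slot, so only port 0 can ever match. The sketch below uses hypothetical types, not the mlx5 lag structures:

struct slave_port { void *netdev; };	/* hypothetical, not the mlx5 lag type */

/* Return the index of the port owning @slave, or -1 if it is not found. */
static int find_slave_port(const struct slave_port *pf, int nports, void *slave)
{
	int i;

	for (i = 0; i < nports; i++) {
		/* The buggy form compared pf[0].netdev on every iteration;
		 * the loop variable must be used as the index instead.
		 */
		if (pf[i].netdev == slave)
			return i;
	}
	return -1;
}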
Fixes: 7e978e7714d6 ("net/mlx5: Lag, use actual number of lag ports") Signed-off-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index d0871c46b8c5..cf8045b92689 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -1538,7 +1538,7 @@ u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, goto unlock; for (i = 0; i < ldev->ports; i++) { - if (ldev->pf[MLX5_LAG_P1].netdev == slave) { + if (ldev->pf[i].netdev == slave) { port = i; break; } -- cgit From 572f9caa9e7295f8c8822e4122c7ae8f1c412ff9 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 30 Jul 2024 09:16:34 +0300 Subject: net/mlx5: Fix missing lock on sync reset reload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On sync reset reload work, when remote host updates devlink on reload actions performed on that host, it misses taking devlink lock before calling devlink_remote_reload_actions_performed() which results in triggering lock assert like the following: WARNING: CPU: 4 PID: 1164 at net/devlink/core.c:261 devl_assert_locked+0x3e/0x50 … CPU: 4 PID: 1164 Comm: kworker/u96:6 Tainted: G S W 6.10.0-rc2+ #116 Hardware name: Supermicro SYS-2028TP-DECTR/X10DRT-PT, BIOS 2.0 12/18/2015 Workqueue: mlx5_fw_reset_events mlx5_sync_reset_reload_work [mlx5_core] RIP: 0010:devl_assert_locked+0x3e/0x50 … Call Trace: ? __warn+0xa4/0x210 ? devl_assert_locked+0x3e/0x50 ? report_bug+0x160/0x280 ? handle_bug+0x3f/0x80 ? exc_invalid_op+0x17/0x40 ? asm_exc_invalid_op+0x1a/0x20 ? devl_assert_locked+0x3e/0x50 devlink_notify+0x88/0x2b0 ? mlx5_attach_device+0x20c/0x230 [mlx5_core] ? __pfx_devlink_notify+0x10/0x10 ? 
process_one_work+0x4b6/0xbb0 process_one_work+0x4b6/0xbb0 […] Fixes: 84a433a40d0e ("net/mlx5: Lock mlx5 devlink reload callbacks") Signed-off-by: Moshe Shemesh Reviewed-by: Maor Gottlieb Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index 979c49ae6b5c..b43ca0b762c3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -207,6 +207,7 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev) static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unloaded) { struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + struct devlink *devlink = priv_to_devlink(dev); /* if this is the driver that initiated the fw reset, devlink completed the reload */ if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) { @@ -218,9 +219,11 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unload mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n"); else mlx5_load_one(dev, true); - devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0, + devl_lock(devlink); + devlink_remote_reload_actions_performed(devlink, 0, BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE)); + devl_unlock(devlink); } } -- cgit From 06827e27fdcd197557be72b2229dbd362303794f Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 30 Jul 2024 09:16:35 +0300 Subject: net/mlx5e: Require mlx5 tc classifier action support for IPsec prio capability Require mlx5 classifier action support when creating IPsec chains in the offload path. MLX5_IPSEC_CAP_PRIO should only be set if CONFIG_MLX5_CLS_ACT is enabled. If CONFIG_MLX5_CLS_ACT=n and MLX5_IPSEC_CAP_PRIO is set, configuring IPsec offload will fail because the mlx5 IPsec chain rules cannot be created without classifier action support.
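A minimal sketch of the gating pattern, with hypothetical names (device_caps(), CAP_PRIO): a capability bit whose implementation depends on a Kconfig option is only advertised when that option is built in, so later setup code never sees a capability it cannot honour.

#include <linux/kconfig.h>
#include <linux/bits.h>

#define CAP_PRIO	BIT(0)		/* illustrative capability flag */

static u32 device_caps(bool hw_supports_ignore_flow_level)
{
	u32 caps = 0;

	/* Advertise the priority capability only when the classifier
	 * action support it depends on is compiled into the kernel.
	 */
	if (IS_ENABLED(CONFIG_MLX5_CLS_ACT) && hw_supports_ignore_flow_level)
		caps |= CAP_PRIO;

	return caps;
}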
Fixes: fa5aa2f89073 ("net/mlx5e: Use chains for IPsec policy priority offload") Signed-off-by: Rahul Rameshbabu Reviewed-by: Leon Romanovsky Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c index 6e00afe4671b..797db853de36 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c @@ -51,9 +51,10 @@ u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev) MLX5_CAP_FLOWTABLE_NIC_RX(mdev, decap)) caps |= MLX5_IPSEC_CAP_PACKET_OFFLOAD; - if ((MLX5_CAP_FLOWTABLE_NIC_TX(mdev, ignore_flow_level) && - MLX5_CAP_FLOWTABLE_NIC_RX(mdev, ignore_flow_level)) || - MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, ignore_flow_level)) + if (IS_ENABLED(CONFIG_MLX5_CLS_ACT) && + ((MLX5_CAP_FLOWTABLE_NIC_TX(mdev, ignore_flow_level) && + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, ignore_flow_level)) || + MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, ignore_flow_level))) caps |= MLX5_IPSEC_CAP_PRIO; if (MLX5_CAP_FLOWTABLE_NIC_TX(mdev, -- cgit From 025f2b85a5e5a46df14ecf162c3c80a957a36d0b Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Tue, 30 Jul 2024 09:16:36 +0300 Subject: net/mlx5e: Fix CT entry update leaks of modify header context The cited commit allocates a new modify header to replace the old one when updating a CT entry. But if allocating a new one fails, e.g. because it exceeds the max number the firmware can support, the modify header will be an error pointer that will trigger a panic when deallocating it. And the old modify header pointer is copied to the old attr. When the old attr is freed, the old modify header is lost. Fix it by restoring the old attr to attr when allocation of a new modify header context fails. So when the CT entry is freed, the right modify header context will be freed. And the panic from accessing the error pointer is also fixed. Fixes: 94ceffb48eac ("net/mlx5e: Implement CT entry update") Signed-off-by: Chris Mi Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-8-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 8cf8ba2622f2..71a168746ebe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -932,6 +932,7 @@ err_rule: mlx5_tc_ct_entry_destroy_mod_hdr(ct_priv, zone_rule->attr, mh); mlx5_put_label_mapping(ct_priv, attr->ct_attr.ct_labels_id); err_mod_hdr: + *attr = *old_attr; kfree(old_attr); err_attr: kvfree(spec); -- cgit From 3f8e82a020a5c22f9b791f4ac499b8e18007fbda Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Tue, 30 Jul 2024 09:16:37 +0300 Subject: net/mlx5e: Add a check for the return value from mlx5_port_set_eth_ptys Since the documentation for mlx5_toggle_port_link states that it should only be used after setting the port register, we add a check for the return value from mlx5_port_set_eth_ptys to ensure the register was successfully set before calling it.
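The ordering requirement reduces to a simple error-propagation sketch; set_port_reg() and toggle_link() below are hypothetical stand-ins for the mlx5 helpers, not their real signatures:

#include <linux/types.h>

struct port_dev;			/* hypothetical device handle */

int set_port_reg(struct port_dev *dev, u32 link_modes, bool an_disable);
void toggle_link(struct port_dev *dev);

static int apply_link_settings(struct port_dev *dev, u32 link_modes,
			       bool an_disable)
{
	int err;

	err = set_port_reg(dev, link_modes, an_disable);
	if (err)
		return err;	/* never toggle the link after a failed write */

	toggle_link(dev);	/* safe: the register is known to be set */
	return 0;
}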
Fixes: 667daedaecd1 ("net/mlx5e: Toggle link only after modifying port parameters") Signed-off-by: Shahar Shitrit Reviewed-by: Carolina Jubran Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Link: https://patch.msgid.link/20240730061638.1831002-9-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 00d5661dc62e..36845872ae94 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1409,7 +1409,12 @@ static int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv, if (!an_changes && link_modes == eproto.admin) goto out; - mlx5_port_set_eth_ptys(mdev, an_disable, link_modes, ext); + err = mlx5_port_set_eth_ptys(mdev, an_disable, link_modes, ext); + if (err) { + netdev_err(priv->netdev, "%s: failed to set ptys reg: %d\n", __func__, err); + goto out; + } + mlx5_toggle_port_link(mdev); out: -- cgit From c4d6a347ba7babdf9d90a0eb24048c266cae0532 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Tue, 30 Jul 2024 08:31:04 +0200 Subject: net: wan: fsl_qmc_hdlc: Convert carrier_lock spinlock to a mutex The carrier_lock spinlock protects the carrier detection. While it is held, framer_get_status() is called which in turn takes a mutex. This is not correct and can lead to a deadlock. A run with PROVE_LOCKING enabled detected the issue: [ BUG: Invalid wait context ] ... c204ddbc (&framer->mutex){+.+.}-{3:3}, at: framer_get_status+0x40/0x78 other info that might help us debug this: context-{4:4} 2 locks held by ifconfig/146: #0: c0926a38 (rtnl_mutex){+.+.}-{3:3}, at: devinet_ioctl+0x12c/0x664 #1: c2006a40 (&qmc_hdlc->carrier_lock){....}-{2:2}, at: qmc_hdlc_framer_set_carrier+0x30/0x98 Avoid the spinlock usage and convert carrier_lock to a mutex. 
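A reduced sketch of the locking rule behind the conversion; query_status() is a hypothetical stand-in for framer_get_status(). A function that may sleep cannot be called under a spinlock, so the lock protecting carrier detection becomes a mutex, taken here with the kernel's scope-based guard() helper just as the driver does.

#include <linux/mutex.h>
#include <linux/cleanup.h>

static DEFINE_MUTEX(carrier_lock);	/* was a spinlock before the conversion */

int query_status(void);			/* may sleep, e.g. takes a mutex itself */

static int update_carrier(void)
{
	guard(mutex)(&carrier_lock);	/* dropped automatically on return */

	/* Legal only because carrier_lock is now a sleeping lock too. */
	return query_status();
}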
Fixes: 54762918ca85 ("net: wan: fsl_qmc_hdlc: Add framer support") Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240730063104.179553-1-herve.codina@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/fsl_qmc_hdlc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c index c5e7ca793c43..64b4bfa6fea7 100644 --- a/drivers/net/wan/fsl_qmc_hdlc.c +++ b/drivers/net/wan/fsl_qmc_hdlc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,7 @@ struct qmc_hdlc { struct qmc_chan *qmc_chan; struct net_device *netdev; struct framer *framer; - spinlock_t carrier_lock; /* Protect carrier detection */ + struct mutex carrier_lock; /* Protect carrier detection */ struct notifier_block nb; bool is_crc32; spinlock_t tx_lock; /* Protect tx descriptors */ @@ -60,7 +61,7 @@ static int qmc_hdlc_framer_set_carrier(struct qmc_hdlc *qmc_hdlc) if (!qmc_hdlc->framer) return 0; - guard(spinlock_irqsave)(&qmc_hdlc->carrier_lock); + guard(mutex)(&qmc_hdlc->carrier_lock); ret = framer_get_status(qmc_hdlc->framer, &framer_status); if (ret) { @@ -706,7 +707,7 @@ static int qmc_hdlc_probe(struct platform_device *pdev) qmc_hdlc->dev = dev; spin_lock_init(&qmc_hdlc->tx_lock); - spin_lock_init(&qmc_hdlc->carrier_lock); + mutex_init(&qmc_hdlc->carrier_lock); qmc_hdlc->qmc_chan = devm_qmc_chan_get_bychild(dev, dev->of_node); if (IS_ERR(qmc_hdlc->qmc_chan)) -- cgit From e549360069b4a57e111b8222fc072f3c7c1688ab Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Tue, 30 Jul 2024 08:31:33 +0200 Subject: net: wan: fsl_qmc_hdlc: Discard received CRC Frames received from the QMC contain the CRC. Upper layers don't need this CRC, and tcpdump reported trailing junk data due to its presence. As other HDLC drivers do, simply discard this CRC. Fixes: d0f2258e79fd ("net: wan: Add support for QMC HDLC") Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240730063133.179598-1-herve.codina@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/fsl_qmc_hdlc.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c index 64b4bfa6fea7..8fcfbde31a1c 100644 --- a/drivers/net/wan/fsl_qmc_hdlc.c +++ b/drivers/net/wan/fsl_qmc_hdlc.c @@ -250,6 +250,7 @@ static void qmc_hcld_recv_complete(void *context, size_t length, unsigned int fl struct qmc_hdlc_desc *desc = context; struct net_device *netdev; struct qmc_hdlc *qmc_hdlc; + size_t crc_size; int ret; netdev = desc->netdev; @@ -268,15 +269,26 @@ static void qmc_hcld_recv_complete(void *context, size_t length, unsigned int fl if (flags & QMC_RX_FLAG_HDLC_CRC) /* CRC error */ netdev->stats.rx_crc_errors++; kfree_skb(desc->skb); - } else { - netdev->stats.rx_packets++; - netdev->stats.rx_bytes += length; + goto re_queue; + } - skb_put(desc->skb, length); - desc->skb->protocol = hdlc_type_trans(desc->skb, netdev); - netif_rx(desc->skb); + /* Discard the CRC */ + crc_size = qmc_hdlc->is_crc32 ?
4 : 2; + if (length < crc_size) { + netdev->stats.rx_length_errors++; + kfree_skb(desc->skb); + goto re_queue; } + length -= crc_size; + + netdev->stats.rx_packets++; + netdev->stats.rx_bytes += length; + + skb_put(desc->skb, length); + desc->skb->protocol = hdlc_type_trans(desc->skb, netdev); + netif_rx(desc->skb); +re_queue: /* Re-queue a transfer using the same descriptor */ ret = qmc_hdlc_recv_queue(qmc_hdlc, desc, desc->dma_size); if (ret) { -- cgit From 4efce726e0cbc723178eea5b944e13775f628ecc Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 30 Jul 2024 12:40:16 +0200 Subject: net: MAINTAINERS: Demote Qualcomm IPA to "maintained" To the best of my knowledge, Alex Elder is not being paid to support Qualcomm IPA networking drivers, so drop the status from "supported" to "maintained". Signed-off-by: Krzysztof Kozlowski Acked-by: Alex Elder Link: https://patch.msgid.link/20240730104016.22103-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1ca8e36e49bd..11b325268e2e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18555,7 +18555,7 @@ F: drivers/usb/misc/qcom_eud.c QCOM IPA DRIVER M: Alex Elder L: netdev@vger.kernel.org -S: Supported +S: Maintained F: drivers/net/ipa/ QEMU MACHINE EMULATOR AND VIRTUALIZER SUPPORT -- cgit From b9e7fc0aeda79031a101610b2fcb12bf031056e9 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Tue, 30 Jul 2024 10:33:02 -0700 Subject: igc: Fix double reset adapter triggered from a single taprio cmd Following the implementation of the "igc: Add TransmissionOverrun counter" patch, when a taprio command is triggered by the user, igc processes two commands: TAPRIO_CMD_REPLACE followed by TAPRIO_CMD_STATS. However, both commands unconditionally pass through igc_tsn_offload_apply() which evaluates and triggers an adapter reset. The double reset causes issues in the calculation of adapter->qbv_count in igc. The TAPRIO_CMD_REPLACE command is expected to reset the adapter since it activates qbv. It's unexpected for TAPRIO_CMD_STATS to do the same because it doesn't configure any driver-specific TSN settings. So, the evaluation in igc_tsn_offload_apply() isn't needed for TAPRIO_CMD_STATS. To address this, command parsing is relocated to igc_tsn_enable_qbv_scheduling(). Commands that don't require an adapter reset will exit after processing, thus avoiding igc_tsn_offload_apply().
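The resulting control flow can be sketched with hypothetical names (the enum values and helpers below are not the igc symbols): read-only statistics queries return immediately, and only commands that change the schedule fall through to the single apply/reset step.

#include <linux/errno.h>

enum taprio_cmd { CMD_REPLACE, CMD_DESTROY, CMD_STATS };

int save_schedule(void);	/* hypothetical: programs Qbv, needs a reset */
int clear_schedule(void);	/* hypothetical: tears Qbv down, needs a reset */
void fill_stats(void);		/* hypothetical: read-only query */
int apply_offload(void);	/* hypothetical: the single reset point */

static int handle_taprio(enum taprio_cmd cmd)
{
	int err;

	switch (cmd) {
	case CMD_REPLACE:
		err = save_schedule();
		break;
	case CMD_DESTROY:
		err = clear_schedule();
		break;
	case CMD_STATS:
		fill_stats();
		return 0;	/* no adapter reset for statistics */
	default:
		return -EOPNOTSUPP;
	}

	if (err)
		return err;

	return apply_offload();
}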
Fixes: d3750076d464 ("igc: Add TransmissionOverrun counter") Signed-off-by: Faizal Rahim Acked-by: Vinicius Costa Gomes Reviewed-by: Vladimir Oltean Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20240730173304.865479-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_main.c | 33 ++++++++++++++++--------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index cb5c7b09e8a0..8daf938afc36 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6306,21 +6306,6 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, size_t n; int i; - switch (qopt->cmd) { - case TAPRIO_CMD_REPLACE: - break; - case TAPRIO_CMD_DESTROY: - return igc_tsn_clear_schedule(adapter); - case TAPRIO_CMD_STATS: - igc_taprio_stats(adapter->netdev, &qopt->stats); - return 0; - case TAPRIO_CMD_QUEUE_STATS: - igc_taprio_queue_stats(adapter->netdev, &qopt->queue_stats); - return 0; - default: - return -EOPNOTSUPP; - } - if (qopt->base_time < 0) return -ERANGE; @@ -6429,7 +6414,23 @@ static int igc_tsn_enable_qbv_scheduling(struct igc_adapter *adapter, if (hw->mac.type != igc_i225) return -EOPNOTSUPP; - err = igc_save_qbv_schedule(adapter, qopt); + switch (qopt->cmd) { + case TAPRIO_CMD_REPLACE: + err = igc_save_qbv_schedule(adapter, qopt); + break; + case TAPRIO_CMD_DESTROY: + err = igc_tsn_clear_schedule(adapter); + break; + case TAPRIO_CMD_STATS: + igc_taprio_stats(adapter->netdev, &qopt->stats); + return 0; + case TAPRIO_CMD_QUEUE_STATS: + igc_taprio_queue_stats(adapter->netdev, &qopt->queue_stats); + return 0; + default: + return -EOPNOTSUPP; + } + if (err) return err; -- cgit From a46c68debf3be3a477a69ccbf0a1d050df841676 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Mon, 29 Jul 2024 17:17:48 -0700 Subject: ipv6: fix ndisc_is_useropt() handling for PIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current logic only works if the PIO is between two other ND user options. This fixes it so that the PIO can also be either before or after other ND user options (for example the first or last option in the RA). side note: there's actually Android tests verifying a portion of the old broken behaviour, so: https://android-review.googlesource.com/c/kernel/tests/+/3196704 fixes those up. 
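In reduced form the parsing change looks like the sketch below; the types and helpers are illustrative, not the ndisc structures. Every option is now tested for user-space relaying, not only the options the kernel fails to recognize, so a PIO no longer needs to be bracketed by other user options to land inside the recorded span.

#include <linux/types.h>

struct nd_option { int type; };

bool kernel_handles(int type);	/* hypothetical: option has its own switch case */
bool is_useropt(int type);	/* hypothetical: should be relayed to user space */

static void record_useropts(struct nd_option **opts, int n,
			    struct nd_option **user_first,
			    struct nd_option **user_last)
{
	int i;

	for (i = 0; i < n; i++) {
		struct nd_option *opt = opts[i];

		/* Record the user-visible span for every option, including
		 * ones the kernel also consumed itself (such as a PIO).
		 */
		if (is_useropt(opt->type)) {
			if (!*user_first)
				*user_first = opt;
			*user_last = opt;
		} else if (!kernel_handles(opt->type)) {
			/* unknown option: silently ignored for forward
			 * compatibility, as before
			 */
		}
	}
}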
Cc: Jen Linkova Cc: Lorenzo Colitti Cc: Patrick Rohr Cc: David Ahern Cc: YOSHIFUJI Hideaki / 吉藤英明 Cc: Jakub Kicinski Signed-off-by: Maciej Żenczykowski Fixes: 048c796beb6e ("ipv6: adjust ndisc_is_useropt() to also return true for PIO") Link: https://patch.msgid.link/20240730001748.147636-1-maze@google.com Signed-off-by: Paolo Abeni --- net/ipv6/ndisc.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 70a0b2ad6bd7..b8eec1b6cc2c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -227,6 +227,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, return NULL; memset(ndopts, 0, sizeof(*ndopts)); while (opt_len) { + bool unknown = false; int l; if (opt_len < sizeof(struct nd_opt_hdr)) return NULL; @@ -262,22 +263,23 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, break; #endif default: - if (ndisc_is_useropt(dev, nd_opt)) { - ndopts->nd_useropts_end = nd_opt; - if (!ndopts->nd_useropts) - ndopts->nd_useropts = nd_opt; - } else { - /* - * Unknown options must be silently ignored, - * to accommodate future extension to the - * protocol. - */ - ND_PRINTK(2, notice, - "%s: ignored unsupported option; type=%d, len=%d\n", - __func__, - nd_opt->nd_opt_type, - nd_opt->nd_opt_len); - } + unknown = true; + } + if (ndisc_is_useropt(dev, nd_opt)) { + ndopts->nd_useropts_end = nd_opt; + if (!ndopts->nd_useropts) + ndopts->nd_useropts = nd_opt; + } else if (unknown) { + /* + * Unknown options must be silently ignored, + * to accommodate future extension to the + * protocol. + */ + ND_PRINTK(2, notice, + "%s: ignored unsupported option; type=%d, len=%d\n", + __func__, + nd_opt->nd_opt_type, + nd_opt->nd_opt_len); } next_opt: opt_len -= l; -- cgit From 0a567c2a10033bf04ed618368d179bce6977984b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 31 Jul 2024 12:10:14 +0200 Subject: mptcp: fix bad RCVPRUNED mib accounting Since its introduction, the mentioned MIB accounted for the wrong event: wake-up being skipped as not-needed on some edge condition instead of incoming skb being dropped after landing in the (subflow) receive queue. Move the increment in the correct location. Fixes: ce599c516386 ("mptcp: properly account bulk freed memory") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/protocol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a2fc54ed68c0..0d536b183a6c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -350,8 +350,10 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, skb_orphan(skb); /* try to fetch required memory from subflow */ - if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) + if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); goto drop; + } has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; @@ -844,10 +846,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) sk_rbuf = ssk_rbuf; /* over limit? 
can't append more skbs to msk, Also, no need to wake-up*/ - if (__mptcp_rmem(sk) > sk_rbuf) { - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); + if (__mptcp_rmem(sk) > sk_rbuf) return; - } /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); -- cgit From 68cc924729ffcfe90d0383177192030a9aeb2ee4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 31 Jul 2024 12:10:15 +0200 Subject: mptcp: fix duplicate data handling When a subflow receives and discards duplicate data, the mptcp stack assumes that the consumed offset inside the current skb is zero. With multiple subflows receiving data simultaneously, such an assumption does not hold true. As a result the subflow-level copied_seq will be incorrectly increased and later on the same subflow will observe a bad mapping, leading to a subflow reset. Address the issue by taking into account the skb consumed offset in mptcp_subflow_discard_data(). Fixes: 04e4cd4f7ca4 ("mptcp: cleanup mptcp_subflow_discard_data()") Cc: stable@vger.kernel.org Link: https://github.com/multipath-tcp/mptcp_net-next/issues/501 Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/subflow.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 0e4b5bfbeaa1..a21c712350c3 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1230,14 +1230,22 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; - u32 incr; + struct tcp_sock *tp = tcp_sk(ssk); + u32 offset, incr, avail_len; - incr = limit >= skb->len ? skb->len + fin : limit; + offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; + if (WARN_ON_ONCE(offset > skb->len)) + goto out; + + avail_len = skb->len - offset; + incr = limit >= avail_len ? avail_len + fin : limit; - pr_debug("discarding=%d len=%d seq=%d", incr, skb->len, - subflow->map_subflow_seq); + pr_debug("discarding=%d len=%d offset=%d seq=%d", incr, skb->len, + offset, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; + +out: if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) -- cgit
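The sequence arithmetic at the heart of the duplicate-data fix above can be sketched independently of the mptcp structures (the function and parameter names are illustrative): the bytes the subflow has already consumed are subtracted from the skb length before deciding how much duplicate data is left to discard.

#include <linux/types.h>

static u32 duplicate_bytes_to_discard(u32 copied_seq, u32 skb_seq,
				      u32 skb_len, u32 limit)
{
	u32 offset = copied_seq - skb_seq;	/* bytes already consumed */
	u32 avail;

	if (offset > skb_len)			/* should not happen */
		return 0;

	avail = skb_len - offset;		/* only what is actually left */
	return limit >= avail ? avail : limit;	/* never discard past the limit */
}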