From aff234839f8b80ac101e6c2f14d0e44b236efa48 Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 9 Dec 2022 16:44:46 +0000 Subject: KVM: arm64: PMU: Fix PMCR_EL0 reset value ARMV8_PMU_PMCR_N_MASK is an unshifted value which results in the wrong reset value for PMCR_EL0, so shift it to fix it. This fixes the following error when running qemu: $ qemu-system-aarch64 -cpu host -machine type=virt,accel=kvm -kernel ... target/arm/helper.c:1813: pmevcntr_rawwrite: Assertion `counter < pmu_num_counters(env)' failed. Fixes: 292e8f149476 ("KVM: arm64: PMU: Simplify PMCR_EL0 reset handling") Signed-off-by: James Clark Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221209164446.1972014-2-james.clark@arm.com --- arch/arm64/kvm/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d5ee52d6bf73..c6cbfe6b854b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -646,7 +646,7 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) return; /* Only preserve PMCR_EL0.N, and reset the rest to 0 */ - pmcr = read_sysreg(pmcr_el0) & ARMV8_PMU_PMCR_N_MASK; + pmcr = read_sysreg(pmcr_el0) & (ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT); if (!kvm_supports_32bit_el0()) pmcr |= ARMV8_PMU_PMCR_LC; -- cgit From 1c0908d8e441631f5b8ba433523cf39339ee2ba0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Dec 2022 10:02:23 +0000 Subject: rtmutex: Add acquire semantics for rtmutex lock acquisition slow path Jan Kara reported the following bug triggering on 6.0.5-rt14 running dbench on XFS on arm64. kernel BUG at fs/inode.c:625! Internal error: Oops - BUG: 0 [#1] PREEMPT_RT SMP CPU: 11 PID: 6611 Comm: dbench Tainted: G E 6.0.0-rt14-rt+ #1 pc : clear_inode+0xa0/0xc0 lr : clear_inode+0x38/0xc0 Call trace: clear_inode+0xa0/0xc0 evict+0x160/0x180 iput+0x154/0x240 do_unlinkat+0x184/0x300 __arm64_sys_unlinkat+0x48/0xc0 el0_svc_common.constprop.4+0xe4/0x2c0 do_el0_svc+0xac/0x100 el0_svc+0x78/0x200 el0t_64_sync_handler+0x9c/0xc0 el0t_64_sync+0x19c/0x1a0 It also affects 6.1-rc7-rt5 and affects a preempt-rt fork of 5.14 so this is likely a bug that existed forever and only became visible when ARM support was added to preempt-rt. The same problem does not occur on x86-64 and he also reported that converting sb->s_inode_wblist_lock to raw_spinlock_t makes the problem disappear indicating that the RT spinlock variant is the problem. Which in turn means that RT mutexes on ARM64 and any other weakly ordered architecture are affected by this independent of RT. Will Deacon observed: "I'd be more inclined to be suspicious of the slowpath tbh, as we need to make sure that we have acquire semantics on all paths where the lock can be taken. Looking at the rtmutex code, this really isn't obvious to me -- for example, try_to_take_rt_mutex() appears to be able to return via the 'takeit' label without acquire semantics and it looks like we might be relying on the caller's subsequent _unlock_ of the wait_lock for ordering, but that will give us release semantics which aren't correct." Sebastian Andrzej Siewior prototyped a fix that does work based on that comment but it was a little bit overkill and added some fences that should not be necessary. The lock owner is updated with an IRQ-safe raw spinlock held, but the spin_unlock does not provide acquire semantics which are needed when acquiring a mutex. 
Adds the necessary acquire semantics for lock owner updates in the slow path acquisition and the waiter bit logic. It successfully completed 10 iterations of the dbench workload while the vanilla kernel fails on the first iteration. [ bigeasy@linutronix.de: Initial prototype fix ] Fixes: 700318d1d7b38 ("locking/rtmutex: Use acquire/release semantics") Fixes: 23f78d4a03c5 ("[PATCH] pi-futex: rt mutex core") Reported-by: Jan Kara Signed-off-by: Mel Gorman Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20221202100223.6mevpbl7i6x5udfd@techsingularity.net --- kernel/locking/rtmutex.c | 55 ++++++++++++++++++++++++++++++++++++-------- kernel/locking/rtmutex_api.c | 6 ++--- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7779ee8abc2a..010cf4e6d0b8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -89,15 +89,31 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock, * set this bit before looking at the lock. */ -static __always_inline void -rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) +static __always_inline struct task_struct * +rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) { unsigned long val = (unsigned long)owner; if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; - WRITE_ONCE(lock->owner, (struct task_struct *)val); + return (struct task_struct *)val; +} + +static __always_inline void +rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) +{ + /* + * lock->wait_lock is held but explicit acquire semantics are needed + * for a new lock owner so WRITE_ONCE is insufficient. + */ + xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner)); +} + +static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock) +{ + /* lock->wait_lock is held so the unlock provides release semantics. */ + WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL)); } static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) @@ -106,7 +122,8 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } -static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock) +static __always_inline void +fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -172,8 +189,21 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock) * still set. */ owner = READ_ONCE(*p); - if (owner & RT_MUTEX_HAS_WAITERS) - WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); + if (owner & RT_MUTEX_HAS_WAITERS) { + /* + * See rt_mutex_set_owner() and rt_mutex_clear_owner() on + * why xchg_acquire() is used for updating owner for + * locking and WRITE_ONCE() for unlocking. + * + * WRITE_ONCE() would work for the acquire case too, but + * in case that the lock acquisition failed it might + * force other lockers into the slow path unnecessarily. + */ + if (acquire_lock) + xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS); + else + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); + } } /* @@ -208,6 +238,13 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) owner = *p; } while (cmpxchg_relaxed(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); + + /* + * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE + * operations in the event of contention. 
Ensure the successful + * cmpxchg is visible. + */ + smp_mb__after_atomic(); } /* @@ -1243,7 +1280,7 @@ static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) * try_to_take_rt_mutex() sets the lock waiters bit * unconditionally. Clean this up. */ - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock, true); return ret; } @@ -1604,7 +1641,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, * try_to_take_rt_mutex() sets the waiter bit * unconditionally. We might have to fix that up. */ - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock, true); trace_contention_end(lock, ret); @@ -1719,7 +1756,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) * try_to_take_rt_mutex() sets the waiter bit unconditionally. * We might have to fix that up: */ - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock, true); debug_rt_mutex_free_waiter(&waiter); trace_contention_end(lock, 0); diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 900220941caa..cb9fdff76a8a 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -267,7 +267,7 @@ void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock) { debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL); + rt_mutex_clear_owner(lock); } /** @@ -382,7 +382,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. */ - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock, true); raw_spin_unlock_irq(&lock->wait_lock); return ret; @@ -438,7 +438,7 @@ bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. */ - fixup_rt_mutex_waiters(lock); + fixup_rt_mutex_waiters(lock, false); raw_spin_unlock_irq(&lock->wait_lock); -- cgit From f728a5ea27c92133893590e731ce10f6561ced87 Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 6 Dec 2022 14:07:49 +0100 Subject: dma-buf: fix dma_buf_export init order v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The init order and resulting error handling in dma_buf_export was pretty messy. Subordinate objects like the file and the sysfs kernel objects were initializing and wiring itself up with the object in the wrong order resulting not only in complicating and partially incorrect error handling, but also in publishing only halve initialized DMA-buf objects. Clean this up thoughtfully by allocating the file independent of the DMA-buf object. Then allocate and initialize the DMA-buf object itself, before publishing it through sysfs. If everything works as expected the file is then connected with the DMA-buf object and publish it through debugfs. Also adds the missing dma_resv_fini() into the error handling. v2: add some missing changes to dma_bug_getfile() and a missing NULL check in dma_buf_file_release() Signed-off-by: Christian König Reviewed-by: Michael J. Ruhl Reviewed-by: T.J. 
Mercier Acked-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20221209071535.933698-1-christian.koenig@amd.com --- drivers/dma-buf/dma-buf-sysfs-stats.c | 7 +-- drivers/dma-buf/dma-buf-sysfs-stats.h | 4 +- drivers/dma-buf/dma-buf.c | 84 ++++++++++++++++------------------- 3 files changed, 43 insertions(+), 52 deletions(-) diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c index 2bba0babcb62..4b680e10c15a 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -168,14 +168,11 @@ void dma_buf_uninit_sysfs_statistics(void) kset_unregister(dma_buf_stats_kset); } -int dma_buf_stats_setup(struct dma_buf *dmabuf) +int dma_buf_stats_setup(struct dma_buf *dmabuf, struct file *file) { struct dma_buf_sysfs_entry *sysfs_entry; int ret; - if (!dmabuf || !dmabuf->file) - return -EINVAL; - if (!dmabuf->exp_name) { pr_err("exporter name must not be empty if stats needed\n"); return -EINVAL; @@ -192,7 +189,7 @@ int dma_buf_stats_setup(struct dma_buf *dmabuf) /* create the directory for buffer stats */ ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL, - "%lu", file_inode(dmabuf->file)->i_ino); + "%lu", file_inode(file)->i_ino); if (ret) goto err_sysfs_dmabuf; diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.h b/drivers/dma-buf/dma-buf-sysfs-stats.h index a49c6e2650cc..7a8a995b75ba 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.h +++ b/drivers/dma-buf/dma-buf-sysfs-stats.h @@ -13,7 +13,7 @@ int dma_buf_init_sysfs_statistics(void); void dma_buf_uninit_sysfs_statistics(void); -int dma_buf_stats_setup(struct dma_buf *dmabuf); +int dma_buf_stats_setup(struct dma_buf *dmabuf, struct file *file); void dma_buf_stats_teardown(struct dma_buf *dmabuf); #else @@ -25,7 +25,7 @@ static inline int dma_buf_init_sysfs_statistics(void) static inline void dma_buf_uninit_sysfs_statistics(void) {} -static inline int dma_buf_stats_setup(struct dma_buf *dmabuf) +static inline int dma_buf_stats_setup(struct dma_buf *dmabuf, struct file *file) { return 0; } diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index e6f36c014c4c..eb6b59363c4f 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -95,10 +95,11 @@ static int dma_buf_file_release(struct inode *inode, struct file *file) return -EINVAL; dmabuf = file->private_data; - - mutex_lock(&db_list.lock); - list_del(&dmabuf->list_node); - mutex_unlock(&db_list.lock); + if (dmabuf) { + mutex_lock(&db_list.lock); + list_del(&dmabuf->list_node); + mutex_unlock(&db_list.lock); + } return 0; } @@ -523,17 +524,17 @@ static inline int is_dma_buf_file(struct file *file) return file->f_op == &dma_buf_fops; } -static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags) +static struct file *dma_buf_getfile(size_t size, int flags) { static atomic64_t dmabuf_inode = ATOMIC64_INIT(0); - struct file *file; struct inode *inode = alloc_anon_inode(dma_buf_mnt->mnt_sb); + struct file *file; if (IS_ERR(inode)) return ERR_CAST(inode); - inode->i_size = dmabuf->size; - inode_set_bytes(inode, dmabuf->size); + inode->i_size = size; + inode_set_bytes(inode, size); /* * The ->i_ino acquired from get_next_ino() is not unique thus @@ -547,8 +548,6 @@ static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags) flags, &dma_buf_fops); if (IS_ERR(file)) goto err_alloc_file; - file->private_data = dmabuf; - file->f_path.dentry->d_fsdata = dmabuf; return file; @@ -614,19 +613,11 @@ struct dma_buf *dma_buf_export(const struct 
dma_buf_export_info *exp_info) size_t alloc_size = sizeof(struct dma_buf); int ret; - if (!exp_info->resv) - alloc_size += sizeof(struct dma_resv); - else - /* prevent &dma_buf[1] == dma_buf->resv */ - alloc_size += 1; - - if (WARN_ON(!exp_info->priv - || !exp_info->ops - || !exp_info->ops->map_dma_buf - || !exp_info->ops->unmap_dma_buf - || !exp_info->ops->release)) { + if (WARN_ON(!exp_info->priv || !exp_info->ops + || !exp_info->ops->map_dma_buf + || !exp_info->ops->unmap_dma_buf + || !exp_info->ops->release)) return ERR_PTR(-EINVAL); - } if (WARN_ON(exp_info->ops->cache_sgt_mapping && (exp_info->ops->pin || exp_info->ops->unpin))) @@ -638,10 +629,21 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) if (!try_module_get(exp_info->owner)) return ERR_PTR(-ENOENT); + file = dma_buf_getfile(exp_info->size, exp_info->flags); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto err_module; + } + + if (!exp_info->resv) + alloc_size += sizeof(struct dma_resv); + else + /* prevent &dma_buf[1] == dma_buf->resv */ + alloc_size += 1; dmabuf = kzalloc(alloc_size, GFP_KERNEL); if (!dmabuf) { ret = -ENOMEM; - goto err_module; + goto err_file; } dmabuf->priv = exp_info->priv; @@ -653,44 +655,36 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) init_waitqueue_head(&dmabuf->poll); dmabuf->cb_in.poll = dmabuf->cb_out.poll = &dmabuf->poll; dmabuf->cb_in.active = dmabuf->cb_out.active = 0; + mutex_init(&dmabuf->lock); + INIT_LIST_HEAD(&dmabuf->attachments); if (!resv) { - resv = (struct dma_resv *)&dmabuf[1]; - dma_resv_init(resv); + dmabuf->resv = (struct dma_resv *)&dmabuf[1]; + dma_resv_init(dmabuf->resv); + } else { + dmabuf->resv = resv; } - dmabuf->resv = resv; - file = dma_buf_getfile(dmabuf, exp_info->flags); - if (IS_ERR(file)) { - ret = PTR_ERR(file); + ret = dma_buf_stats_setup(dmabuf, file); + if (ret) goto err_dmabuf; - } + file->private_data = dmabuf; + file->f_path.dentry->d_fsdata = dmabuf; dmabuf->file = file; - mutex_init(&dmabuf->lock); - INIT_LIST_HEAD(&dmabuf->attachments); - mutex_lock(&db_list.lock); list_add(&dmabuf->list_node, &db_list.head); mutex_unlock(&db_list.lock); - ret = dma_buf_stats_setup(dmabuf); - if (ret) - goto err_sysfs; - return dmabuf; -err_sysfs: - /* - * Set file->f_path.dentry->d_fsdata to NULL so that when - * dma_buf_release() gets invoked by dentry_ops, it exits - * early before calling the release() dma_buf op. - */ - file->f_path.dentry->d_fsdata = NULL; - fput(file); err_dmabuf: + if (!resv) + dma_resv_fini(dmabuf->resv); kfree(dmabuf); +err_file: + fput(file); err_module: module_put(exp_info->owner); return ERR_PTR(ret); -- cgit From 01258b62c62710297dab4e2b72f46e01be392cc6 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 9 Dec 2022 10:59:37 +0100 Subject: wifi: ti: remove obsolete lines in the Makefile Commit 06463f6e98df ("wifi: wl1251: drop support for platform data") removes TI WiLink platform data, but leaves some dead lines in the Makefile. Remove these obsolete lines in the Makefile. 
Signed-off-by: Lukas Bulwahn Reviewed-by: Dmitry Torokhov Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221209095937.17773-1-lukas.bulwahn@gmail.com --- drivers/net/wireless/ti/Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/wireless/ti/Makefile b/drivers/net/wireless/ti/Makefile index 0530dd744275..05ee016594f8 100644 --- a/drivers/net/wireless/ti/Makefile +++ b/drivers/net/wireless/ti/Makefile @@ -3,6 +3,3 @@ obj-$(CONFIG_WLCORE) += wlcore/ obj-$(CONFIG_WL12XX) += wl12xx/ obj-$(CONFIG_WL1251) += wl1251/ obj-$(CONFIG_WL18XX) += wl18xx/ - -# small builtin driver bit -obj-$(CONFIG_WILINK_PLATFORM_DATA) += wilink_platform_data.o -- cgit From abe3bf7425fb695a9b37394af18b9ea58a800802 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 12 Dec 2022 21:14:17 +0100 Subject: btrfs: fix an error handling path in btrfs_rename() If new_whiteout_inode() fails, some resources need to be freed. Add the missing goto to the error handling path. Fixes: ab3c5c18e8fa ("btrfs: setup qstr from dentrys using fscrypt helper") Reviewed-by: Sweet Tea Dorminy Signed-off-by: Christophe JAILLET Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 905ea19df125..bfcbe64eb8b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9377,8 +9377,10 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (flags & RENAME_WHITEOUT) { whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir); - if (!whiteout_args.inode) - return -ENOMEM; + if (!whiteout_args.inode) { + ret = -ENOMEM; + goto out_fscrypt_names; + } ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); if (ret) goto out_whiteout_inode; -- cgit From db0a4a7b8e95f9312a59a67cbd5bc589f090e13d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 12 Dec 2022 21:01:43 +0100 Subject: btrfs: fix an error handling path in btrfs_defrag_leaves() All error handling paths end to 'out', except this memory allocation failure. This is spurious. So branch to the error handling path also in this case. It will add a call to: memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); Fixes: 6702ed490ca0 ("Btrfs: Add run time btree defrag, and an ioctl to force btree defrag") Signed-off-by: Christophe JAILLET Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 0a3c261b69c9..d81b764a7644 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -358,8 +358,10 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } level = btrfs_header_level(root->node); -- cgit From c68f72900a12a56c5e9890e6f2ca5119234c9a75 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 Dec 2022 10:42:26 +0000 Subject: btrfs: fix leak of fs devices after removing btrfs module When removing the btrfs module we are not calling btrfs_cleanup_fs_uuids() which results in leaking btrfs_fs_devices structures and other resources. This is a regression recently introduced by a refactoring of the module initialization and exit sequence, which simply removed the call to btrfs_cleanup_fs_uuids() in the exit path, resulting in the leaks. So fix this by calling btrfs_cleanup_fs_uuids() at exit_btrfs_fs(). 
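As a general illustration of the init/exit symmetry this restores, here is a minimal standalone module sketch (hypothetical names, not the btrfs code): whatever global state accumulates while the module is loaded has to be torn down from the exit handler, otherwise every load/unload cycle leaks it.

#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/slab.h>

static LIST_HEAD(demo_entries);            /* global state built up at runtime */
static DEFINE_MUTEX(demo_lock);

struct demo_entry {
	struct list_head list;
};

/* Analogous role to btrfs_cleanup_fs_uuids(): drop everything still tracked. */
static void demo_cleanup_entries(void)
{
	struct demo_entry *e, *tmp;

	mutex_lock(&demo_lock);
	list_for_each_entry_safe(e, tmp, &demo_entries, list) {
		list_del(&e->list);
		kfree(e);
	}
	mutex_unlock(&demo_lock);
}

static int __init demo_init(void)
{
	return 0;
}

static void __exit demo_exit(void)
{
	demo_cleanup_entries();            /* omitting this call leaks on rmmod */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
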
Fixes: 5565b8e0adcd ("btrfs: make module init/exit match their sequence") Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 93f52ee85f6f..d5de18d6517e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2514,6 +2514,7 @@ static __always_inline void btrfs_exit_btrfs_fs(void) static void __exit exit_btrfs_fs(void) { btrfs_exit_btrfs_fs(); + btrfs_cleanup_fs_uuids(); } static int __init init_btrfs_fs(void) -- cgit From f1f0460c0ca97a4a6570f211c81579294a6cc7be Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 13 Dec 2022 16:57:44 -0500 Subject: btrfs: restore BTRFS_SEQ_LAST when looking up qgroup backref lookup In the patch a2c8d27e5ee8 ("btrfs: use a structure to pass arguments to backref walking functions") Filipe converted everybody to using a new context struct to use for backref lookups, but accidentally dropped the BTRFS_SEQ_LAST usage that exists for qgroups. Add this back so we have the previous behavior. Fixes: a2c8d27e5ee8 ("btrfs: use a structure to pass arguments to backref walking functions") Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5c636e00d77d..d275bf24b250 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2787,6 +2787,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) * current root. It's safe inside commit_transaction(). */ ctx.trans = trans; + ctx.time_seq = BTRFS_SEQ_LAST; ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; -- cgit From 0a3212de8ab3e2ce5808c6265855e528d4a6767b Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 14 Dec 2022 11:06:07 +0900 Subject: btrfs: fix trace event name typo for FLUSH_DELAYED_REFS Fix a typo of printing FLUSH_DELAYED_REFS event in flush_space() as FLUSH_ELAYED_REFS. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 0bce0b4ff2fa..6548b5b5aa60 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -98,7 +98,7 @@ struct raid56_bio_trace_info; EM( FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT") \ EM( FLUSH_DELALLOC_FULL, "FLUSH_DELALLOC_FULL") \ EM( FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR") \ - EM( FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS") \ + EM( FLUSH_DELAYED_REFS, "FLUSH_DELAYED_REFS") \ EM( ALLOC_CHUNK, "ALLOC_CHUNK") \ EM( ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE") \ EM( RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS") \ -- cgit From b18cba09e374637a0a3759d856a6bca94c133952 Mon Sep 17 00:00:00 2001 From: minoura makoto Date: Tue, 13 Dec 2022 13:14:31 +0900 Subject: SUNRPC: ensure the matching upcall is in-flight upon downcall Commit 9130b8dbc6ac ("SUNRPC: allow for upcalls for the same uid but different gss service") introduced `auth` argument to __gss_find_upcall(), but in gss_pipe_downcall() it was left as NULL since it (and auth->service) was not (yet) determined. 
When multiple upcalls with the same uid and different service are ongoing, it could happen that __gss_find_upcall(), which returns the first match found in the pipe->in_downcall list, could not find the correct gss_msg corresponding to the downcall we are looking for. Moreover, it might return a msg which is not sent to rpc.gssd yet. We could see mount.nfs process hung in D state with multiple mount.nfs are executed in parallel. The call trace below is of CentOS 7.9 kernel-3.10.0-1160.24.1.el7.x86_64 but we observed the same hang w/ elrepo kernel-ml-6.0.7-1.el7. PID: 71258 TASK: ffff91ebd4be0000 CPU: 36 COMMAND: "mount.nfs" #0 [ffff9203ca3234f8] __schedule at ffffffffa3b8899f #1 [ffff9203ca323580] schedule at ffffffffa3b88eb9 #2 [ffff9203ca323590] gss_cred_init at ffffffffc0355818 [auth_rpcgss] #3 [ffff9203ca323658] rpcauth_lookup_credcache at ffffffffc0421ebc [sunrpc] #4 [ffff9203ca3236d8] gss_lookup_cred at ffffffffc0353633 [auth_rpcgss] #5 [ffff9203ca3236e8] rpcauth_lookupcred at ffffffffc0421581 [sunrpc] #6 [ffff9203ca323740] rpcauth_refreshcred at ffffffffc04223d3 [sunrpc] #7 [ffff9203ca3237a0] call_refresh at ffffffffc04103dc [sunrpc] #8 [ffff9203ca3237b8] __rpc_execute at ffffffffc041e1c9 [sunrpc] #9 [ffff9203ca323820] rpc_execute at ffffffffc0420a48 [sunrpc] The scenario is like this. Let's say there are two upcalls for services A and B, A -> B in pipe->in_downcall, B -> A in pipe->pipe. When rpc.gssd reads pipe to get the upcall msg corresponding to service B from pipe->pipe and then writes the response, in gss_pipe_downcall the msg corresponding to service A will be picked because only uid is used to find the msg and it is before the one for B in pipe->in_downcall. And the process waiting for the msg corresponding to service A will be woken up. Actual scheduing of that process might be after rpc.gssd processes the next msg. In rpc_pipe_generic_upcall it clears msg->errno (for A). The process is scheduled to see gss_msg->ctx == NULL and gss_msg->msg.errno == 0, therefore it cannot break the loop in gss_create_upcall and is never woken up after that. This patch adds a simple check to ensure that a msg which is not sent to rpc.gssd yet is not chosen as the matching upcall upon receiving a downcall. 
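The same idea in a generic, standalone form (hypothetical types, not the SUNRPC structures): when replies are matched against pending requests by a non-unique key, the lookup must also require that the request has actually been handed to the peer, otherwise a reply can be paired with a request the daemon never saw and the request's real owner sleeps forever.

#include <stdbool.h>
#include <stddef.h>

struct pending_req {
	struct pending_req *next;
	unsigned int uid;          /* non-unique key, like the upcall uid */
	unsigned int service;
	bool sent_to_peer;         /* set once the daemon has read the request */
};

/*
 * Mirrors the rpc_msg_is_inflight() test added by the patch: match on uid,
 * but only among requests the peer has already consumed.
 */
struct pending_req *find_matching_req(struct pending_req *head,
				      unsigned int uid)
{
	for (struct pending_req *p = head; p; p = p->next) {
		if (p->uid != uid)
			continue;
		if (!p->sent_to_peer)
			continue;          /* not in flight yet: keep looking */
		return p;
	}
	return NULL;
}
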
Signed-off-by: minoura makoto Signed-off-by: Hiroshi Shimamoto Tested-by: Hiroshi Shimamoto Cc: Trond Myklebust Fixes: 9130b8dbc6ac ("SUNRPC: allow for upcalls for same uid but different gss service") Signed-off-by: Trond Myklebust --- include/linux/sunrpc/rpc_pipe_fs.h | 5 +++++ net/sunrpc/auth_gss/auth_gss.c | 19 +++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index cd188a527d16..3b35b6f6533a 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -92,6 +92,11 @@ extern ssize_t rpc_pipe_generic_upcall(struct file *, struct rpc_pipe_msg *, char __user *, size_t); extern int rpc_queue_upcall(struct rpc_pipe *, struct rpc_pipe_msg *); +/* returns true if the msg is in-flight, i.e., already eaten by the peer */ +static inline bool rpc_msg_is_inflight(const struct rpc_pipe_msg *msg) { + return (msg->copied != 0 && list_empty(&msg->list)); +} + struct rpc_clnt; extern struct dentry *rpc_create_client_dir(struct dentry *, const char *, struct rpc_clnt *); extern int rpc_remove_client_dir(struct rpc_clnt *); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 7bb247c51e2f..2d7b1e03110a 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -302,7 +302,7 @@ __gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; - if (auth && pos->auth->service != auth->service) + if (pos->auth->service != auth->service) continue; refcount_inc(&pos->count); return pos; @@ -686,6 +686,21 @@ out: return err; } +static struct gss_upcall_msg * +gss_find_downcall(struct rpc_pipe *pipe, kuid_t uid) +{ + struct gss_upcall_msg *pos; + list_for_each_entry(pos, &pipe->in_downcall, list) { + if (!uid_eq(pos->uid, uid)) + continue; + if (!rpc_msg_is_inflight(&pos->msg)) + continue; + refcount_inc(&pos->count); + return pos; + } + return NULL; +} + #define MSG_BUF_MAXSIZE 1024 static ssize_t @@ -732,7 +747,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) err = -ENOENT; /* Find a matching upcall */ spin_lock(&pipe->lock); - gss_msg = __gss_find_upcall(pipe, uid, NULL); + gss_msg = gss_find_downcall(pipe, uid); if (gss_msg == NULL) { spin_unlock(&pipe->lock); goto err_put_ctx; -- cgit From 4e699e34f923188175986ad8a74ab99f7034075e Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Fri, 16 Dec 2022 11:05:26 +0800 Subject: drm/plane-helper: Add the missing declaration of drm_atomic_state Add the missing declaration of struct drm_atomic_state to fix the compile error below: error: 'struct drm_atomic_state' declared inside parameter list will not be visible outside of this definition or declaration [-Werror] Signed-off-by: Ma Jun Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Fixes: 8401bd361f59 ("drm/plane-helper: Add a drm_plane_helper_atomic_check() helper") Cc: Javier Martinez Canillas Cc: Thomas Zimmermann Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Cc: # v6.1+ Link: https://patchwork.freedesktop.org/patch/msgid/20221216030526.1335609-1-majun@amd.com --- include/drm/drm_plane_helper.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/drm/drm_plane_helper.h b/include/drm/drm_plane_helper.h index ff83d2621687..3a574e8cd22f 100644 --- a/include/drm/drm_plane_helper.h +++ b/include/drm/drm_plane_helper.h 
@@ -26,6 +26,7 @@ #include +struct drm_atomic_state; struct drm_crtc; struct drm_framebuffer; struct drm_modeset_acquire_ctx; -- cgit From cc074822465d18a2d39e0b3e2b48b6766a568db2 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Sat, 17 Dec 2022 14:21:44 +0800 Subject: bpf: Define sock security related BTF IDs under CONFIG_SECURITY_NETWORK There are warnings reported from resolve_btfids when building vmlinux with CONFIG_SECURITY_NETWORK disabled: WARN: resolve_btfids: unresolved symbol bpf_lsm_sk_free_security WARN: resolve_btfids: unresolved symbol bpf_lsm_sk_alloc_security So only define BTF IDs for these LSM hooks when CONFIG_SECURITY_NETWORK is enabled. Fixes: c0c852dd1876 ("bpf: Do not mark certain LSM hook arguments as trusted") Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221217062144.2507222-1-houtao@huaweicloud.com --- kernel/bpf/bpf_lsm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 9ea42a45da47..a4a41ee3e80b 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -351,8 +351,10 @@ BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) BTF_ID(func, bpf_lsm_bpf_prog_free_security) BTF_ID(func, bpf_lsm_file_alloc_security) BTF_ID(func, bpf_lsm_file_free_security) +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) +#endif /* CONFIG_SECURITY_NETWORK */ BTF_ID(func, bpf_lsm_task_free) BTF_SET_END(untrusted_lsm_hooks) -- cgit From 1c4c0b28b517d778d37900deedfe91088839f07a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Dec 2022 23:15:04 +0200 Subject: wifi: iwlwifi: fw: skip PPAG for JF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For JF RFs we don't support PPAG, but many firmware images lie about it. Always skip support for JF to avoid firmware errors when sending the command. Reported-and-tested-by: Íñigo Huguet Link: https://lore.kernel.org/linux-wireless/CACT4oufQsqHGp6bah2c4+jPn2wG1oZqY=UKa_TmPx=F6Lxng8Q@mail.gmail.com Signed-off-by: Johannes Berg Signed-off-by: Gregory Greenman Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221213225723.2a43415d8990.I9ac210740a45b41f1b2e15274e1daf4284f2808a@changeid --- drivers/net/wireless/intel/iwlwifi/fw/acpi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c index e6d64152c81a..a02e5a67b706 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c @@ -1106,6 +1106,11 @@ int iwl_read_ppag_table(struct iwl_fw_runtime *fwrt, union iwl_ppag_table_cmd *c int i, j, num_sub_bands; s8 *gain; + /* many firmware images for JF lie about this */ + if (CSR_HW_RFID_TYPE(fwrt->trans->hw_rf_id) == + CSR_HW_RFID_TYPE(CSR_HW_RF_ID_TYPE_JF)) + return -EOPNOTSUPP; + if (!fw_has_capa(&fwrt->fw->ucode_capa, IWL_UCODE_TLV_CAPA_SET_PPAG)) { IWL_DEBUG_RADIO(fwrt, "PPAG capability not supported by FW, command not sent.\n"); -- cgit From 37fc9ad1617a303bbfd28870eb25aaa4766e79ab Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 15 Dec 2022 17:31:10 +0100 Subject: wifi: mt76: mt7996: select CONFIG_RELAY Without CONFIG_RELAY, the driver fails to link: ERROR: modpost: "relay_flush" [drivers/net/wireless/mediatek/mt76/mt7996/mt7996e.ko] undefined! ERROR: modpost: "relay_switch_subbuf" [drivers/net/wireless/mediatek/mt76/mt7996/mt7996e.ko] undefined! 
ERROR: modpost: "relay_open" [drivers/net/wireless/mediatek/mt76/mt7996/mt7996e.ko] undefined! ERROR: modpost: "relay_reset" [drivers/net/wireless/mediatek/mt76/mt7996/mt7996e.ko] undefined! ERROR: modpost: "relay_file_operations" [drivers/net/wireless/mediatek/mt76/mt7996/mt7996e.ko] undefined! The same change was done in mt7915 for the corresponding copy of the code. Fixes: 98686cd21624 ("wifi: mt76: mt7996: add driver for MediaTek Wi-Fi 7 (802.11be) devices") See-also: 988845c9361a ("mt76: mt7915: add support for passing chip/firmware debug data to user space") Signed-off-by: Arnd Bergmann Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221215163133.4152299-1-arnd@kernel.org --- drivers/net/wireless/mediatek/mt76/mt7996/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/Kconfig b/drivers/net/wireless/mediatek/mt76/mt7996/Kconfig index 5c5fc569e6d5..79fb47a73c91 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/Kconfig +++ b/drivers/net/wireless/mediatek/mt76/mt7996/Kconfig @@ -2,6 +2,7 @@ config MT7996E tristate "MediaTek MT7996 (PCIe) support" select MT76_CONNAC_LIB + select RELAY depends on MAC80211 depends on PCI help -- cgit From b7dc753fe33a707379e2254317794a4dad6c0fe2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 15 Dec 2022 17:55:42 +0100 Subject: wifi: ath9k: use proper statements in conditionals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A previous cleanup patch accidentally broke some conditional expressions by replacing the safe "do {} while (0)" constructs with empty macros. gcc points this out when extra warnings are enabled: drivers/net/wireless/ath/ath9k/hif_usb.c: In function 'ath9k_skb_queue_complete': drivers/net/wireless/ath/ath9k/hif_usb.c:251:57: error: suggest braces around empty body in an 'else' statement [-Werror=empty-body] 251 | TX_STAT_INC(hif_dev, skb_failed); Make both sets of macros proper expressions again. Fixes: d7fc76039b74 ("ath9k: htc: clean up statistics macros") Signed-off-by: Arnd Bergmann Acked-by: Toke Høiland-Jørgensen Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221215165553.1950307-1-arnd@kernel.org --- drivers/net/wireless/ath/ath9k/htc.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/htc.h b/drivers/net/wireless/ath/ath9k/htc.h index 30f0765fb9fd..237f4ec2cffd 100644 --- a/drivers/net/wireless/ath/ath9k/htc.h +++ b/drivers/net/wireless/ath/ath9k/htc.h @@ -327,9 +327,9 @@ static inline struct ath9k_htc_tx_ctl *HTC_SKB_CB(struct sk_buff *skb) } #ifdef CONFIG_ATH9K_HTC_DEBUGFS -#define __STAT_SAFE(hif_dev, expr) ((hif_dev)->htc_handle->drv_priv ? (expr) : 0) -#define CAB_STAT_INC(priv) ((priv)->debug.tx_stats.cab_queued++) -#define TX_QSTAT_INC(priv, q) ((priv)->debug.tx_stats.queue_stats[q]++) +#define __STAT_SAFE(hif_dev, expr) do { ((hif_dev)->htc_handle->drv_priv ? 
(expr) : 0); } while (0) +#define CAB_STAT_INC(priv) do { ((priv)->debug.tx_stats.cab_queued++); } while (0) +#define TX_QSTAT_INC(priv, q) do { ((priv)->debug.tx_stats.queue_stats[q]++); } while (0) #define TX_STAT_INC(hif_dev, c) \ __STAT_SAFE((hif_dev), (hif_dev)->htc_handle->drv_priv->debug.tx_stats.c++) @@ -378,10 +378,10 @@ void ath9k_htc_get_et_stats(struct ieee80211_hw *hw, struct ethtool_stats *stats, u64 *data); #else -#define TX_STAT_INC(hif_dev, c) -#define TX_STAT_ADD(hif_dev, c, a) -#define RX_STAT_INC(hif_dev, c) -#define RX_STAT_ADD(hif_dev, c, a) +#define TX_STAT_INC(hif_dev, c) do { } while (0) +#define TX_STAT_ADD(hif_dev, c, a) do { } while (0) +#define RX_STAT_INC(hif_dev, c) do { } while (0) +#define RX_STAT_ADD(hif_dev, c, a) do { } while (0) #define CAB_STAT_INC(priv) #define TX_QSTAT_INC(priv, c) -- cgit From a6b9d2fa0024e7e399c26facd0fb466b7396e2b9 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 20 Dec 2022 12:31:29 -0500 Subject: pNFS/filelayout: Fix coalescing test for single DS When there is a single DS no striping constraints need to be placed on the IO. When such constraint is applied then buffered reads don't coalesce to the DS's rsize. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index ad34a33b0737..4974cd18ca46 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -783,6 +783,12 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, return &fl->generic_hdr; } +static bool +filelayout_lseg_is_striped(const struct nfs4_filelayout_segment *flseg) +{ + return flseg->num_fh > 1; +} + /* * filelayout_pg_test(). Called by nfs_can_coalesce_requests() * @@ -803,6 +809,8 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, size = pnfs_generic_pg_test(pgio, prev, req); if (!size) return 0; + else if (!filelayout_lseg_is_striped(FILELAYOUT_LSEG(pgio->pg_lseg))) + return size; /* see if req and prev are in the same stripe */ if (prev) { -- cgit From 560840afc3e63bbe5d9c5ef6b2ecf8f3589adff6 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 14 Dec 2022 15:05:08 -0800 Subject: btrfs: fix resolving backrefs for inline extent followed by prealloc If a file consists of an inline extent followed by a regular or prealloc extent, then a legitimate attempt to resolve a logical address in the non-inline region will result in add_all_parents reading the invalid offset field of the inline extent. If the inline extent item is placed in the leaf eb s.t. 
it is the first item, attempting to access the offset field will not only be meaningless, it will go past the end of the eb and cause this panic: [17.626048] BTRFS warning (device dm-2): bad eb member end: ptr 0x3fd4 start 30834688 member offset 16377 size 8 [17.631693] general protection fault, probably for non-canonical address 0x5088000000000: 0000 [#1] SMP PTI [17.635041] CPU: 2 PID: 1267 Comm: btrfs Not tainted 5.12.0-07246-g75175d5adc74-dirty #199 [17.637969] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [17.641995] RIP: 0010:btrfs_get_64+0xe7/0x110 [17.649890] RSP: 0018:ffffc90001f73a08 EFLAGS: 00010202 [17.651652] RAX: 0000000000000001 RBX: ffff88810c42d000 RCX: 0000000000000000 [17.653921] RDX: 0005088000000000 RSI: ffffc90001f73a0f RDI: 0000000000000001 [17.656174] RBP: 0000000000000ff9 R08: 0000000000000007 R09: c0000000fffeffff [17.658441] R10: ffffc90001f73790 R11: ffffc90001f73788 R12: ffff888106afe918 [17.661070] R13: 0000000000003fd4 R14: 0000000000003f6f R15: cdcdcdcdcdcdcdcd [17.663617] FS: 00007f64e7627d80(0000) GS:ffff888237c80000(0000) knlGS:0000000000000000 [17.666525] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [17.668664] CR2: 000055d4a39152e8 CR3: 000000010c596002 CR4: 0000000000770ee0 [17.671253] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [17.673634] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [17.676034] PKRU: 55555554 [17.677004] Call Trace: [17.677877] add_all_parents+0x276/0x480 [17.679325] find_parent_nodes+0xfae/0x1590 [17.680771] btrfs_find_all_leafs+0x5e/0xa0 [17.682217] iterate_extent_inodes+0xce/0x260 [17.683809] ? btrfs_inode_flags_to_xflags+0x50/0x50 [17.685597] ? iterate_inodes_from_logical+0xa1/0xd0 [17.687404] iterate_inodes_from_logical+0xa1/0xd0 [17.689121] ? btrfs_inode_flags_to_xflags+0x50/0x50 [17.691010] btrfs_ioctl_logical_to_ino+0x131/0x190 [17.692946] btrfs_ioctl+0x104a/0x2f60 [17.694384] ? selinux_file_ioctl+0x182/0x220 [17.695995] ? __x64_sys_ioctl+0x84/0xc0 [17.697394] __x64_sys_ioctl+0x84/0xc0 [17.698697] do_syscall_64+0x33/0x40 [17.700017] entry_SYSCALL_64_after_hwframe+0x44/0xae [17.701753] RIP: 0033:0x7f64e72761b7 [17.709355] RSP: 002b:00007ffefb067f58 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [17.712088] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f64e72761b7 [17.714667] RDX: 00007ffefb067fb0 RSI: 00000000c0389424 RDI: 0000000000000003 [17.717386] RBP: 00007ffefb06d188 R08: 000055d4a390d2b0 R09: 00007f64e7340a60 [17.719938] R10: 0000000000000231 R11: 0000000000000246 R12: 0000000000000001 [17.722383] R13: 0000000000000000 R14: 00000000c0389424 R15: 000055d4a38fd2a0 [17.724839] Modules linked in: Fix the bug by detecting the inline extent item in add_all_parents and skipping to the next extent item. 
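Stated generically, the bug is reading a variant-specific field before checking the variant tag; a minimal standalone sketch of the corrected loop (hypothetical types, not the on-disk btrfs item format):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum extent_type { EXTENT_INLINE, EXTENT_REG, EXTENT_PREALLOC };

struct file_extent {
	enum extent_type type;
	uint64_t disk_bytenr;      /* undefined for EXTENT_INLINE */
	uint64_t offset;           /* undefined for EXTENT_INLINE */
};

void resolve_extents(const struct file_extent *items, size_t n,
		     uint64_t wanted_disk_byte)
{
	for (size_t i = 0; i < n; i++) {
		/* Check the type first; inline extents carry no disk address. */
		if (items[i].type == EXTENT_INLINE)
			continue;
		if (items[i].disk_bytenr != wanted_disk_byte)
			continue;
		printf("match at slot %zu, file offset %llu\n", i,
		       (unsigned long long)items[i].offset);
	}
}
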
CC: stable@vger.kernel.org # 4.9+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/backref.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 21c92c74bf71..46851511b661 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -484,6 +484,7 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, u64 wanted_disk_byte = ref->wanted_disk_byte; u64 count = 0; u64 data_offset; + u8 type; if (level != 0) { eb = path->nodes[level]; @@ -538,6 +539,9 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, continue; } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(eb, fi); + if (type == BTRFS_FILE_EXTENT_INLINE) + goto next; disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); data_offset = btrfs_file_extent_offset(eb, fi); -- cgit From e7fc357ec03ee109da503af0dd31bbf68514e481 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Dec 2022 11:48:00 -0500 Subject: btrfs: scrub: fix uninitialized return value in recover_scrub_rbio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 75b470332965 ("btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap") introduced an uninitialized return variable. This can be caught by gcc 12.1 by -Wmaybe-uninitialized: CC [M] fs/btrfs/raid56.o fs/btrfs/raid56.c: In function ‘scrub_rbio’: fs/btrfs/raid56.c:2801:15: warning: ‘ret’ may be used uninitialized [-Wmaybe-uninitialized] 2801 | ret = recover_scrub_rbio(rbio); | ^~~~~~~~~~~~~~~~~~~~~~~~ fs/btrfs/raid56.c:2649:13: note: ‘ret’ was declared here 2649 | int ret; The warning is disabled by default so we haven't caught that. Due to the bug the raid56 scrub fstests have been failing since the patch was merged, so initialize that. Fixes: 75b470332965 ("btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap") Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2d90a6b5eb00..6a2cf754912d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2646,7 +2646,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) void **pointers = NULL; void **unmap_array = NULL; int sector_nr; - int ret; + int ret = 0; /* * @pointers array stores the pointer for each sector. -- cgit From fee4c19937439693f2420a916169d08e88576e8e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Dec 2022 11:13:33 +0000 Subject: btrfs: fix fscrypt name leak after failure to join log transaction When logging a new name, we don't expect to fail joining a log transaction since we know at least one of the inodes was logged before in the current transaction. However if we fail for some unexpected reason, we end up not freeing the fscrypt name we previously allocated. So fix that by freeing the name in case we failed to join a log transaction. 
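The underlying rule, shown as a small standalone sketch (hypothetical helpers, not the btrfs code): anything allocated before a step that can fail must be released on that step's error exit as well, not only on the success path.

#include <stdlib.h>
#include <string.h>

struct name_buf {
	char *data;
};

int name_setup(struct name_buf *n, const char *src)
{
	n->data = strdup(src);
	return n->data ? 0 : -1;
}

void name_free(struct name_buf *n)
{
	free(n->data);
	n->data = NULL;
}

/* Stand-in for the step that may fail (here it always fails). */
int join_log_transaction(void)
{
	return -1;
}

int log_new_name(const char *src)
{
	struct name_buf fname;
	int ret = name_setup(&fname, src);

	if (ret)
		return ret;

	ret = join_log_transaction();
	if (ret < 0) {
		name_free(&fname);         /* the missing release caused the leak */
		return ret;
	}

	/* ... use fname, then release it on the normal path ... */
	name_free(&fname);
	return 0;
}
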
Fixes: ab3c5c18e8fa ("btrfs: setup qstr from dentrys using fscrypt helper") Reviewed-by: Sweet Tea Dorminy Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a3c43f0b1c95..fb52aa060093 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7459,8 +7459,11 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * not fail, but if it does, it's not serious, just bail out and * mark the log for a full commit. */ - if (WARN_ON_ONCE(ret < 0)) + if (WARN_ON_ONCE(ret < 0)) { + fscrypt_free_filename(&fname); goto out; + } + log_pinned = true; path = btrfs_alloc_path(); -- cgit From 54c3f1a81421f85e60ae2eaae7be3727a09916ee Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 19 Dec 2022 16:47:00 -0800 Subject: bpf: pull before calling skb_postpull_rcsum() Anand hit a BUG() when pulling off headers on egress to a SW tunnel. We get to skb_checksum_help() with an invalid checksum offset (commit d7ea0d9df2a6 ("net: remove two BUG() from skb_checksum_help()") converted those BUGs to WARN_ONs()). He points out oddness in how skb_postpull_rcsum() gets used. Indeed looks like we should pull before "postpull", otherwise the CHECKSUM_PARTIAL fixup from skb_postpull_rcsum() will not be able to do its job: if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; Reported-by: Anand Parthasarathy Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper") Signed-off-by: Jakub Kicinski Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20221220004701.402165-1-kuba@kernel.org Signed-off-by: Martin KaFai Lau --- net/core/filter.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 929358677183..43cc1fe58a2c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3180,15 +3180,18 @@ static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) { + void *old_data; + /* skb_ensure_writable() is not needed here, as we're * already working on an uncloned skb. */ if (unlikely(!pskb_may_pull(skb, off + len))) return -ENOMEM; - skb_postpull_rcsum(skb, skb->data + off, len); - memmove(skb->data + len, skb->data, off); + old_data = skb->data; __skb_pull(skb, len); + skb_postpull_rcsum(skb, old_data + off, len); + memmove(skb->data, old_data, off); return 0; } -- cgit From b5f96cb719d8ba220b565ddd3ba4ac0d8bcfb130 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 13 Dec 2022 09:58:07 +0100 Subject: nvme-pci: fix doorbell buffer value endianness When using shadow doorbells, the event index and the doorbell values are written to host memory. Prior to this patch, the values written would erroneously be written in host endianness. This causes trouble on big-endian platforms. Fix this by adding missing endian conversions. This issue was noticed by Guenter while testing various big-endian platforms under QEMU[1]. A similar fix required for hw/nvme in QEMU is up for review as well[2]. 
[1]: https://lore.kernel.org/qemu-devel/20221209110022.GA3396194@roeck-us.net/ [2]: https://lore.kernel.org/qemu-devel/20221212114409.34972-4-its@irrelevant.dk/ Fixes: f9f38e33389c ("nvme: improve performance for virtual NVMe devices") Reported-by: Guenter Roeck Signed-off-by: Klaus Jensen Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f0f8027644bb..017442858054 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -144,9 +144,9 @@ struct nvme_dev { mempool_t *iod_mempool; /* shadow doorbell buffer support: */ - u32 *dbbuf_dbs; + __le32 *dbbuf_dbs; dma_addr_t dbbuf_dbs_dma_addr; - u32 *dbbuf_eis; + __le32 *dbbuf_eis; dma_addr_t dbbuf_eis_dma_addr; /* host memory buffer support: */ @@ -208,10 +208,10 @@ struct nvme_queue { #define NVMEQ_SQ_CMB 1 #define NVMEQ_DELETE_ERROR 2 #define NVMEQ_POLLED 3 - u32 *dbbuf_sq_db; - u32 *dbbuf_cq_db; - u32 *dbbuf_sq_ei; - u32 *dbbuf_cq_ei; + __le32 *dbbuf_sq_db; + __le32 *dbbuf_cq_db; + __le32 *dbbuf_sq_ei; + __le32 *dbbuf_cq_ei; struct completion delete_done; }; @@ -343,11 +343,11 @@ static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old) } /* Update dbbuf and return true if an MMIO is required */ -static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, - volatile u32 *dbbuf_ei) +static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, + volatile __le32 *dbbuf_ei) { if (dbbuf_db) { - u16 old_value; + u16 old_value, event_idx; /* * Ensure that the queue is written before updating @@ -355,8 +355,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, */ wmb(); - old_value = *dbbuf_db; - *dbbuf_db = value; + old_value = le32_to_cpu(*dbbuf_db); + *dbbuf_db = cpu_to_le32(value); /* * Ensure that the doorbell is updated before reading the event @@ -366,7 +366,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, */ mb(); - if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value)) + event_idx = le32_to_cpu(*dbbuf_ei); + if (!nvme_dbbuf_need_event(event_idx, value, old_value)) return false; } -- cgit From c89a529e823d51dd23c7ec0c047c7a454a428541 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 19 Dec 2022 10:59:06 -0800 Subject: nvme-pci: fix mempool alloc size Convert the max size to bytes to match the units of the divisor that calculates the worst-case number of PRP entries. The result is used to determine how many PRP Lists are required. The code was previously rounding this to 1 list, but we can require 2 in the worst case. In that scenario, the driver would corrupt memory beyond the size provided by the mempool. While unlikely to occur (you'd need a 4MB in exactly 127 phys segments on a queue that doesn't support SGLs), this memory corruption has been observed by kfence. 
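The unit mix-up is easy to see with concrete numbers; a standalone sketch using illustrative constants (the real values live in drivers/nvme/host/pci.c):

#include <stdio.h>
#include <stddef.h>

#define MAX_TRANSFER_KB  4096                 /* assumed max I/O size, in KB  */
#define CTRL_PAGE_SIZE   4096                 /* controller page size, bytes  */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Buggy: a KB count divided by a byte-sized page, as if it were bytes. */
size_t worst_case_prps_buggy(void)
{
	return DIV_ROUND_UP(MAX_TRANSFER_KB + CTRL_PAGE_SIZE, CTRL_PAGE_SIZE);
}

/* Fixed: convert to bytes first, so the worst case is no longer undercounted. */
size_t worst_case_prps_fixed(void)
{
	size_t max_bytes = (size_t)MAX_TRANSFER_KB * 1024 + CTRL_PAGE_SIZE;
	return DIV_ROUND_UP(max_bytes, CTRL_PAGE_SIZE);
}

int main(void)
{
	printf("buggy: %zu PRP entries, fixed: %zu PRP entries\n",
	       worst_case_prps_buggy(), worst_case_prps_fixed());
	return 0;              /* prints "buggy: 2 ... fixed: 1025 ..." */
}

With the KB value treated as bytes the worst case collapses to a couple of entries, which is how the mempool ended up sized for a single PRP list even though, as the commit notes, two can be required.
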
Cc: Jens Axboe Fixes: 943e942e6266f ("nvme-pci: limit max IO size and segments to avoid high order allocations") Signed-off-by: Keith Busch Reviewed-by: Jens Axboe Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 017442858054..6e9d1c7409a9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -381,8 +381,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, */ static int nvme_pci_npages_prp(void) { - unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, - NVME_CTRL_PAGE_SIZE); + unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; + unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } -- cgit From 841734234a28fd5cd0889b84bd4d93a0988fa11e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 19 Dec 2022 13:54:55 -0800 Subject: nvme-pci: fix page size checks The size allocated out of the dma pool is at most NVME_CTRL_PAGE_SIZE, which may be smaller than the PAGE_SIZE. Fixes: c61b82c7b7134 ("nvme-pci: fix PRP pool size") Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6e9d1c7409a9..804b6a6cb43a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -36,7 +36,7 @@ #define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) -#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) /* * These can be higher, but we need to ensure that any command doesn't @@ -383,7 +383,7 @@ static int nvme_pci_npages_prp(void) { unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); + return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); } /* @@ -393,7 +393,7 @@ static int nvme_pci_npages_prp(void) static int nvme_pci_npages_sgl(void) { return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), - PAGE_SIZE); + NVME_CTRL_PAGE_SIZE); } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -709,7 +709,7 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, sge->length = cpu_to_le32(entries * sizeof(*sge)); sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } else { - sge->length = cpu_to_le32(PAGE_SIZE); + sge->length = cpu_to_le32(NVME_CTRL_PAGE_SIZE); sge->type = NVME_SGL_FMT_SEG_DESC << 4; } } -- cgit From 4217c6ac817451d5116687f3cc6286220dc43d49 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 19 Dec 2022 14:01:30 +0000 Subject: drm/panfrost: Fix GEM handle creation ref-counting panfrost_gem_create_with_handle() previously returned a BO but with the only reference being from the handle, which user space could in theory guess and release, causing a use-after-free. Additionally if the call to panfrost_gem_mapping_get() in panfrost_ioctl_create_bo() failed then a(nother) reference on the BO was dropped. The _create_with_handle() is a problematic pattern, so ditch it and instead create the handle in panfrost_ioctl_create_bo(). 
If the call to panfrost_gem_mapping_get() fails then this means that user space has indeed gone behind our back and freed the handle. In which case just return an error code. Reported-by: Rob Clark Fixes: f3ba91228e8e ("drm/panfrost: Add initial panfrost driver") Signed-off-by: Steven Price Reviewed-by: Rob Clark Link: https://patchwork.freedesktop.org/patch/msgid/20221219140130.410578-1-steven.price@arm.com --- drivers/gpu/drm/panfrost/panfrost_drv.c | 27 ++++++++++++++++++--------- drivers/gpu/drm/panfrost/panfrost_gem.c | 16 +--------------- drivers/gpu/drm/panfrost/panfrost_gem.h | 5 +---- 3 files changed, 20 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index 2fa5afe21288..919e6cc04982 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -82,6 +82,7 @@ static int panfrost_ioctl_create_bo(struct drm_device *dev, void *data, struct panfrost_gem_object *bo; struct drm_panfrost_create_bo *args = data; struct panfrost_gem_mapping *mapping; + int ret; if (!args->size || args->pad || (args->flags & ~(PANFROST_BO_NOEXEC | PANFROST_BO_HEAP))) @@ -92,21 +93,29 @@ static int panfrost_ioctl_create_bo(struct drm_device *dev, void *data, !(args->flags & PANFROST_BO_NOEXEC)) return -EINVAL; - bo = panfrost_gem_create_with_handle(file, dev, args->size, args->flags, - &args->handle); + bo = panfrost_gem_create(dev, args->size, args->flags); if (IS_ERR(bo)) return PTR_ERR(bo); + ret = drm_gem_handle_create(file, &bo->base.base, &args->handle); + if (ret) + goto out; + mapping = panfrost_gem_mapping_get(bo, priv); - if (!mapping) { - drm_gem_object_put(&bo->base.base); - return -EINVAL; + if (mapping) { + args->offset = mapping->mmnode.start << PAGE_SHIFT; + panfrost_gem_mapping_put(mapping); + } else { + /* This can only happen if the handle from + * drm_gem_handle_create() has already been guessed and freed + * by user space + */ + ret = -EINVAL; } - args->offset = mapping->mmnode.start << PAGE_SHIFT; - panfrost_gem_mapping_put(mapping); - - return 0; +out: + drm_gem_object_put(&bo->base.base); + return ret; } /** diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.c b/drivers/gpu/drm/panfrost/panfrost_gem.c index 293e799e2fe8..3c812fbd126f 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem.c @@ -235,12 +235,8 @@ struct drm_gem_object *panfrost_gem_create_object(struct drm_device *dev, size_t } struct panfrost_gem_object * -panfrost_gem_create_with_handle(struct drm_file *file_priv, - struct drm_device *dev, size_t size, - u32 flags, - uint32_t *handle) +panfrost_gem_create(struct drm_device *dev, size_t size, u32 flags) { - int ret; struct drm_gem_shmem_object *shmem; struct panfrost_gem_object *bo; @@ -256,16 +252,6 @@ panfrost_gem_create_with_handle(struct drm_file *file_priv, bo->noexec = !!(flags & PANFROST_BO_NOEXEC); bo->is_heap = !!(flags & PANFROST_BO_HEAP); - /* - * Allocate an id of idr table where the obj is registered - * and handle has the id what user can see. - */ - ret = drm_gem_handle_create(file_priv, &shmem->base, handle); - /* drop reference from allocate - handle holds it now. 
*/ - drm_gem_object_put(&shmem->base); - if (ret) - return ERR_PTR(ret); - return bo; } diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.h b/drivers/gpu/drm/panfrost/panfrost_gem.h index 8088d5fd8480..ad2877eeeccd 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.h +++ b/drivers/gpu/drm/panfrost/panfrost_gem.h @@ -69,10 +69,7 @@ panfrost_gem_prime_import_sg_table(struct drm_device *dev, struct sg_table *sgt); struct panfrost_gem_object * -panfrost_gem_create_with_handle(struct drm_file *file_priv, - struct drm_device *dev, size_t size, - u32 flags, - uint32_t *handle); +panfrost_gem_create(struct drm_device *dev, size_t size, u32 flags); int panfrost_gem_open(struct drm_gem_object *obj, struct drm_file *file_priv); void panfrost_gem_close(struct drm_gem_object *obj, -- cgit From 52ea806ad983490b3132a9e526e11a10dc2fd10c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Dec 2022 07:05:09 -0700 Subject: io_uring: finish waiting before flushing overflow entries If we have overflow entries being generated after we've done the initial flush in io_cqring_wait(), then we could be flushing them in the main wait loop as well. If that's done after having added ourselves to the cq_wait waitqueue, then the task state can be != TASK_RUNNING when we enter the overflow flush. Check for the need to overflow flush, and finish our wait cycle first if we have to do so. Reported-and-tested-by: syzbot+cf6ea1d6bb30a4ce10b2@syzkaller.appspotmail.com Link: https://lore.kernel.org/io-uring/000000000000cb143a05f04eee15@google.com/ Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ff2bbac1a10f..ac5d39eeb3d1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -677,16 +677,20 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cq_unlock_post(ctx); } +static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) +{ + /* iopoll syncs against uring_lock, not completion_lock */ + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_lock(&ctx->uring_lock); + __io_cqring_overflow_flush(ctx); + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_unlock(&ctx->uring_lock); +} + static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) { - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { - /* iopoll syncs against uring_lock, not completion_lock */ - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - __io_cqring_overflow_flush(ctx); - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); - } + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) + io_cqring_do_overflow_flush(ctx); } void __io_put_task(struct task_struct *task, int nr) @@ -2549,7 +2553,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, trace_io_uring_cqring_wait(ctx, min_events); do { - io_cqring_overflow_flush(ctx); + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { + finish_wait(&ctx->cq_wait, &iowq.wq); + io_cqring_do_overflow_flush(ctx); + } prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, timeout); -- cgit From 5eb119da94ac5d67a31eaa869621dc6e25eb125e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 15 Dec 2022 15:16:33 +0100 Subject: netfilter: conntrack: fix ipv6 exthdr error check smatch warnings: net/netfilter/nf_conntrack_proto.c:167 nf_confirm() warn: unsigned 'protoff' is never less than zero. 
We need to check if ipv6_skip_exthdr() returned an error, but protoff is unsigned. Use a signed integer for this. Fixes: a70e483460d5 ("netfilter: conntrack: merge ipv4+ipv6 confirm functions") Reported-by: kernel test robot Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 99323fb12d0f..ccef340be575 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -141,6 +141,7 @@ unsigned int nf_confirm(void *priv, struct nf_conn *ct; bool seqadj_needed; __be16 frag_off; + int start; u8 pnum; ct = nf_ct_get(skb, &ctinfo); @@ -163,9 +164,11 @@ unsigned int nf_confirm(void *priv, break; case NFPROTO_IPV6: pnum = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) + start = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); + if (start < 0 || (frag_off & htons(~0x7)) != 0) return nf_conntrack_confirm(skb); + + protoff = start; break; default: return nf_conntrack_confirm(skb); -- cgit From bed4a63ea4ae77cfe5aae004ef87379f0655260a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 19 Dec 2022 20:07:52 +0100 Subject: netfilter: nf_tables: consolidate set description Add the following fields to the set description: - key type - data type - object type - policy - gc_int: garbage collection interval) - timeout: element timeout This prepares for stricter set type checks on updates in a follow up patch. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 12 ++++++++ net/netfilter/nf_tables_api.c | 58 +++++++++++++++++++-------------------- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e69ce23566ea..4957b4775757 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -312,17 +312,29 @@ struct nft_set_iter { /** * struct nft_set_desc - description of set elements * + * @ktype: key type * @klen: key length + * @dtype: data type * @dlen: data length + * @objtype: object type + * @flags: flags * @size: number of set elements + * @policy: set policy + * @gc_int: garbage collector interval * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions */ struct nft_set_desc { + u32 ktype; unsigned int klen; + u32 dtype; unsigned int dlen; + u32 objtype; unsigned int size; + u32 policy; + u32 gc_int; + u64 timeout; u8 field_len[NFT_REG32_COUNT]; u8 field_count; bool expr; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 832b881f7c17..1deecc1a6c00 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3780,8 +3780,7 @@ static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) static const struct nft_set_ops * nft_select_set_ops(const struct nft_ctx *ctx, const struct nlattr * const nla[], - const struct nft_set_desc *desc, - enum nft_set_policies policy) + const struct nft_set_desc *desc) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_set_ops *ops, *bops; @@ -3810,7 +3809,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, if (!ops->estimate(desc, flags, &est)) continue; - switch (policy) { + switch (desc->policy) 
{ case NFT_SET_POL_PERFORMANCE: if (est.lookup < best.lookup) break; @@ -4392,7 +4391,6 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - u32 ktype, dtype, flags, policy, gc_int, objtype; struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; @@ -4405,10 +4403,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, struct nft_set *set; struct nft_ctx ctx; size_t alloc_size; - u64 timeout; char *name; int err, i; u16 udlen; + u32 flags; u64 size; if (nla[NFTA_SET_TABLE] == NULL || @@ -4419,10 +4417,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, memset(&desc, 0, sizeof(desc)); - ktype = NFT_DATA_VALUE; + desc.ktype = NFT_DATA_VALUE; if (nla[NFTA_SET_KEY_TYPE] != NULL) { - ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); - if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) + desc.ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); + if ((desc.ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) return -EINVAL; } @@ -4447,17 +4445,17 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, return -EOPNOTSUPP; } - dtype = 0; + desc.dtype = 0; if (nla[NFTA_SET_DATA_TYPE] != NULL) { if (!(flags & NFT_SET_MAP)) return -EINVAL; - dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); - if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && - dtype != NFT_DATA_VERDICT) + desc.dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); + if ((desc.dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && + desc.dtype != NFT_DATA_VERDICT) return -EINVAL; - if (dtype != NFT_DATA_VERDICT) { + if (desc.dtype != NFT_DATA_VERDICT) { if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); @@ -4472,34 +4470,34 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(flags & NFT_SET_OBJECT)) return -EINVAL; - objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); - if (objtype == NFT_OBJECT_UNSPEC || - objtype > NFT_OBJECT_MAX) + desc.objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); + if (desc.objtype == NFT_OBJECT_UNSPEC || + desc.objtype > NFT_OBJECT_MAX) return -EOPNOTSUPP; } else if (flags & NFT_SET_OBJECT) return -EINVAL; else - objtype = NFT_OBJECT_UNSPEC; + desc.objtype = NFT_OBJECT_UNSPEC; - timeout = 0; + desc.timeout = 0; if (nla[NFTA_SET_TIMEOUT] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout); + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; } - gc_int = 0; + desc.gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); + desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } - policy = NFT_SET_POL_PERFORMANCE; + desc.policy = NFT_SET_POL_PERFORMANCE; if (nla[NFTA_SET_POLICY] != NULL) - policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); if (nla[NFTA_SET_DESC] != NULL) { err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); @@ -4544,7 +4542,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - ops = 
nft_select_set_ops(&ctx, nla, &desc, policy); + ops = nft_select_set_ops(&ctx, nla, &desc); if (IS_ERR(ops)) return PTR_ERR(ops); @@ -4584,18 +4582,18 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, set->table = table; write_pnet(&set->net, net); set->ops = ops; - set->ktype = ktype; + set->ktype = desc.ktype; set->klen = desc.klen; - set->dtype = dtype; - set->objtype = objtype; + set->dtype = desc.dtype; + set->objtype = desc.objtype; set->dlen = desc.dlen; set->flags = flags; set->size = desc.size; - set->policy = policy; + set->policy = desc.policy; set->udlen = udlen; set->udata = udata; - set->timeout = timeout; - set->gc_int = gc_int; + set->timeout = desc.timeout; + set->gc_int = desc.gc_int; set->field_count = desc.field_count; for (i = 0; i < desc.field_count; i++) -- cgit From a8fe4154fa5a1bae590b243ed60f871e5a5e1378 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 19 Dec 2022 18:00:10 +0100 Subject: netfilter: nf_tables: add function to create set stateful expressions Add a helper function to allocate and initialize the stateful expressions that are defined in a set. This patch allows to reuse this code from the set update path, to check that type of the update matches the existing set in the kernel. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 106 +++++++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 38 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1deecc1a6c00..b9b0ae29f5f6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4388,6 +4388,59 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, return err; } +static int nft_set_expr_alloc(struct nft_ctx *ctx, struct nft_set *set, + const struct nlattr * const *nla, + struct nft_expr **exprs, int *num_exprs, + u32 flags) +{ + struct nft_expr *expr; + int err, i; + + if (nla[NFTA_SET_EXPR]) { + expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_EXPR]); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_expr_alloc; + } + exprs[0] = expr; + (*num_exprs)++; + } else if (nla[NFTA_SET_EXPRESSIONS]) { + struct nlattr *tmp; + int left; + + if (!(flags & NFT_SET_EXPR)) { + err = -EINVAL; + goto err_set_expr_alloc; + } + i = 0; + nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { + if (i == NFT_SET_EXPR_MAX) { + err = -E2BIG; + goto err_set_expr_alloc; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_set_expr_alloc; + } + expr = nft_set_elem_expr_alloc(ctx, set, tmp); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_expr_alloc; + } + exprs[i++] = expr; + (*num_exprs)++; + } + } + + return 0; + +err_set_expr_alloc: + for (i = 0; i < *num_exprs; i++) + nft_expr_destroy(ctx, exprs[i]); + + return err; +} + static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { @@ -4395,7 +4448,6 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_set_ops *ops; - struct nft_expr *expr = NULL; struct net *net = info->net; struct nft_set_desc desc; struct nft_table *table; @@ -4403,6 +4455,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, struct nft_set *set; struct nft_ctx ctx; size_t alloc_size; + int num_exprs = 0; char *name; int err, i; u16 udlen; @@ -4529,6 +4582,8 @@ static int 
nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, return PTR_ERR(set); } } else { + struct nft_expr *exprs[NFT_SET_EXPR_MAX] = {}; + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return -EEXIST; @@ -4536,6 +4591,13 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); + if (err < 0) + return err; + + for (i = 0; i < num_exprs; i++) + nft_expr_destroy(&ctx, exprs[i]); + return 0; } @@ -4603,43 +4665,11 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (err < 0) goto err_set_init; - if (nla[NFTA_SET_EXPR]) { - expr = nft_set_elem_expr_alloc(&ctx, set, nla[NFTA_SET_EXPR]); - if (IS_ERR(expr)) { - err = PTR_ERR(expr); - goto err_set_expr_alloc; - } - set->exprs[0] = expr; - set->num_exprs++; - } else if (nla[NFTA_SET_EXPRESSIONS]) { - struct nft_expr *expr; - struct nlattr *tmp; - int left; - - if (!(flags & NFT_SET_EXPR)) { - err = -EINVAL; - goto err_set_expr_alloc; - } - i = 0; - nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { - if (i == NFT_SET_EXPR_MAX) { - err = -E2BIG; - goto err_set_expr_alloc; - } - if (nla_type(tmp) != NFTA_LIST_ELEM) { - err = -EINVAL; - goto err_set_expr_alloc; - } - expr = nft_set_elem_expr_alloc(&ctx, set, tmp); - if (IS_ERR(expr)) { - err = PTR_ERR(expr); - goto err_set_expr_alloc; - } - set->exprs[i++] = expr; - set->num_exprs++; - } - } + err = nft_set_expr_alloc(&ctx, set, nla, set->exprs, &num_exprs, flags); + if (err < 0) + goto err_set_destroy; + set->num_exprs = num_exprs; set->handle = nf_tables_alloc_handle(table); err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); @@ -4653,7 +4683,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, err_set_expr_alloc: for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(&ctx, set->exprs[i]); - +err_set_destroy: ops->destroy(set); err_set_init: kfree(set->name); -- cgit From f6594c372afd5cec8b1e9ee9ea8f8819d59c6fb1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 19 Dec 2022 20:09:00 +0100 Subject: netfilter: nf_tables: perform type checking for existing sets If a ruleset declares a set name that matches an existing set in the kernel, then validate that this declaration really refers to the same set, otherwise bail out with EEXIST. Currently, the kernel reports success when adding a set that already exists in the kernel. This usually results in EINVAL errors at a later stage, when the user adds elements to the set, if the set declaration mismatches the existing set representation in the kernel. Add a new function to check that the set declaration really refers to the same existing set in the kernel. 
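In practice this means a NEWSET request that names an existing set is now re-validated against the kernel object instead of being silently accepted. A rough outline of the update path, condensed from the diff below for illustration only (nft_set_expr_alloc() comes from the previous patch, nft_set_is_same() is added here):

  err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags);
  if (err < 0)
          return err;

  err = 0;
  if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) {
          NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
          err = -EEXIST;          /* same name, different declaration */
  }

  /* the expressions were only needed for the comparison */
  for (i = 0; i < num_exprs; i++)
          nft_expr_destroy(&ctx, exprs[i]);

  return err;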
Fixes: 96518518cc41 ("netfilter: add nftables") Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b9b0ae29f5f6..319887f4d3ef 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4441,6 +4441,34 @@ err_set_expr_alloc: return err; } +static bool nft_set_is_same(const struct nft_set *set, + const struct nft_set_desc *desc, + struct nft_expr *exprs[], u32 num_exprs, u32 flags) +{ + int i; + + if (set->ktype != desc->ktype || + set->dtype != desc->dtype || + set->flags != flags || + set->klen != desc->klen || + set->dlen != desc->dlen || + set->field_count != desc->field_count || + set->num_exprs != num_exprs) + return false; + + for (i = 0; i < desc->field_count; i++) { + if (set->field_len[i] != desc->field_len[i]) + return false; + } + + for (i = 0; i < num_exprs; i++) { + if (set->exprs[i]->ops != exprs[i]->ops) + return false; + } + + return true; +} + static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { @@ -4595,10 +4623,16 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (err < 0) return err; + err = 0; + if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); + err = -EEXIST; + } + for (i = 0; i < num_exprs; i++) nft_expr_destroy(&ctx, exprs[i]); - return 0; + return err; } if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) -- cgit From 23fffb2f09ce1145cbd751801d45ba74acaa6542 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Dec 2022 07:11:33 -0700 Subject: io_uring/cancel: re-grab ctx mutex after finishing wait If we have a signal pending during cancelations, it'll cause the task_work run to return an error. Since we didn't run task_work, the current task is left in TASK_INTERRUPTIBLE state when we need to re-grab the ctx mutex, and the kernel will rightfully complain about that. Move the lock grabbing for the error cases outside the loop to avoid that issue. 
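The underlying rule is the usual one for wait loops: after prepare_to_wait() the task may sit in TASK_INTERRUPTIBLE, and it must not call a blocking primitive such as mutex_lock() until it is back in TASK_RUNNING, which finish_wait() guarantees. A minimal generic sketch of the shape the loop ends up with (ctx_wq, ctx_lock and work_done are placeholder names, not io_uring symbols):

  DEFINE_WAIT(wait);

  prepare_to_wait(&ctx_wq, &wait, TASK_INTERRUPTIBLE);
  if (!work_done)
          schedule();
  finish_wait(&ctx_wq, &wait);    /* task is TASK_RUNNING again */

  mutex_lock(&ctx_lock);          /* blocking is only safe from here on */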
Reported-by: syzbot+7df055631cd1be4586fd@syzkaller.appspotmail.com Link: https://lore.kernel.org/io-uring/0000000000003a14a905f05050b0@google.com/ Signed-off-by: Jens Axboe --- io_uring/cancel.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 2291a53cdabd..b4f5dfacc0c3 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -288,24 +288,23 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); + mutex_unlock(&ctx->uring_lock); if (ret != -EALREADY) break; - mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(ctx); - if (ret < 0) { - mutex_lock(&ctx->uring_lock); + if (ret < 0) break; - } ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS); - mutex_lock(&ctx->uring_lock); if (!ret) { ret = -ETIME; break; } + mutex_lock(&ctx->uring_lock); } while (1); finish_wait(&ctx->cq_wait, &wait); + mutex_lock(&ctx->uring_lock); if (ret == -ENOENT || ret > 0) ret = 0; -- cgit From 70a00e2f1dbae11dc3444444c6bd7555763d8421 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 21 Dec 2022 10:56:53 -0800 Subject: selftests/bpf: Test bpf_skb_adjust_room on CHECKSUM_PARTIAL When the bpf_skb_adjust_room() shrinks the skb such that its csum_start is invalid, the skb->ip_summed should be reset from CHECKSUM_PARTIAL to CHECKSUM_NONE. The commit 54c3f1a81421 ("bpf: pull before calling skb_postpull_rcsum()") fixed it. This patch adds a test to ensure the skb->ip_summed changed from CHECKSUM_PARTIAL to CHECKSUM_NONE after bpf_skb_adjust_room(). Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20221221185653.1589961-1-martin.lau@linux.dev --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + .../selftests/bpf/prog_tests/decap_sanity.c | 85 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/bpf_tracing_net.h | 6 ++ tools/testing/selftests/bpf/progs/decap_sanity.c | 68 +++++++++++++++++ 4 files changed, 160 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/decap_sanity.c create mode 100644 tools/testing/selftests/bpf/progs/decap_sanity.c diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 585fcf73c731..3fc3e54b19aa 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -14,6 +14,7 @@ cgrp_kfunc # JIT does not support calling kernel f cgrp_local_storage # prog_attach unexpected error: -524 (trampoline) core_read_macros # unknown func bpf_probe_read#4 (overlapping) d_path # failed to auto-attach program 'prog_stat': -524 (trampoline) +decap_sanity # JIT does not support calling kernel function (kfunc) deny_namespace # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) dummy_st_ops # test_run unexpected error: -524 (errno 524) (trampoline) fentry_fexit # fentry attach failed: -524 (trampoline) diff --git a/tools/testing/selftests/bpf/prog_tests/decap_sanity.c b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c new file mode 100644 index 000000000000..0b2f73b88c53 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include + +#include "test_progs.h" +#include "network_helpers.h" +#include "decap_sanity.skel.h" + +#define SYS(fmt, ...) 
\ + ({ \ + char cmd[1024]; \ + snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \ + if (!ASSERT_OK(system(cmd), cmd)) \ + goto fail; \ + }) + +#define NS_TEST "decap_sanity_ns" +#define IPV6_IFACE_ADDR "face::1" +#define UDP_TEST_PORT 7777 + +void test_decap_sanity(void) +{ + LIBBPF_OPTS(bpf_tc_hook, qdisc_hook, .attach_point = BPF_TC_EGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_attach); + struct nstoken *nstoken = NULL; + struct decap_sanity *skel; + struct sockaddr_in6 addr; + socklen_t addrlen; + char buf[128] = {}; + int sockfd, err; + + skel = decap_sanity__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open_and_load")) + return; + + SYS("ip netns add %s", NS_TEST); + SYS("ip -net %s -6 addr add %s/128 dev lo nodad", NS_TEST, IPV6_IFACE_ADDR); + SYS("ip -net %s link set dev lo up", NS_TEST); + + nstoken = open_netns(NS_TEST); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto fail; + + qdisc_hook.ifindex = if_nametoindex("lo"); + if (!ASSERT_GT(qdisc_hook.ifindex, 0, "if_nametoindex lo")) + goto fail; + + err = bpf_tc_hook_create(&qdisc_hook); + if (!ASSERT_OK(err, "create qdisc hook")) + goto fail; + + tc_attach.prog_fd = bpf_program__fd(skel->progs.decap_sanity); + err = bpf_tc_attach(&qdisc_hook, &tc_attach); + if (!ASSERT_OK(err, "attach filter")) + goto fail; + + addrlen = sizeof(addr); + err = make_sockaddr(AF_INET6, IPV6_IFACE_ADDR, UDP_TEST_PORT, + (void *)&addr, &addrlen); + if (!ASSERT_OK(err, "make_sockaddr")) + goto fail; + sockfd = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_NEQ(sockfd, -1, "socket")) + goto fail; + err = sendto(sockfd, buf, sizeof(buf), 0, (void *)&addr, addrlen); + close(sockfd); + if (!ASSERT_EQ(err, sizeof(buf), "send")) + goto fail; + + ASSERT_TRUE(skel->bss->init_csum_partial, "init_csum_partial"); + ASSERT_TRUE(skel->bss->final_csum_none, "final_csum_none"); + ASSERT_FALSE(skel->bss->broken_csum_start, "broken_csum_start"); + +fail: + if (nstoken) { + bpf_tc_hook_destroy(&qdisc_hook); + close_netns(nstoken); + } + system("ip netns del " NS_TEST " >& /dev/null"); + decap_sanity__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index b394817126cf..cfed4df490f3 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -50,6 +50,12 @@ #define ICSK_TIME_LOSS_PROBE 5 #define ICSK_TIME_REO_TIMEOUT 6 +#define ETH_HLEN 14 +#define ETH_P_IPV6 0x86DD + +#define CHECKSUM_NONE 0 +#define CHECKSUM_PARTIAL 3 + #define IFNAMSIZ 16 #define RTF_GATEWAY 0x0002 diff --git a/tools/testing/selftests/bpf/progs/decap_sanity.c b/tools/testing/selftests/bpf/progs/decap_sanity.c new file mode 100644 index 000000000000..bd3c657c58a7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/decap_sanity.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include +#include + +#define UDP_TEST_PORT 7777 + +void *bpf_cast_to_kern_ctx(void *) __ksym; +bool init_csum_partial = false; +bool final_csum_none = false; +bool broken_csum_start = false; + +static unsigned int skb_headlen(const struct sk_buff *skb) +{ + return skb->len - skb->data_len; +} + +static unsigned int skb_headroom(const struct sk_buff *skb) +{ + return skb->data - skb->head; +} + +static int skb_checksum_start_offset(const struct sk_buff *skb) +{ + return skb->csum_start - skb_headroom(skb); +} + +SEC("tc") +int decap_sanity(struct __sk_buff *skb) +{ + struct sk_buff *kskb; + struct ipv6hdr ip6h; + struct udphdr udph; + int err; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6)) + return TC_ACT_SHOT; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &ip6h, sizeof(ip6h))) + return TC_ACT_SHOT; + + if (ip6h.nexthdr != IPPROTO_UDP) + return TC_ACT_SHOT; + + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(ip6h), &udph, sizeof(udph))) + return TC_ACT_SHOT; + + if (udph.dest != __bpf_constant_htons(UDP_TEST_PORT)) + return TC_ACT_SHOT; + + kskb = bpf_cast_to_kern_ctx(skb); + init_csum_partial = (kskb->ip_summed == CHECKSUM_PARTIAL); + err = bpf_skb_adjust_room(skb, -(s32)(ETH_HLEN + sizeof(ip6h) + sizeof(udph)), + 1, BPF_F_ADJ_ROOM_FIXED_GSO); + if (err) + return TC_ACT_SHOT; + final_csum_none = (kskb->ip_summed == CHECKSUM_NONE); + if (kskb->ip_summed == CHECKSUM_PARTIAL && + (unsigned int)skb_checksum_start_offset(kskb) >= skb_headlen(kskb)) + broken_csum_start = true; + + return TC_ACT_SHOT; +} + +char __license[] SEC("license") = "GPL"; -- cgit From 53fc61be273a1e76dd5e356f91805dce00ff2d2c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 20 Dec 2022 09:54:48 -0800 Subject: ice: xsk: do not use xdp_return_frame() on tx_buf->raw_buf Previously ice XDP xmit routine was changed in a way that it avoids xdp_buff->xdp_frame conversion as it is simply not needed for handling XDP_TX action and what is more it saves us CPU cycles. This routine is re-used on ZC driver to handle XDP_TX action. Although for XDP_TX on Rx ZC xdp_buff that comes from xsk_buff_pool is converted to xdp_frame, xdp_frame itself is not stored inside ice_tx_buf, we only store raw data pointer. Casting this pointer to xdp_frame and calling against it xdp_return_frame in ice_clean_xdp_tx_buf() results in undefined behavior. To fix this, simply call page_frag_free() on tx_buf->raw_buf. Later intention is to remove the buff->frame conversion in order to simplify the codebase and improve XDP_TX performance on ZC. 
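In short, a buffer has to be released the same way it was produced. For illustration only, an annotated view of the one-line change below:

  /* tx_buf->raw_buf holds only the raw data pointer; no struct xdp_frame
   * was ever stored there, so this cast dereferences bogus frame metadata:
   */
  xdp_return_frame((struct xdp_frame *)tx_buf->raw_buf);   /* undefined behaviour */

  /* the memory is backed by a page fragment, so the matching release is: */
  page_frag_free(tx_buf->raw_buf);                          /* correct */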
Fixes: 126cdfe1007a ("ice: xsk: Improve AF_XDP ZC Tx and use batching API") Reported-and-tested-by: Robin Cowley Signed-off-by: Maciej Fijalkowski Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen Reviewed-by: Piotr Raczynski Link: https://lore.kernel.org/r/20221220175448.693999-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 907055b77af0..7105de6fb344 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -783,7 +783,7 @@ construct_skb: static void ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf) { - xdp_return_frame((struct xdp_frame *)tx_buf->raw_buf); + page_frag_free(tx_buf->raw_buf); xdp_ring->xdp_tx_active--; dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); -- cgit From f2575c8f404911da83f25b688e12afcf4273e640 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 20 Dec 2022 18:18:25 +0100 Subject: net: vrf: determine the dst using the original ifindex for multicast Multicast packets received on an interface bound to a VRF are marked as belonging to the VRF and the skb device is updated to point to the VRF device itself. This was fine even when a route was associated to a device as when performing a fib table lookup 'oif' in fib6_table_lookup (coming from 'skb->dev->ifindex' in ip6_route_input) was set to 0 when FLOWI_FLAG_SKIP_NH_OIF was set. With commit 40867d74c374 ("net: Add l3mdev index to flow struct and avoid oif reset for port devices") this is not longer true and multicast traffic is not received on the original interface. Instead of adding back a similar check in fib6_table_lookup determine the dst using the original ifindex for multicast VRF traffic. To make things consistent across the function do the above for all strict packets, which was the logic before commit 6f12fa775530 ("vrf: mark skb for multicast or link-local as enslaved to VRF"). Note that reverting to this behavior should be fine as the change was about marking packets belonging to the VRF, not about their dst. Fixes: 40867d74c374 ("net: Add l3mdev index to flow struct and avoid oif reset for port devices") Reported-by: Jianlin Shi Signed-off-by: Antoine Tenart Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20221220171825.1172237-1-atenart@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/vrf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 6b5a4d036d15..bdb3a76a352e 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1385,8 +1385,8 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. - * For strict packets with a source LLA, determine the dst using the - * original ifindex. + * For non-loopback strict packets, determine the dst using the original + * ifindex. 
*/ if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; @@ -1395,7 +1395,7 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, if (skb->pkt_type == PACKET_LOOPBACK) skb->pkt_type = PACKET_HOST; - else if (ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) + else vrf_ip6_input_dst(skb, vrf_dev, orig_iif); goto out; -- cgit From 95637d91fefdb94d6e7389222ba9ddab0e9f5abe Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Tue, 20 Dec 2022 16:27:17 -0500 Subject: net: openvswitch: release vport resources on failure A recent commit introducing upcall packet accounting failed to properly release the vport object when the per-cpu stats struct couldn't be allocated. This can cause dangling pointers to dp objects long after they've been released. Cc: wangchuanlei Fixes: 1933ea365aa7 ("net: openvswitch: Add support to count upcall packets") Reported-by: syzbot+8f4e2dcfcb3209ac35f9@syzkaller.appspotmail.com Signed-off-by: Aaron Conole Acked-by: Eelco Chaudron Reviewed-by: Michal Swiatkowski Link: https://lore.kernel.org/r/20221220212717.526780-1-aconole@redhat.com Signed-off-by: Jakub Kicinski --- net/openvswitch/datapath.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9ca721c9fa71..a71795355aec 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1861,7 +1861,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); if (!vport->upcall_stats) { err = -ENOMEM; - goto err_destroy_portids; + goto err_destroy_vport; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, @@ -1876,6 +1876,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; +err_destroy_vport: + ovs_dp_detach_port(vport); err_destroy_portids: kfree(rcu_dereference_raw(dp->upcall_portids)); err_unlock_and_destroy_meters: @@ -2323,7 +2325,7 @@ restart: vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); if (!vport->upcall_stats) { err = -ENOMEM; - goto exit_unlock_free; + goto exit_unlock_free_vport; } err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), @@ -2343,6 +2345,8 @@ restart: ovs_notify(&dp_vport_genl_family, reply, info); return 0; +exit_unlock_free_vport: + ovs_dp_detach_port(vport); exit_unlock_free: ovs_unlock(); kfree_skb(reply); -- cgit From 3d8f2c4269d08f8793e946279dbdf5e972cc4911 Mon Sep 17 00:00:00 2001 From: Ronak Doshi Date: Tue, 20 Dec 2022 12:25:55 -0800 Subject: vmxnet3: correctly report csum_level for encapsulated packet Commit dacce2be3312 ("vmxnet3: add geneve and vxlan tunnel offload support") added support for encapsulation offload. However, that patch did not correctly report the csum_level for encapsulated packets. This patch fixes the issue by reporting the correct csum level for encapsulated packets.
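For context, csum_level is the standard skb convention this change relies on: with CHECKSUM_UNNECESSARY it tells the stack how many additional consecutive checksums, beyond the outermost one, the device has verified. A minimal sketch of the convention (not the driver code; the descriptor test is a placeholder):

  skb->ip_summed = CHECKSUM_UNNECESSARY;
  if (inner_l4_csum_verified)       /* placeholder for the RX descriptor check */
          skb->csum_level = 1;      /* outer and inner checksums both covered */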
Fixes: dacce2be3312 ("vmxnet3: add geneve and vxlan tunnel offload support") Signed-off-by: Ronak Doshi Acked-by: Peng Li Link: https://lore.kernel.org/r/20221220202556.24421-1-doshir@vmware.com Signed-off-by: Jakub Kicinski --- drivers/net/vmxnet3/vmxnet3_drv.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 6f1e560fb15c..56267c327f0b 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1288,6 +1288,10 @@ vmxnet3_rx_csum(struct vmxnet3_adapter *adapter, (le32_to_cpu(gdesc->dword[3]) & VMXNET3_RCD_CSUM_OK) == VMXNET3_RCD_CSUM_OK) { skb->ip_summed = CHECKSUM_UNNECESSARY; + if ((le32_to_cpu(gdesc->dword[0]) & + (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))) { + skb->csum_level = 1; + } WARN_ON_ONCE(!(gdesc->rcd.tcp || gdesc->rcd.udp) && !(le32_to_cpu(gdesc->dword[0]) & (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))); @@ -1297,6 +1301,10 @@ vmxnet3_rx_csum(struct vmxnet3_adapter *adapter, } else if (gdesc->rcd.v6 && (le32_to_cpu(gdesc->dword[3]) & (1 << VMXNET3_RCD_TUC_SHIFT))) { skb->ip_summed = CHECKSUM_UNNECESSARY; + if ((le32_to_cpu(gdesc->dword[0]) & + (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))) { + skb->csum_level = 1; + } WARN_ON_ONCE(!(gdesc->rcd.tcp || gdesc->rcd.udp) && !(le32_to_cpu(gdesc->dword[0]) & (1UL << VMXNET3_RCD_HDR_INNER_SHIFT))); -- cgit From e20aa071cd955aabc15be0ec1e914283592ddef4 Mon Sep 17 00:00:00 2001 From: Yinjun Zhang Date: Tue, 20 Dec 2022 16:21:00 +0100 Subject: nfp: fix schedule in atomic context when sync mc address The callback `.ndo_set_rx_mode` is called in atomic context, sleep is not allowed in the implementation. Now use workqueue mechanism to avoid this issue. Fixes: de6248644966 ("nfp: add support for multicast filter") Signed-off-by: Yinjun Zhang Reviewed-by: Louis Peens Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20221220152100.1042774-1-simon.horman@corigine.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfp_net.h | 7 +++ .../net/ethernet/netronome/nfp/nfp_net_common.c | 61 ++++++++++++++++++++-- 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index da33f09facb9..432d79d691c2 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -617,6 +617,9 @@ struct nfp_net_dp { * @vnic_no_name: For non-port PF vNIC make ndo_get_phys_port_name return * -EOPNOTSUPP to keep backwards compatibility (set by app) * @port: Pointer to nfp_port structure if vNIC is a port + * @mc_lock: Protect mc_addrs list + * @mc_addrs: List of mc addrs to add/del to HW + * @mc_work: Work to update mc addrs * @app_priv: APP private data for this vNIC */ struct nfp_net { @@ -718,6 +721,10 @@ struct nfp_net { struct nfp_port *port; + spinlock_t mc_lock; + struct list_head mc_addrs; + struct work_struct mc_work; + void *app_priv; }; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 09053373288f..18fc9971f1c8 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1334,9 +1334,14 @@ err_unlock: return err; } -static int nfp_net_mc_cfg(struct net_device *netdev, const unsigned char *addr, const u32 cmd) +struct nfp_mc_addr_entry { + u8 addr[ETH_ALEN]; + u32 cmd; + struct list_head list; +}; + +static int nfp_net_mc_cfg(struct nfp_net *nn, 
const unsigned char *addr, const u32 cmd) { - struct nfp_net *nn = netdev_priv(netdev); int ret; ret = nfp_net_mbox_lock(nn, NFP_NET_CFG_MULTICAST_SZ); @@ -1351,6 +1356,25 @@ static int nfp_net_mc_cfg(struct net_device *netdev, const unsigned char *addr, return nfp_net_mbox_reconfig_and_unlock(nn, cmd); } +static int nfp_net_mc_prep(struct nfp_net *nn, const unsigned char *addr, const u32 cmd) +{ + struct nfp_mc_addr_entry *entry; + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + return -ENOMEM; + + ether_addr_copy(entry->addr, addr); + entry->cmd = cmd; + spin_lock_bh(&nn->mc_lock); + list_add_tail(&entry->list, &nn->mc_addrs); + spin_unlock_bh(&nn->mc_lock); + + schedule_work(&nn->mc_work); + + return 0; +} + static int nfp_net_mc_sync(struct net_device *netdev, const unsigned char *addr) { struct nfp_net *nn = netdev_priv(netdev); @@ -1361,12 +1385,35 @@ static int nfp_net_mc_sync(struct net_device *netdev, const unsigned char *addr) return -EINVAL; } - return nfp_net_mc_cfg(netdev, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_ADD); + return nfp_net_mc_prep(nn, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_ADD); } static int nfp_net_mc_unsync(struct net_device *netdev, const unsigned char *addr) { - return nfp_net_mc_cfg(netdev, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_DEL); + struct nfp_net *nn = netdev_priv(netdev); + + return nfp_net_mc_prep(nn, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_DEL); +} + +static void nfp_net_mc_addr_config(struct work_struct *work) +{ + struct nfp_net *nn = container_of(work, struct nfp_net, mc_work); + struct nfp_mc_addr_entry *entry, *tmp; + struct list_head tmp_list; + + INIT_LIST_HEAD(&tmp_list); + + spin_lock_bh(&nn->mc_lock); + list_splice_init(&nn->mc_addrs, &tmp_list); + spin_unlock_bh(&nn->mc_lock); + + list_for_each_entry_safe(entry, tmp, &tmp_list, list) { + if (nfp_net_mc_cfg(nn, entry->addr, entry->cmd)) + nn_err(nn, "Config mc address to HW failed.\n"); + + list_del(&entry->list); + kfree(entry); + } } static void nfp_net_set_rx_mode(struct net_device *netdev) @@ -2633,6 +2680,11 @@ int nfp_net_init(struct nfp_net *nn) if (!nn->dp.netdev) return 0; + + spin_lock_init(&nn->mc_lock); + INIT_LIST_HEAD(&nn->mc_addrs); + INIT_WORK(&nn->mc_work, nfp_net_mc_addr_config); + return register_netdev(nn->dp.netdev); err_clean_mbox: @@ -2652,5 +2704,6 @@ void nfp_net_clean(struct nfp_net *nn) unregister_netdev(nn->dp.netdev); nfp_net_ipsec_clean(nn); nfp_ccm_mbox_clean(nn); + flush_work(&nn->mc_work); nfp_net_reconfig_wait_posted(nn); } -- cgit From 7d803344fdc3e38079fabcf38b1e4cb6f8faa655 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Dec 2022 11:52:14 -0800 Subject: mptcp: fix deadlock in fastopen error path MatM reported a deadlock at fastopening time: INFO: task syz-executor.0:11454 blocked for more than 143 seconds. Tainted: G S 6.1.0-rc5-03226-gdb0157db5153 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
task:syz-executor.0 state:D stack:25104 pid:11454 ppid:424 flags:0x00004006 Call Trace: context_switch kernel/sched/core.c:5191 [inline] __schedule+0x5c2/0x1550 kernel/sched/core.c:6503 schedule+0xe8/0x1c0 kernel/sched/core.c:6579 __lock_sock+0x142/0x260 net/core/sock.c:2896 lock_sock_nested+0xdb/0x100 net/core/sock.c:3466 __mptcp_close_ssk+0x1a3/0x790 net/mptcp/protocol.c:2328 mptcp_destroy_common+0x16a/0x650 net/mptcp/protocol.c:3171 mptcp_disconnect+0xb8/0x450 net/mptcp/protocol.c:3019 __inet_stream_connect+0x897/0xa40 net/ipv4/af_inet.c:720 tcp_sendmsg_fastopen+0x3dd/0x740 net/ipv4/tcp.c:1200 mptcp_sendmsg_fastopen net/mptcp/protocol.c:1682 [inline] mptcp_sendmsg+0x128a/0x1a50 net/mptcp/protocol.c:1721 inet6_sendmsg+0x11f/0x150 net/ipv6/af_inet6.c:663 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xf7/0x190 net/socket.c:734 ____sys_sendmsg+0x336/0x970 net/socket.c:2476 ___sys_sendmsg+0x122/0x1c0 net/socket.c:2530 __sys_sendmmsg+0x18d/0x460 net/socket.c:2616 __do_sys_sendmmsg net/socket.c:2645 [inline] __se_sys_sendmmsg net/socket.c:2642 [inline] __x64_sys_sendmmsg+0x9d/0x110 net/socket.c:2642 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f5920a75e7d RSP: 002b:00007f59201e8028 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 00007f5920bb4f80 RCX: 00007f5920a75e7d RDX: 0000000000000001 RSI: 0000000020002940 RDI: 0000000000000005 RBP: 00007f5920ae7593 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000020004050 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007f5920bb4f80 R15: 00007f59201c8000 In the error path, tcp_sendmsg_fastopen() ends-up calling mptcp_disconnect(), and the latter tries to close each subflow, acquiring the socket lock on each of them. At fastopen time, we have a single subflow, and such subflow socket lock is already held by the called, causing the deadlock. We already track the 'fastopen in progress' status inside the msk socket. Use it to address the issue, making mptcp_disconnect() a no op when invoked from the fastopen (error) path and doing the relevant cleanup after releasing the subflow socket lock. While at the above, rename the fastopen status bit to something more meaningful. 
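Condensed from the backtrace above, the recursion that the renamed bit now breaks looks roughly like this (illustration only):

  /* mptcp_sendmsg_fastopen()
   *   lock_sock(ssk)                      <- the only subflow's lock
   *   tcp_sendmsg_fastopen()
   *     __inet_stream_connect()
   *       mptcp_disconnect()              <- error path
   *         mptcp_destroy_common()
   *           __mptcp_close_ssk()
   *             lock_sock(ssk)            <- same lock again: deadlock
   *
   * hence the early return while the caller still holds the subflow lock:
   */
  if (msk->fastopening)
          return 0;     /* caller redoes the disconnect after release_sock(ssk) */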
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/321 Fixes: fa9e57468aa1 ("mptcp: fix abba deadlock on fastopen") Reported-by: Mat Martineau Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 18 +++++++++++++++--- net/mptcp/protocol.h | 2 +- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f6f93957275b..907b435e2984 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1662,6 +1662,8 @@ static void mptcp_set_nospace(struct sock *sk) set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); } +static int mptcp_disconnect(struct sock *sk, int flags); + static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg, size_t len, int *copied_syn) { @@ -1672,9 +1674,9 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msgh lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; msk->connect_flags = O_NONBLOCK; - msk->is_sendmsg = 1; + msk->fastopening = 1; ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); - msk->is_sendmsg = 0; + msk->fastopening = 0; msg->msg_flags = saved_flags; release_sock(ssk); @@ -1688,6 +1690,8 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msgh */ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; + } else if (ret && ret != -EINPROGRESS) { + mptcp_disconnect(sk, 0); } return ret; @@ -2989,6 +2993,14 @@ static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); + /* We are on the fastopen error path. We can't call straight into the + * subflows cleanup code due to lock nesting (we are already under + * msk->firstsocket lock). Do nothing and leave the cleanup to the + * caller. + */ + if (msk->fastopening) + return 0; + inet_sk_state_store(sk, TCP_CLOSE); mptcp_stop_timer(sk); @@ -3532,7 +3544,7 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* if reaching here via the fastopen/sendmsg path, the caller already * acquired the subflow socket lock, too. */ - if (msk->is_sendmsg) + if (msk->fastopening) err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1); else err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 955fb3d88eb3..f47d3e4018b5 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -295,7 +295,7 @@ struct mptcp_sock { u8 recvmsg_inq:1, cork:1, nodelay:1, - is_sendmsg:1; + fastopening:1; int connect_flags; struct work_struct work; struct sk_buff *ooo_last_skb; -- cgit From fec3adfd754ccc99a7230e8ab9f105b65fb07bcc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Dec 2022 11:52:15 -0800 Subject: mptcp: fix lockdep false positive MattB reported a lockdep splat in the mptcp listener code cleanup: WARNING: possible circular locking dependency detected packetdrill/14278 is trying to acquire lock: ffff888017d868f0 ((work_completion)(&msk->work)){+.+.}-{0:0}, at: __flush_work (kernel/workqueue.c:3069) but task is already holding lock: ffff888017d84130 (sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close (net/mptcp/protocol.c:2973) which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (sk_lock-AF_INET){+.+.}-{0:0}: __lock_acquire (kernel/locking/lockdep.c:5055) lock_acquire (kernel/locking/lockdep.c:466) lock_sock_nested (net/core/sock.c:3463) mptcp_worker (net/mptcp/protocol.c:2614) process_one_work (kernel/workqueue.c:2294) worker_thread (include/linux/list.h:292) kthread (kernel/kthread.c:376) ret_from_fork (arch/x86/entry/entry_64.S:312) -> #0 ((work_completion)(&msk->work)){+.+.}-{0:0}: check_prev_add (kernel/locking/lockdep.c:3098) validate_chain (kernel/locking/lockdep.c:3217) __lock_acquire (kernel/locking/lockdep.c:5055) lock_acquire (kernel/locking/lockdep.c:466) __flush_work (kernel/workqueue.c:3070) __cancel_work_timer (kernel/workqueue.c:3160) mptcp_cancel_work (net/mptcp/protocol.c:2758) mptcp_subflow_queue_clean (net/mptcp/subflow.c:1817) __mptcp_close_ssk (net/mptcp/protocol.c:2363) mptcp_destroy_common (net/mptcp/protocol.c:3170) mptcp_destroy (include/net/sock.h:1495) __mptcp_destroy_sock (net/mptcp/protocol.c:2886) __mptcp_close (net/mptcp/protocol.c:2959) mptcp_close (net/mptcp/protocol.c:2974) inet_release (net/ipv4/af_inet.c:432) __sock_release (net/socket.c:651) sock_close (net/socket.c:1367) __fput (fs/file_table.c:320) task_work_run (kernel/task_work.c:181 (discriminator 1)) exit_to_user_mode_prepare (include/linux/resume_user_mode.h:49) syscall_exit_to_user_mode (kernel/entry/common.c:130) do_syscall_64 (arch/x86/entry/common.c:87) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sk_lock-AF_INET); lock((work_completion)(&msk->work)); lock(sk_lock-AF_INET); lock((work_completion)(&msk->work)); *** DEADLOCK *** The report is actually a false positive, since the only existing lock nesting is the msk socket lock acquired by the mptcp work. cancel_work_sync() is invoked without the relevant socket lock being held, but under a different (the msk listener) socket lock. We could silence the splat adding a per workqueue dynamic lockdep key, but that looks overkill. Instead just tell lockdep the msk socket lock is not held around cancel_work_sync(). 
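Schematically, the annotation wraps only the blocking call and is reserved for a dependency that provably cannot deadlock; a sketch condensed from the hunk in this patch:

  /* Tell lockdep the listener socket lock is conceptually dropped across
   * the work cancellation; the two locks belong to different sockets.
   */
  mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
  mptcp_cancel_work(sk);                  /* may call cancel_work_sync() */
  mutex_acquire(&listener_sk->sk_lock.dep_map,
                SINGLE_DEPTH_NESTING, 0, _RET_IP_);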
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/322 Fixes: 30e51b923e43 ("mptcp: fix unreleased socket in accept queue") Reported-by: Matthieu Baerts Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 2 +- net/mptcp/protocol.h | 2 +- net/mptcp/subflow.c | 19 +++++++++++++++++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 907b435e2984..b7ad030dfe89 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2357,7 +2357,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, /* otherwise tcp will dispose of the ssk and subflow ctx */ if (ssk->sk_state == TCP_LISTEN) { tcp_set_state(ssk, TCP_CLOSE); - mptcp_subflow_queue_clean(ssk); + mptcp_subflow_queue_clean(sk, ssk); inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f47d3e4018b5..a0d1658ce59e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -628,7 +628,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); -void mptcp_subflow_queue_clean(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d1d32a66ae3f..bd387d4b5a38 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1791,7 +1791,7 @@ static void subflow_state_change(struct sock *sk) } } -void mptcp_subflow_queue_clean(struct sock *listener_ssk) +void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) { struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; struct mptcp_sock *msk, *next, *head = NULL; @@ -1840,8 +1840,23 @@ void mptcp_subflow_queue_clean(struct sock *listener_ssk) do_cancel_work = __mptcp_close(sk, 0); release_sock(sk); - if (do_cancel_work) + if (do_cancel_work) { + /* lockdep will report a false positive ABBA deadlock + * between cancel_work_sync and the listener socket. + * The involved locks belong to different sockets WRT + * the existing AB chain. + * Using a per socket key is problematic as key + * deregistration requires process context and must be + * performed at socket disposal time, in atomic + * context. + * Just tell lockdep to consider the listener socket + * released here. + */ + mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); mptcp_cancel_work(sk); + mutex_acquire(&listener_sk->sk_lock.dep_map, + SINGLE_DEPTH_NESTING, 0, _RET_IP_); + } sock_put(sk); } -- cgit From 3659fb5ac29a5e6102bebe494ac789fd47fb78f4 Mon Sep 17 00:00:00 2001 From: Yanjun Zhang Date: Thu, 22 Dec 2022 09:57:21 +0800 Subject: nvme: fix multipath crash caused by flush request when blktrace is enabled The flush request initialized by blk_kick_flush has NULL bio, and it may be dealt with nvme_end_req during io completion. 
When blktrace is enabled, nvme_trace_bio_complete with multipath activated trying to access NULL pointer bio from flush request results in the following crash: [ 2517.831677] BUG: kernel NULL pointer dereference, address: 000000000000001a [ 2517.835213] #PF: supervisor read access in kernel mode [ 2517.838724] #PF: error_code(0x0000) - not-present page [ 2517.842222] PGD 7b2d51067 P4D 0 [ 2517.845684] Oops: 0000 [#1] SMP NOPTI [ 2517.849125] CPU: 2 PID: 732 Comm: kworker/2:1H Kdump: loaded Tainted: G S 5.15.67-0.cl9.x86_64 #1 [ 2517.852723] Hardware name: XFUSION 2288H V6/BC13MBSBC, BIOS 1.13 07/27/2022 [ 2517.856358] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] [ 2517.859993] RIP: 0010:blk_add_trace_bio_complete+0x6/0x30 [ 2517.863628] Code: 1f 44 00 00 48 8b 46 08 31 c9 ba 04 00 10 00 48 8b 80 50 03 00 00 48 8b 78 50 e9 e5 fe ff ff 0f 1f 44 00 00 41 54 49 89 f4 55 <0f> b6 7a 1a 48 89 d5 e8 3e 1c 2b 00 48 89 ee 4c 89 e7 5d 89 c1 ba [ 2517.871269] RSP: 0018:ff7f6a008d9dbcd0 EFLAGS: 00010286 [ 2517.875081] RAX: ff3d5b4be00b1d50 RBX: 0000000002040002 RCX: ff3d5b0a270f2000 [ 2517.878966] RDX: 0000000000000000 RSI: ff3d5b0b021fb9f8 RDI: 0000000000000000 [ 2517.882849] RBP: ff3d5b0b96a6fa00 R08: 0000000000000001 R09: 0000000000000000 [ 2517.886718] R10: 000000000000000c R11: 000000000000000c R12: ff3d5b0b021fb9f8 [ 2517.890575] R13: 0000000002000000 R14: ff3d5b0b021fb1b0 R15: 0000000000000018 [ 2517.894434] FS: 0000000000000000(0000) GS:ff3d5b42bfc80000(0000) knlGS:0000000000000000 [ 2517.898299] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2517.902157] CR2: 000000000000001a CR3: 00000004f023e005 CR4: 0000000000771ee0 [ 2517.906053] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 2517.909930] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 2517.913761] PKRU: 55555554 [ 2517.917558] Call Trace: [ 2517.921294] [ 2517.924982] nvme_complete_rq+0x1c3/0x1e0 [nvme_core] [ 2517.928715] nvme_tcp_recv_pdu+0x4d7/0x540 [nvme_tcp] [ 2517.932442] nvme_tcp_recv_skb+0x4f/0x240 [nvme_tcp] [ 2517.936137] ? nvme_tcp_recv_pdu+0x540/0x540 [nvme_tcp] [ 2517.939830] tcp_read_sock+0x9c/0x260 [ 2517.943486] nvme_tcp_try_recv+0x65/0xa0 [nvme_tcp] [ 2517.947173] nvme_tcp_io_work+0x64/0x90 [nvme_tcp] [ 2517.950834] process_one_work+0x1e8/0x390 [ 2517.954473] worker_thread+0x53/0x3c0 [ 2517.958069] ? process_one_work+0x390/0x390 [ 2517.961655] kthread+0x10c/0x130 [ 2517.965211] ? set_kthread_struct+0x40/0x40 [ 2517.968760] ret_from_fork+0x1f/0x30 [ 2517.972285] To avoid this situation, add a NULL check for req->bio before calling trace_block_bio_complete. Signed-off-by: Yanjun Zhang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6bbb73ef8b25..424c8a467a0c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -893,7 +893,7 @@ static inline void nvme_trace_bio_complete(struct request *req) { struct nvme_ns *ns = req->q->queuedata; - if (req->cmd_flags & REQ_NVME_MPATH) + if ((req->cmd_flags & REQ_NVME_MPATH) && req->bio) trace_block_bio_complete(ns->head->disk->queue, req->bio); } -- cgit From 123b99619cca94bdca0bf7bde9abe28f0a0dfe06 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 19 Dec 2022 20:10:12 +0100 Subject: netfilter: nf_tables: honor set timeout and garbage collection updates Set timeout and garbage collection interval updates are ignored on updates. 
Add transaction to update global set element timeout and garbage collection interval. Fixes: 96518518cc41 ("netfilter: add nftables") Suggested-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 13 +++++++- net/netfilter/nf_tables_api.c | 63 ++++++++++++++++++++++++++++----------- 2 files changed, 57 insertions(+), 19 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4957b4775757..9430128aae99 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -597,7 +597,9 @@ void *nft_set_catchall_gc(const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { - return set->gc_int ? msecs_to_jiffies(set->gc_int) : HZ; + u32 gc_int = READ_ONCE(set->gc_int); + + return gc_int ? msecs_to_jiffies(gc_int) : HZ; } /** @@ -1570,6 +1572,9 @@ struct nft_trans_rule { struct nft_trans_set { struct nft_set *set; u32 set_id; + u32 gc_int; + u64 timeout; + bool update; bool bound; }; @@ -1579,6 +1584,12 @@ struct nft_trans_set { (((struct nft_trans_set *)trans->data)->set_id) #define nft_trans_set_bound(trans) \ (((struct nft_trans_set *)trans->data)->bound) +#define nft_trans_set_update(trans) \ + (((struct nft_trans_set *)trans->data)->update) +#define nft_trans_set_timeout(trans) \ + (((struct nft_trans_set *)trans->data)->timeout) +#define nft_trans_set_gc_int(trans) \ + (((struct nft_trans_set *)trans->data)->gc_int) struct nft_trans_chain { bool update; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 319887f4d3ef..8c09e4d12ac1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -465,8 +465,9 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx) return 0; } -static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, - struct nft_set *set) +static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, + struct nft_set *set, + const struct nft_set_desc *desc) { struct nft_trans *trans; @@ -474,17 +475,28 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, if (trans == NULL) return -ENOMEM; - if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { + if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) { nft_trans_set_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; + if (desc) { + nft_trans_set_update(trans) = true; + nft_trans_set_gc_int(trans) = desc->gc_int; + nft_trans_set_timeout(trans) = desc->timeout; + } nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } +static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, + struct nft_set *set) +{ + return __nft_trans_set_add(ctx, msg_type, set, NULL); +} + static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; @@ -4044,8 +4056,10 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb, static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { - struct nlmsghdr *nlh; + u64 timeout = READ_ONCE(set->timeout); + u32 gc_int = READ_ONCE(set->gc_int); u32 portid = ctx->portid; + struct nlmsghdr *nlh; struct nlattr *nest; u32 seq = ctx->seq; int i; @@ -4081,13 +4095,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype))) goto nla_put_failure; - if (set->timeout && + if (timeout && 
nla_put_be64(skb, NFTA_SET_TIMEOUT, - nf_jiffies64_to_msecs(set->timeout), + nf_jiffies64_to_msecs(timeout), NFTA_SET_PAD)) goto nla_put_failure; - if (set->gc_int && - nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int))) + if (gc_int && + nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(gc_int))) goto nla_put_failure; if (set->policy != NFT_SET_POL_PERFORMANCE) { @@ -4632,7 +4646,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, for (i = 0; i < num_exprs; i++) nft_expr_destroy(&ctx, exprs[i]); - return err; + if (err < 0) + return err; + + return __nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set, &desc); } if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -6070,7 +6087,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } else if (set->flags & NFT_SET_TIMEOUT && !(flags & NFT_SET_ELEM_INTERVAL_END)) { - timeout = set->timeout; + timeout = READ_ONCE(set->timeout); } expiration = 0; @@ -6171,7 +6188,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) goto err_parse_key_end; - if (timeout != set->timeout) { + if (timeout != READ_ONCE(set->timeout)) { err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); if (err < 0) goto err_parse_key_end; @@ -9093,14 +9110,20 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_NEWSET: - nft_clear(net, nft_trans_set(trans)); - /* This avoids hitting -EBUSY when deleting the table - * from the transaction. - */ - if (nft_set_is_anonymous(nft_trans_set(trans)) && - !list_empty(&nft_trans_set(trans)->bindings)) - trans->ctx.table->use--; + if (nft_trans_set_update(trans)) { + struct nft_set *set = nft_trans_set(trans); + WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans)); + WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans)); + } else { + nft_clear(net, nft_trans_set(trans)); + /* This avoids hitting -EBUSY when deleting the table + * from the transaction. + */ + if (nft_set_is_anonymous(nft_trans_set(trans)) && + !list_empty(&nft_trans_set(trans)->bindings)) + trans->ctx.table->use--; + } nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_NEWSET, GFP_KERNEL); nft_trans_destroy(trans); @@ -9322,6 +9345,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: + if (nft_trans_set_update(trans)) { + nft_trans_destroy(trans); + break; + } trans->ctx.table->use--; if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); -- cgit From 42c7ded0eeacd2ba5db599205c71c279dc715de7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Dec 2022 13:08:31 +0000 Subject: bonding: fix lockdep splat in bond_miimon_commit() bond_miimon_commit() is run while RTNL is held, not RCU. WARNING: suspicious RCU usage 6.1.0-syzkaller-09671-g89529367293c #0 Not tainted ----------------------------- drivers/net/bonding/bond_main.c:2704 suspicious rcu_dereference_check() usage! 
Fixes: e95cc44763a4 ("bonding: do failover when high prio link up") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Hangbin Liu Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Link: https://lore.kernel.org/r/20221220130831.1480888-1-edumazet@google.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b4c65783960a..0363ce597661 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2654,10 +2654,12 @@ static void bond_miimon_link_change(struct bonding *bond, static void bond_miimon_commit(struct bonding *bond) { - struct slave *slave, *primary; + struct slave *slave, *primary, *active; bool do_failover = false; struct list_head *iter; + ASSERT_RTNL(); + bond_for_each_slave(bond, slave, iter) { switch (slave->link_new_state) { case BOND_LINK_NOCHANGE: @@ -2700,8 +2702,8 @@ static void bond_miimon_commit(struct bonding *bond) bond_miimon_link_change(bond, slave, BOND_LINK_UP); - if (!rcu_access_pointer(bond->curr_active_slave) || slave == primary || - slave->prio > rcu_dereference(bond->curr_active_slave)->prio) + active = rtnl_dereference(bond->curr_active_slave); + if (!active || slave == primary || slave->prio > active->prio) do_failover = true; continue; -- cgit From d717f9474e3fb7e6bd3e43ca16e131f04320ed6f Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Wed, 21 Dec 2022 10:33:15 +0100 Subject: net: lan966x: Fix configuration of the PCS When the PCS was taken out of reset, we were by mistake also changing the speed to 100 Mbit. But in case the link was going down, the link up routine was setting the link speed correctly. If the link was not going down, then the speed was forced to run at 100 even if the speed was something else. On lan966x, to set the link speed to 1G or 2.5G, a value of 1 needs to be written in DEV_CLOCK_CFG_LINK_SPEED. This is similar to the procedure in lan966x_port_init. The issue was reproduced using a 1000base-x SFP module with the commands: ip link set dev eth2 up ip link addr add 10.97.10.2/24 dev eth2 ethtool -s eth2 speed 1000 autoneg off Fixes: d28d6d2e37d1 ("net: lan966x: add port module support") Signed-off-by: Horatiu Vultur Reviewed-by: Piotr Raczynski Link: https://lore.kernel.org/r/20221221093315.939133-1-horatiu.vultur@microchip.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microchip/lan966x/lan966x_port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_port.c b/drivers/net/ethernet/microchip/lan966x/lan966x_port.c index 1a61c6cdb077..0050fcb988b7 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_port.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_port.c @@ -381,7 +381,7 @@ int lan966x_port_pcs_set(struct lan966x_port *port, } /* Take PCS out of reset */ - lan_rmw(DEV_CLOCK_CFG_LINK_SPEED_SET(2) | + lan_rmw(DEV_CLOCK_CFG_LINK_SPEED_SET(LAN966X_SPEED_1000) | DEV_CLOCK_CFG_PCS_RX_RST_SET(0) | DEV_CLOCK_CFG_PCS_TX_RST_SET(0), DEV_CLOCK_CFG_LINK_SPEED | -- cgit From fa349e396e4886d742fd6501c599ec627ef1353b Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Tue, 20 Dec 2022 12:59:03 -0600 Subject: veth: Fix race with AF_XDP exposing old or uninitialized descriptors When AF_XDP is used on a veth interface the RX ring is updated in two steps.
veth_xdp_rcv() removes packet descriptors from the FILL ring fills them and places them in the RX ring updating the cached_prod pointer. Later xdp_do_flush() syncs the RX ring prod pointer with the cached_prod pointer allowing user-space to see the recently filled in descriptors. The rings are intended to be SPSC, however the existing order in veth_poll allows the xdp_do_flush() to run concurrently with another CPU creating a race condition that allows user-space to see old or uninitialized descriptors in the RX ring. This bug has been observed in production systems. To summarize, we are expecting this ordering: CPU 0 __xsk_rcv_zc() CPU 0 __xsk_map_flush() CPU 2 __xsk_rcv_zc() CPU 2 __xsk_map_flush() But we are seeing this order: CPU 0 __xsk_rcv_zc() CPU 2 __xsk_rcv_zc() CPU 0 __xsk_map_flush() CPU 2 __xsk_map_flush() This occurs because we rely on NAPI to ensure that only one napi_poll handler is running at a time for the given veth receive queue. napi_schedule_prep() will prevent multiple instances from getting scheduled. However calling napi_complete_done() signals that this napi_poll is complete and allows subsequent calls to napi_schedule_prep() and __napi_schedule() to succeed in scheduling a concurrent napi_poll before the xdp_do_flush() has been called. For the veth driver a concurrent call to napi_schedule_prep() and __napi_schedule() can occur on a different CPU because the veth xmit path can additionally schedule a napi_poll creating the race. The fix as suggested by Magnus Karlsson, is to simply move the xdp_do_flush() call before napi_complete_done(). This syncs the producer ring pointers before another instance of napi_poll can be scheduled on another CPU. It will also slightly improve performance by moving the flush closer to when the descriptors were placed in the RX ring. Fixes: d1396004dd86 ("veth: Add XDP TX and REDIRECT") Suggested-by: Magnus Karlsson Signed-off-by: Shawn Bohrer Link: https://lore.kernel.org/r/20221220185903.1105011-1-sbohrer@cloudflare.com Signed-off-by: Paolo Abeni --- drivers/net/veth.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index ac7c0653695f..dfc7d87fad59 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -974,6 +974,9 @@ static int veth_poll(struct napi_struct *napi, int budget) xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); + if (stats.xdp_redirect > 0) + xdp_do_flush(); + if (done < budget && napi_complete_done(napi, done)) { /* Write rx_notify_masked before reading ptr_ring */ smp_store_mb(rq->rx_notify_masked, false); @@ -987,8 +990,6 @@ static int veth_poll(struct napi_struct *napi, int budget) if (stats.xdp_tx > 0) veth_xdp_flush(rq, &bq); - if (stats.xdp_redirect > 0) - xdp_do_flush(); xdp_clear_return_frame_no_direct(); return done; -- cgit From 789e1e10f214c00ca18fc6610824c5b9876ba5f2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 22 Dec 2022 09:51:30 -0500 Subject: nfsd: shut down the NFSv4 state objects before the filecache Currently, we shut down the filecache before trying to clean up the stateids that depend on it. This leads to the kernel trying to free an nfsd_file twice, and a refcount overput on the nf_mark. Change the shutdown procedure to tear down all of the stateids prior to shutting down the filecache. 
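The shutdown-ordering constraint described above (dependents must drop their references before the cache they point into is torn down) can be illustrated outside the kernel. Below is a minimal userspace C sketch with hypothetical names, not nfsd code: one object lives in a "file cache" and is also pinned by a "state" object, and tearing the state down first lets the cache shutdown drop the final reference exactly once.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins, not nfsd structures. */
struct cached_file {
	int refcount;
	const char *name;
};

static void file_put(struct cached_file *f)
{
	if (--f->refcount == 0) {
		printf("freeing %s\n", f->name);
		free(f);
	}
}

int main(void)
{
	struct cached_file *f = malloc(sizeof(*f));

	f->refcount = 2;	/* one reference held by the cache, one by the state */
	f->name = "nf";

	/*
	 * Correct order: tear the state down first so it drops its reference,
	 * then shut the cache down, which drops the last reference and frees
	 * the object exactly once.  Doing it the other way around, with a
	 * cache shutdown that forcibly releases its entries, would leave the
	 * state holding a stale pointer, and its later teardown would put a
	 * reference that no longer exists: the double free / refcount
	 * over-put the commit avoids by reordering nfsd_shutdown_net().
	 */
	file_put(f);	/* "state" shutdown drops its reference */
	file_put(f);	/* "cache" shutdown drops the final reference */
	return 0;
}
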
Reported-and-tested-by: Wang Yugui Signed-off-by: Jeff Layton Fixes: 5e113224c17e ("nfsd: nfsd_file cache entries should be per net namespace") Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 56fba1cba3af..325d3d3f1211 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -453,8 +453,8 @@ static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - nfsd_file_cache_shutdown_net(net); nfs4_state_shutdown_net(net); + nfsd_file_cache_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; -- cgit From 00a734104af7d878f1252d49eff9298785c6cbdc Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 8 Dec 2022 10:42:05 -0600 Subject: ACPI: video: Allow GPU drivers to report no panels The current logic for ACPI backlight detection will create a backlight device if no native or vendor driver has created one within 8 seconds after the system has booted, provided the ACPI tables include backlight control methods. If the GPU drivers have loaded, they may be able to report whether any LCD panels were found. Allow using this information to factor in whether to enable the fallback logic for making an acpi_video0 backlight device. Suggested-by: Hans de Goede Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_video.c | 11 +++++++++++ include/acpi/video.h | 2 ++ 2 files changed, 13 insertions(+) diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index 30d8fd03fec7..75dc37affff2 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -2176,6 +2176,17 @@ static bool should_check_lcd_flag(void) return false; } +/* + * At least one graphics driver has reported that no LCD is connected + * via the native interface. cancel the registration for fallback acpi_video0. + * If another driver still deems this necessary, it can explicitly register it. + */ +void acpi_video_report_nolcd(void) +{ + cancel_delayed_work(&video_bus_register_backlight_work); +} +EXPORT_SYMBOL(acpi_video_report_nolcd); + int acpi_video_register(void) { int ret = 0; diff --git a/include/acpi/video.h b/include/acpi/video.h index a275c35e5249..8ed9bec03e53 100644 --- a/include/acpi/video.h +++ b/include/acpi/video.h @@ -53,6 +53,7 @@ enum acpi_backlight_type { }; #if IS_ENABLED(CONFIG_ACPI_VIDEO) +extern void acpi_video_report_nolcd(void); extern int acpi_video_register(void); extern void acpi_video_unregister(void); extern void acpi_video_register_backlight(void); @@ -69,6 +70,7 @@ extern int acpi_video_get_levels(struct acpi_device *device, struct acpi_video_device_brightness **dev_br, int *pmax_level); #else +static inline void acpi_video_report_nolcd(void) { return; }; static inline int acpi_video_register(void) { return -ENODEV; } static inline void acpi_video_unregister(void) { return; } static inline void acpi_video_register_backlight(void) { return; } -- cgit From c573e240609ff781a0246c0c8c8351abd0475287 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 8 Dec 2022 10:42:06 -0600 Subject: drm/amd/display: Report to ACPI video if no panels were found On desktop APUs amdgpu doesn't create a native backlight device as no eDP panels are found. However, if the BIOS has reported backlight control methods in the ACPI tables, then an acpi_video0 backlight device will be made 8 seconds after boot.
This has manifested as a power slider on a number of desktop APUs ranging from Ryzen 5000 through Ryzen 7000 from various motherboard manufacturers. To avoid this, report to the acpi video detection that the system does not have any panel connected in the native driver. Link: https://bugzilla.redhat.com/show_bug.cgi?id=1783786 Reported-by: Hans de Goede Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 77277d90b6e2..a7eb13902af8 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -4360,6 +4360,10 @@ static int amdgpu_dm_initialize_drm_device(struct amdgpu_device *adev) amdgpu_set_panel_orientation(&aconnector->base); } + /* If we didn't find a panel, notify the acpi video detection */ + if (dm->adev->flags & AMD_IS_APU && dm->num_of_edps == 0) + acpi_video_report_nolcd(); + /* Software is initialized. Now we can register interrupt handlers. */ switch (adev->asic_type) { #if defined(CONFIG_DRM_AMD_DC_SI) -- cgit From 5aa9d943e9b6bf6e6023645cbe7ce7d5ed84baf4 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 8 Dec 2022 10:42:07 -0600 Subject: ACPI: video: Don't enable fallback path for creating ACPI backlight by default The ACPI video detection code has a module parameter `register_backlight_delay` which is currently configured to 8 seconds. This means that if after 8 seconds of booting no native driver has created a backlight device then the code will attempt to make an ACPI video backlight device. This was intended as a safety mechanism with the backlight overhaul that occurred in kernel 6.1, but as it doesn't appear necessary, set it to be disabled by default. Suggested-by: Hans de Goede Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_video.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index 75dc37affff2..97b711e57bff 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -70,11 +70,7 @@ module_param(device_id_scheme, bool, 0444); static int only_lcd = -1; module_param(only_lcd, int, 0444); -/* - * Display probing is known to take up to 5 seconds, so delay the fallback - * backlight registration by 5 seconds + 3 seconds for some extra margin. - */ -static int register_backlight_delay = 8; +static int register_backlight_delay; module_param(register_backlight_delay, int, 0444); MODULE_PARM_DESC(register_backlight_delay, "Delay in seconds before doing fallback (non GPU driver triggered) " -- cgit From 7592b79ba4a91350b38469e05238308bcfe1019b Mon Sep 17 00:00:00 2001 From: Erik Schumacher Date: Sun, 11 Dec 2022 14:33:22 +0100 Subject: ACPI: resource: do IRQ override on XMG Core 15 The Schenker XMG CORE 15 (M22) is Ryzen-6 based and needs IRQ overriding for the keyboard to work. Adding an entry for this laptop to the override_table makes the internal keyboard functional again. Signed-off-by: Erik Schumacher Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/resource.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index f27914aedbd5..037d1aa10357 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -446,6 +446,17 @@ static const struct dmi_system_id lenovo_82ra[] = { { } }; +static const struct dmi_system_id schenker_gm_rg[] = { + { + .ident = "XMG CORE 15 (M22)", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "SchenkerTechnologiesGmbH"), + DMI_MATCH(DMI_BOARD_NAME, "GMxRGxx"), + }, + }, + { } +}; + struct irq_override_cmp { const struct dmi_system_id *system; unsigned char irq; @@ -460,6 +471,7 @@ static const struct irq_override_cmp override_table[] = { { asus_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, false }, { lenovo_82ra, 6, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, true }, { lenovo_82ra, 10, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, true }, + { schenker_gm_rg, 1, ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_LOW, 1, true }, }; static bool acpi_dev_irq_override(u32 gsi, u8 triggering, u8 polarity, -- cgit From f3cb9b740869712d448edf3b9ef5952b847caf8b Mon Sep 17 00:00:00 2001 From: Adrian Freund Date: Tue, 13 Dec 2022 21:13:11 +0100 Subject: ACPI: resource: do IRQ override on Lenovo 14ALC7 Commit bfcdf58380b1 ("ACPI: resource: do IRQ override on LENOVO IdeaPad") added an override for Lenovo IdeaPad 5 16ALC7. The 14ALC7 variant also suffers from a broken touchscreen and trackpad. Fixes: 9946e39fe8d0 ("ACPI: resource: skip IRQ override on AMD Zen platforms") Link: https://bugzilla.kernel.org/show_bug.cgi?id=216804 Signed-off-by: Adrian Freund Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 037d1aa10357..d0c92422e206 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -435,7 +435,14 @@ static const struct dmi_system_id asus_laptop[] = { { } }; -static const struct dmi_system_id lenovo_82ra[] = { +static const struct dmi_system_id lenovo_laptop[] = { + { + .ident = "LENOVO IdeaPad Flex 5 14ALC7", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "82R9"), + }, + }, { .ident = "LENOVO IdeaPad Flex 5 16ALC7", .matches = { @@ -469,8 +476,8 @@ struct irq_override_cmp { static const struct irq_override_cmp override_table[] = { { medion_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, false }, { asus_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, false }, - { lenovo_82ra, 6, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, true }, - { lenovo_82ra, 10, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, true }, + { lenovo_laptop, 6, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, true }, + { lenovo_laptop, 10, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, true }, { schenker_gm_rg, 1, ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_LOW, 1, true }, }; -- cgit From 7203481fd12b1257938519efb2460ea02b9236ee Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 15 Dec 2022 10:44:43 +0100 Subject: ACPI: resource: Add Asus ExpertBook B2502 to Asus quirks The Asus ExpertBook B2502 has the same keyboard issue as Asus Vivobook K3402ZA/K3502ZA. The kernel overrides IRQ 1 to Edge_High when it should be Active_Low. This patch adds the ExpertBook B2502 model to the existing quirk list of Asus laptops with this issue. 
Fixes: b5f9223a105d ("ACPI: resource: Skip IRQ override on Asus Vivobook S5602ZA") Link: https://bugzilla.redhat.com/show_bug.cgi?id=2142574 Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index d0c92422e206..16dcd31d124f 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -432,6 +432,13 @@ static const struct dmi_system_id asus_laptop[] = { DMI_MATCH(DMI_BOARD_NAME, "S5602ZA"), }, }, + { + .ident = "Asus ExpertBook B2502", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_BOARD_NAME, "B2502CBA"), + }, + }, { } }; -- cgit From 3cf3b7f012f3ea8bdc56196e367cf07c10424855 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 15 Dec 2022 10:41:38 +0100 Subject: ACPI: video: Fix Apple GMUX backlight detection The apple-gmux driver only binds to old GMUX devices which have an IORESOURCE_IO resource (using inb()/outb()) rather than memory-mapped IO (IORESOURCE_MEM). T2 MacBooks use the new style GMUX devices (with IORESOURCE_MEM access), so these are not supported by the apple-gmux driver. This is not a problem since they have working ACPI video backlight support. But the apple_gmux_present() helper only checks if an ACPI device with the "APP000B" HID is present, causing acpi_video_get_backlight_type() to return acpi_backlight_apple_gmux, disabling the acpi_video backlight device. Add a new apple_gmux_backlight_present() helper which checks that the "APP000B" device actually is an old GMUX device with an IORESOURCE_IO resource. This fixes the acpi_video0 backlight no longer registering on T2 MacBooks. Note that people are working to add support for the new style GMUX to Linux: https://github.com/kekrby/linux-t2/commits/wip/hybrid-graphics Once this lands, this patch should be reverted so that acpi_video_get_backlight_type() also prefers the gmux on new style GMUX MacBooks, but for now this is necessary to avoid regressing backlight control on T2 Macs. Fixes: 21245df307cb ("ACPI: video: Add Apple GMUX brightness control detection") Reported-and-tested-by: Aditya Garg Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/video_detect.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index a934bbc9dd37..1b78c7434492 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -105,6 +106,26 @@ static bool nvidia_wmi_ec_supported(void) } #endif +static bool apple_gmux_backlight_present(void) +{ + struct acpi_device *adev; + struct device *dev; + + adev = acpi_dev_get_first_match_dev(GMUX_ACPI_HID, NULL, -1); + if (!adev) + return false; + + dev = acpi_get_first_physical_node(adev); + if (!dev) + return false; + + /* + * drivers/platform/x86/apple-gmux.c only supports old style + * Apple GMUX with an IO-resource.
+ */ + return pnp_get_resource(to_pnp_dev(dev), IORESOURCE_IO, 0) != NULL; +} + /* Force to use vendor driver when the ACPI device is known to be * buggy */ static int video_detect_force_vendor(const struct dmi_system_id *d) @@ -767,7 +788,7 @@ static enum acpi_backlight_type __acpi_video_get_backlight_type(bool native) if (nvidia_wmi_ec_present) return acpi_backlight_nvidia_wmi_ec; - if (apple_gmux_present()) + if (apple_gmux_backlight_present()) return acpi_backlight_apple_gmux; /* Use ACPI video if available, except when native should be preferred. */ -- cgit From 3ea45390e9c0d35805ef8357ace55594fd4233d0 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 15 Dec 2022 13:16:15 -0600 Subject: ACPI: x86: s2idle: Force AMD GUID/_REV 2 on HP Elitebook 865 HP Elitebook 865 supports both the AMD GUID w/ _REV 2 and Microsoft GUID with _REV 0. Both have very similar code but the AMD GUID has a special workaround that is specific to a problem with spurious wakeups on systems with Qualcomm WLAN. This is believed to be a bug in the Qualcomm WLAN F/W (it doesn't affect any other WLAN H/W). If this WLAN firmware is fixed this quirk can be dropped. Cc: stable@vger.kernel.org # 6.1 Signed-off-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- drivers/acpi/x86/s2idle.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c index 5350c73564b6..422415cb14f4 100644 --- a/drivers/acpi/x86/s2idle.c +++ b/drivers/acpi/x86/s2idle.c @@ -401,6 +401,13 @@ static const struct acpi_device_id amd_hid_ids[] = { {} }; +static int lps0_prefer_amd(const struct dmi_system_id *id) +{ + pr_debug("Using AMD GUID w/ _REV 2.\n"); + rev_id = 2; + return 0; +} + static int lps0_prefer_microsoft(const struct dmi_system_id *id) { pr_debug("Preferring Microsoft GUID.\n"); @@ -462,6 +469,19 @@ static const struct dmi_system_id s2idle_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "ROG Flow X16 GV601"), }, }, + { + /* + * AMD Rembrandt based HP EliteBook 835/845/865 G9 + * Contains specialized AML in AMD/_REV 2 path to avoid + * triggering a bug in Qualcomm WLAN firmware. This may be + * removed in the future if that firmware is fixed. + */ + .callback = lps0_prefer_amd, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "HP"), + DMI_MATCH(DMI_BOARD_NAME, "8990"), + }, + }, {} }; -- cgit From e555c85792bd5f9828a2fd2ca9761f70efb1c77b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 15 Dec 2022 13:16:16 -0600 Subject: ACPI: x86: s2idle: Stop using AMD specific codepath for Rembrandt+ After we introduced a module parameter and quirk infrastructure for picking the Microsoft GUID over the SOC vendor GUID we discovered that lots and lots of systems are getting this wrong. The table continues to grow, and is becoming unwieldy. We don't really have any benefit to forcing vendors to populate the AMD GUID. This is just extra work, and more and more vendors seem to mess it up. As the Microsoft GUID is used by Windows as well, it's very likely that it won't be messed up like this. So drop all the quirks forcing it and the Rembrandt behavior. This means that Cezanne or later effectively only run the Microsoft GUID codepath with the exception of HP Elitebook 8*5 G9. 
Fixes: fd894f05cf30 ("ACPI: x86: s2idle: If a new AMD _HID is missing assume Rembrandt") Cc: stable@vger.kernel.org # 6.1 Reported-by: Benjamin Cheng Reported-by: bilkow@tutanota.com Reported-by: Paul Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2292 Link: https://bugzilla.kernel.org/show_bug.cgi?id=216768 Signed-off-by: Mario Limonciello Reviewed-by: Philipp Zabel Tested-by: Philipp Zabel Signed-off-by: Rafael J. Wysocki --- drivers/acpi/x86/s2idle.c | 87 ++--------------------------------------------- 1 file changed, 3 insertions(+), 84 deletions(-) diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c index 422415cb14f4..c7afce465a07 100644 --- a/drivers/acpi/x86/s2idle.c +++ b/drivers/acpi/x86/s2idle.c @@ -28,10 +28,6 @@ static bool sleep_no_lps0 __read_mostly; module_param(sleep_no_lps0, bool, 0644); MODULE_PARM_DESC(sleep_no_lps0, "Do not use the special LPS0 device interface"); -static bool prefer_microsoft_dsm_guid __read_mostly; -module_param(prefer_microsoft_dsm_guid, bool, 0644); -MODULE_PARM_DESC(prefer_microsoft_dsm_guid, "Prefer using Microsoft GUID in LPS0 device _DSM evaluation"); - static const struct acpi_device_id lps0_device_ids[] = { {"PNP0D80", }, {"", }, @@ -369,27 +365,15 @@ out: } struct amd_lps0_hid_device_data { - const unsigned int rev_id; const bool check_off_by_one; - const bool prefer_amd_guid; }; static const struct amd_lps0_hid_device_data amd_picasso = { - .rev_id = 0, .check_off_by_one = true, - .prefer_amd_guid = false, }; static const struct amd_lps0_hid_device_data amd_cezanne = { - .rev_id = 0, .check_off_by_one = false, - .prefer_amd_guid = false, -}; - -static const struct amd_lps0_hid_device_data amd_rembrandt = { - .rev_id = 2, - .check_off_by_one = false, - .prefer_amd_guid = true, }; static const struct acpi_device_id amd_hid_ids[] = { @@ -397,7 +381,6 @@ static const struct acpi_device_id amd_hid_ids[] = { {"AMD0005", (kernel_ulong_t)&amd_picasso, }, {"AMDI0005", (kernel_ulong_t)&amd_picasso, }, {"AMDI0006", (kernel_ulong_t)&amd_cezanne, }, - {"AMDI0007", (kernel_ulong_t)&amd_rembrandt, }, {} }; @@ -407,68 +390,7 @@ static int lps0_prefer_amd(const struct dmi_system_id *id) rev_id = 2; return 0; } - -static int lps0_prefer_microsoft(const struct dmi_system_id *id) -{ - pr_debug("Preferring Microsoft GUID.\n"); - prefer_microsoft_dsm_guid = true; - return 0; -} - static const struct dmi_system_id s2idle_dmi_table[] __initconst = { - { - /* - * ASUS TUF Gaming A17 FA707RE - * https://bugzilla.kernel.org/show_bug.cgi?id=216101 - */ - .callback = lps0_prefer_microsoft, - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), - DMI_MATCH(DMI_PRODUCT_NAME, "ASUS TUF Gaming A17"), - }, - }, - { - /* ASUS ROG Zephyrus G14 (2022) */ - .callback = lps0_prefer_microsoft, - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), - DMI_MATCH(DMI_PRODUCT_NAME, "ROG Zephyrus G14 GA402"), - }, - }, - { - /* - * Lenovo Yoga Slim 7 Pro X 14ARH7 - * https://bugzilla.kernel.org/show_bug.cgi?id=216473 : 82V2 - * https://bugzilla.kernel.org/show_bug.cgi?id=216438 : 82TL - */ - .callback = lps0_prefer_microsoft, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), - DMI_MATCH(DMI_PRODUCT_NAME, "82"), - }, - }, - { - /* - * ASUSTeK COMPUTER INC. 
ROG Flow X13 GV301RE_GV301RE - * https://gitlab.freedesktop.org/drm/amd/-/issues/2148 - */ - .callback = lps0_prefer_microsoft, - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), - DMI_MATCH(DMI_PRODUCT_NAME, "ROG Flow X13 GV301"), - }, - }, - { - /* - * ASUSTeK COMPUTER INC. ROG Flow X16 GV601RW_GV601RW - * https://gitlab.freedesktop.org/drm/amd/-/issues/2148 - */ - .callback = lps0_prefer_microsoft, - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), - DMI_MATCH(DMI_PRODUCT_NAME, "ROG Flow X16 GV601"), - }, - }, { /* * AMD Rembrandt based HP EliteBook 835/845/865 G9 @@ -504,16 +426,14 @@ static int lps0_device_attach(struct acpi_device *adev, if (dev_id->id[0]) data = (const struct amd_lps0_hid_device_data *) dev_id->driver_data; else - data = &amd_rembrandt; - rev_id = data->rev_id; + data = &amd_cezanne; lps0_dsm_func_mask = validate_dsm(adev->handle, ACPI_LPS0_DSM_UUID_AMD, rev_id, &lps0_dsm_guid); if (lps0_dsm_func_mask > 0x3 && data->check_off_by_one) { lps0_dsm_func_mask = (lps0_dsm_func_mask << 1) | 0x1; acpi_handle_debug(adev->handle, "_DSM UUID %s: Adjusted function mask: 0x%x\n", ACPI_LPS0_DSM_UUID_AMD, lps0_dsm_func_mask); - } else if (lps0_dsm_func_mask_microsoft > 0 && data->prefer_amd_guid && - !prefer_microsoft_dsm_guid) { + } else if (lps0_dsm_func_mask_microsoft > 0 && rev_id) { lps0_dsm_func_mask_microsoft = -EINVAL; acpi_handle_debug(adev->handle, "_DSM Using AMD method\n"); } @@ -521,8 +441,7 @@ static int lps0_device_attach(struct acpi_device *adev, rev_id = 1; lps0_dsm_func_mask = validate_dsm(adev->handle, ACPI_LPS0_DSM_UUID, rev_id, &lps0_dsm_guid); - if (!prefer_microsoft_dsm_guid) - lps0_dsm_func_mask_microsoft = -EINVAL; + lps0_dsm_func_mask_microsoft = -EINVAL; } if (lps0_dsm_func_mask < 0 && lps0_dsm_func_mask_microsoft < 0) -- cgit From 55171f2930be98c8a49991435cdf3a8b574353b6 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Thu, 22 Dec 2022 10:26:27 +0000 Subject: bpftool: Fix linkage with statically built libllvm Since the commit eb9d1acf634b ("bpftool: Add LLVM as default library for disassembling JIT-ed programs") we might link the bpftool program with the libllvm library. This works fine when a shared libllvm library is available, but fails if we want to link bpftool with a statically built LLVM: [...] /usr/bin/ld: /usr/local/lib/libLLVMSupport.a(CrashRecoveryContext.cpp.o): in function `llvm::CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup()': CrashRecoveryContext.cpp:(.text._ZN4llvm27CrashRecoveryContextCleanupD0Ev+0x17): undefined reference to `operator delete(void*, unsigned long)' /usr/bin/ld: /usr/local/lib/libLLVMSupport.a(CrashRecoveryContext.cpp.o): in function `llvm::CrashRecoveryContext::~CrashRecoveryContext()': CrashRecoveryContext.cpp:(.text._ZN4llvm20CrashRecoveryContextD2Ev+0xc8): undefined reference to `operator delete(void*, unsigned long)' [...] So in the case of static libllvm we need to explicitly link bpftool with required libraries, namely, libstdc++ and those provided by the `llvm-config --system-libs` command. We can distinguish between the shared and static cases by using the `llvm-config --shared-mode` command. 
Fixes: eb9d1acf634b ("bpftool: Add LLVM as default library for disassembling JIT-ed programs") Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20221222102627.1643709-1-aspsk@isovalent.com --- tools/bpf/bpftool/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 787b857d3fb5..f610e184ce02 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -145,6 +145,10 @@ ifeq ($(feature-llvm),1) LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets CFLAGS += $(shell $(LLVM_CONFIG) --cflags --libs $(LLVM_CONFIG_LIB_COMPONENTS)) LIBS += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static) + LIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) + LIBS += -lstdc++ + endif LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags) else # Fall back on libbfd -- cgit From 8374bfd5a3c90a5b250f7c087c4d2b8ac467b12e Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Thu, 22 Dec 2022 10:44:13 +0800 Subject: bpf: fix nullness propagation for reg to reg comparisons After befae75856ab, the verifier would propagate null information after JEQ/JNE, e.g., if two pointers, one is maybe_null and the other is not, the former would be marked as non-null in eq path. However, as comment "PTR_TO_BTF_ID points to a kernel struct that does not need to be null checked by the BPF program ... The verifier must keep this in mind and can make no assumptions about null or non-null when doing branch ...". If one pointer is maybe_null and the other is PTR_TO_BTF, the former is incorrectly marked non-null. The following BPF prog can trigger a null-ptr-deref, also see this report for more details[1]: 0: (18) r1 = map_fd ; R1_w=map_ptr(ks=4, vs=4) 2: (79) r6 = *(u64 *)(r1 +8) ; R6_w=bpf_map->inner_map_data ; R6 is PTR_TO_BTF_ID ; equals to null at runtime 3: (bf) r2 = r10 4: (07) r2 += -4 5: (62) *(u32 *)(r2 +0) = 0 6: (85) call bpf_map_lookup_elem#1 ; R0_w=map_value_or_null 7: (1d) if r6 == r0 goto pc+1 8: (95) exit ; from 7 to 9: R0=map_value R6=ptr_bpf_map 9: (61) r0 = *(u32 *)(r0 +0) ; null-ptr-deref 10: (95) exit So, make the verifier propagate nullness information for reg to reg comparisons only if neither reg is PTR_TO_BTF_ID. [1] https://lore.kernel.org/bpf/CACkBjsaFJwjC5oiw-1KXvcazywodwXo4zGYsRHwbr2gSG9WcSw@mail.gmail.com/T/#u Fixes: befae75856ab ("bpf: propagate nullness information for reg to reg comparisons") Signed-off-by: Hao Sun Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221222024414.29539-1-sunhao.th@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/verifier.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a5255a0dcbb6..243d06ce6842 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11822,10 +11822,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * register B - not null * for JNE A, B, ... - A is not null in the false branch; * for JEQ A, B, ... - A is not null in the true branch. + * + * Since PTR_TO_BTF_ID points to a kernel struct that does + * not need to be null checked by the BPF program, i.e., + * could be null even without PTR_MAYBE_NULL marking, so + * only propagate nullness when neither reg is that type. 
*/ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X && __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) && - type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type)) { + type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) && + base_type(src_reg->type) != PTR_TO_BTF_ID && + base_type(dst_reg->type) != PTR_TO_BTF_ID) { eq_branch_regs = NULL; switch (opcode) { case BPF_JEQ: -- cgit From cedebd74cf3883f0384af9ec26b4e6f8f1964dd4 Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Thu, 22 Dec 2022 10:44:14 +0800 Subject: selftests/bpf: check null propagation only when neither reg is PTR_TO_BTF_ID Verify that nullness information is not propagated in the branches of register to register JEQ and JNE operations if one of them is PTR_TO_BTF_ID. Implement this at the C level so we can use CO-RE. Signed-off-by: Hao Sun Suggested-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221222024414.29539-2-sunhao.th@gmail.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/jeq_infer_not_null.c | 9 +++++ .../selftests/bpf/progs/jeq_infer_not_null_fail.c | 42 ++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/jeq_infer_not_null.c create mode 100644 tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c diff --git a/tools/testing/selftests/bpf/prog_tests/jeq_infer_not_null.c b/tools/testing/selftests/bpf/prog_tests/jeq_infer_not_null.c new file mode 100644 index 000000000000..3add34df5767 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/jeq_infer_not_null.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "jeq_infer_not_null_fail.skel.h" + +void test_jeq_infer_not_null(void) +{ + RUN_TESTS(jeq_infer_not_null_fail); +} diff --git a/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c new file mode 100644 index 000000000000..f46965053acb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, u64); + __type(value, u64); +} m_hash SEC(".maps"); + +SEC("?raw_tp") +__failure __msg("R8 invalid mem access 'map_value_or_null") +int jeq_infer_not_null_ptr_to_btfid(void *ctx) +{ + struct bpf_map *map = (struct bpf_map *)&m_hash; + struct bpf_map *inner_map = map->inner_map_meta; + u64 key = 0, ret = 0, *val; + + val = bpf_map_lookup_elem(map, &key); + /* Do not mark ptr as non-null if one of them is + * PTR_TO_BTF_ID (R9), reject because of invalid + * access to map value (R8). + * + * Here, we need to inline those insns to access + * R8 directly, since compiler may use other reg + * once it figures out val==inner_map. + */ + asm volatile("r8 = %[val];\n" + "r9 = %[inner_map];\n" + "if r8 != r9 goto +1;\n" + "%[ret] = *(u64 *)(r8 +0);\n" + : [ret] "+r"(ret) + : [inner_map] "r"(inner_map), [val] "r"(val) + : "r8", "r9"); + + return ret; +} -- cgit From 8d8bee13ae9e316443c6666286360126a19c8d94 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 16 Dec 2022 12:29:37 -0500 Subject: powerpc: dts: t208x: Disable 10G on MAC1 and MAC2 There aren't enough resources to run these ports at 10G speeds. Disable 10G for these ports, reverting to the previous speed.
Fixes: 36926a7d70c2 ("powerpc: dts: t208x: Mark MAC1 and MAC2 as 10G") Reported-by: Camelia Alexandra Groza Signed-off-by: Sean Anderson Reviewed-by: Camelia Groza Tested-by: Camelia Groza Link: https://lore.kernel.org/r/20221216172937.2960054-1-sean.anderson@seco.com Signed-off-by: Jakub Kicinski --- arch/powerpc/boot/dts/fsl/t2081si-post.dtsi | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi index 74e17e134387..27714dc2f04a 100644 --- a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi @@ -659,3 +659,19 @@ interrupts = <16 2 1 9>; }; }; + +&fman0_rx_0x08 { + /delete-property/ fsl,fman-10g-port; +}; + +&fman0_tx_0x28 { + /delete-property/ fsl,fman-10g-port; +}; + +&fman0_rx_0x09 { + /delete-property/ fsl,fman-10g-port; +}; + +&fman0_tx_0x29 { + /delete-property/ fsl,fman-10g-port; +}; -- cgit From 7fac54b93ad13e5e7ac237af33eb2a0940eaeea0 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Wed, 21 Dec 2022 20:36:27 +0800 Subject: atm: uapi: fix spelling typos in comments Fix the typo of 'Unsuported' in atmbr2684.h Signed-off-by: Rong Tao Link: https://lore.kernel.org/r/tencent_F1354BEC925C65EA357E741E91DF2044E805@qq.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/atmbr2684.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/atmbr2684.h b/include/uapi/linux/atmbr2684.h index a9e2250cd720..d47c47d06f11 100644 --- a/include/uapi/linux/atmbr2684.h +++ b/include/uapi/linux/atmbr2684.h @@ -38,7 +38,7 @@ */ #define BR2684_ENCAPS_VC (0) /* VC-mux */ #define BR2684_ENCAPS_LLC (1) -#define BR2684_ENCAPS_AUTODETECT (2) /* Unsuported */ +#define BR2684_ENCAPS_AUTODETECT (2) /* Unsupported */ /* * Is this VC bridged or routed? -- cgit From 343190841a1f22b96996d9f8cfab902a4d1bfd0e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Dec 2022 06:37:08 -0700 Subject: io_uring: check for valid register opcode earlier We only check the register opcode value inside the restricted ring section. Move it into the main io_uring_register() function instead and check it up front. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ac5d39eeb3d1..58ac13b69dc8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4020,8 +4020,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return -EEXIST; if (ctx->restricted) { - if (opcode >= IORING_REGISTER_LAST) - return -EINVAL; opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if (!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; @@ -4177,6 +4175,9 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, long ret = -EBADF; struct fd f; + if (opcode >= IORING_REGISTER_LAST) + return -EINVAL; + f = fdget(fd); if (!f.file) return -EBADF; -- cgit From 09e6b30eeb254f1818a008cace3547159e908dfd Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Thu, 22 Dec 2022 14:43:41 +0800 Subject: net: hns3: add interrupts re-initialization while doing VF FLR Currently, the keep alive message between the PF and VF may be lost and the VF is treated as unalive in the PF. So the VF will not do reset during the PF FLR reset process. This would make the allocated interrupt resources of the VF invalid and the VF wouldn't receive or respond to the PF any more. So this patch adds VF interrupt re-initialization during VF FLR for VF recovery in the above cases.
Fixes: 862d969a3a4d ("net: hns3: do VF's pci re-initialization while PF doing FLR") Signed-off-by: Jie Wang Signed-off-by: Hao Lan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index db6f7cdba958..081bd2c3f289 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2767,7 +2767,8 @@ static int hclgevf_pci_reset(struct hclgevf_dev *hdev) struct pci_dev *pdev = hdev->pdev; int ret = 0; - if (hdev->reset_type == HNAE3_VF_FULL_RESET && + if ((hdev->reset_type == HNAE3_VF_FULL_RESET || + hdev->reset_type == HNAE3_FLR_RESET) && test_bit(HCLGEVF_STATE_IRQ_INITED, &hdev->state)) { hclgevf_misc_irq_uninit(hdev); hclgevf_uninit_msi(hdev); -- cgit From 7d89b53cea1a702f97117fb4361523519bb1e52c Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Thu, 22 Dec 2022 14:43:42 +0800 Subject: net: hns3: fix miss L3E checking for rx packet For devices supporting RXD advanced layout, the driver returns directly if the hardware has finished the checksum calculation. This causes the L3E check to be missed for IP packets. Fix it. Fixes: 1ddc028ac849 ("net: hns3: refactor out RX completion checksum") Signed-off-by: Jian Shen Signed-off-by: Hao Lan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 0ec5730b1788..b4c4fb873568 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3855,18 +3855,16 @@ static int hns3_gro_complete(struct sk_buff *skb, u32 l234info) return 0; } -static bool hns3_checksum_complete(struct hns3_enet_ring *ring, +static void hns3_checksum_complete(struct hns3_enet_ring *ring, struct sk_buff *skb, u32 ptype, u16 csum) { if (ptype == HNS3_INVALID_PTYPE || hns3_rx_ptype_tbl[ptype].ip_summed != CHECKSUM_COMPLETE) - return false; + return; hns3_ring_stats_update(ring, csum_complete); skb->ip_summed = CHECKSUM_COMPLETE; skb->csum = csum_unfold((__force __sum16)csum); - - return true; } static void hns3_rx_handle_csum(struct sk_buff *skb, u32 l234info, @@ -3926,8 +3924,7 @@ static void hns3_rx_checksum(struct hns3_enet_ring *ring, struct sk_buff *skb, ptype = hnae3_get_field(ol_info, HNS3_RXD_PTYPE_M, HNS3_RXD_PTYPE_S); - if (hns3_checksum_complete(ring, skb, ptype, csum)) - return; + hns3_checksum_complete(ring, skb, ptype, csum); /* check if hardware has done checksum */ if (!(bd_base_info & BIT(HNS3_RXD_L3L4P_B))) @@ -3936,6 +3933,7 @@ static void hns3_rx_checksum(struct hns3_enet_ring *ring, struct sk_buff *skb, if (unlikely(l234info & (BIT(HNS3_RXD_L3E_B) | BIT(HNS3_RXD_L4E_B) | BIT(HNS3_RXD_OL3E_B) | BIT(HNS3_RXD_OL4E_B)))) { + skb->ip_summed = CHECKSUM_NONE; hns3_ring_stats_update(ring, l3l4_csum_err); return; -- cgit From 8ee57c7b8406c7aa8ca31e014440c87c6383f429 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Thu, 22 Dec 2022 14:43:43 +0800 Subject: net: hns3: fix VF promisc mode not update when mac table full Currently, the driver misses setting the HCLGE_VPORT_STATE_PROMISC_CHANGE flag for a VF when vport->overflow_promisc_flags changes. So the VF won't check whether to update the promisc mode in this case. So add it.
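The failure mode described above, a periodic sync task skipping its work because a "changed" bit was never set, can be sketched in plain C. The names below are hypothetical and this is not the hns3 code, only an illustration of why forgetting to mark the state dirty means the new configuration is never programmed.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-port state; not the hns3 structures. */
struct port {
	unsigned int desired_flags;	/* what we want programmed */
	unsigned int hw_flags;		/* what is actually programmed */
	bool promisc_changed;		/* tells the sync task there is work to do */
};

static void request_promisc(struct port *p, unsigned int flags)
{
	p->desired_flags = flags;
	p->promisc_changed = true;	/* forgetting this line is the bug class being fixed */
}

static void sync_promisc(struct port *p)
{
	if (!p->promisc_changed)	/* nothing marked dirty, so the sync task skips */
		return;
	p->promisc_changed = false;
	p->hw_flags = p->desired_flags;
	printf("programmed 0x%x\n", p->hw_flags);
}

int main(void)
{
	struct port p = { 0 };

	request_promisc(&p, 0x3);
	sync_promisc(&p);		/* applies the update */
	sync_promisc(&p);		/* no-op, nothing changed since last sync */
	return 0;
}
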
Fixes: 1e6e76101fd9 ("net: hns3: configure promisc mode for VF asynchronously") Signed-off-by: Jian Shen Signed-off-by: Hao Lan Signed-off-by: Jakub Kicinski --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 75 +++++++++++++--------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 4e54f91f7a6c..6c2742f59c77 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -12754,60 +12754,71 @@ static int hclge_gro_en(struct hnae3_handle *handle, bool enable) return ret; } -static void hclge_sync_promisc_mode(struct hclge_dev *hdev) +static int hclge_sync_vport_promisc_mode(struct hclge_vport *vport) { - struct hclge_vport *vport = &hdev->vport[0]; struct hnae3_handle *handle = &vport->nic; + struct hclge_dev *hdev = vport->back; + bool uc_en = false; + bool mc_en = false; u8 tmp_flags; + bool bc_en; int ret; - u16 i; if (vport->last_promisc_flags != vport->overflow_promisc_flags) { set_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, &vport->state); vport->last_promisc_flags = vport->overflow_promisc_flags; } - if (test_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, &vport->state)) { + if (!test_and_clear_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, + &vport->state)) + return 0; + + /* for PF */ + if (!vport->vport_id) { tmp_flags = handle->netdev_flags | vport->last_promisc_flags; ret = hclge_set_promisc_mode(handle, tmp_flags & HNAE3_UPE, tmp_flags & HNAE3_MPE); - if (!ret) { - clear_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, - &vport->state); + if (!ret) set_bit(HCLGE_VPORT_STATE_VLAN_FLTR_CHANGE, &vport->state); - } + else + set_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, + &vport->state); + return ret; } - for (i = 1; i < hdev->num_alloc_vport; i++) { - bool uc_en = false; - bool mc_en = false; - bool bc_en; + /* for VF */ + if (vport->vf_info.trusted) { + uc_en = vport->vf_info.request_uc_en > 0 || + vport->overflow_promisc_flags & HNAE3_OVERFLOW_UPE; + mc_en = vport->vf_info.request_mc_en > 0 || + vport->overflow_promisc_flags & HNAE3_OVERFLOW_MPE; + } + bc_en = vport->vf_info.request_bc_en > 0; - vport = &hdev->vport[i]; + ret = hclge_cmd_set_promisc_mode(hdev, vport->vport_id, uc_en, + mc_en, bc_en); + if (ret) { + set_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, &vport->state); + return ret; + } + hclge_set_vport_vlan_fltr_change(vport); - if (!test_and_clear_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, - &vport->state)) - continue; + return 0; +} - if (vport->vf_info.trusted) { - uc_en = vport->vf_info.request_uc_en > 0 || - vport->overflow_promisc_flags & - HNAE3_OVERFLOW_UPE; - mc_en = vport->vf_info.request_mc_en > 0 || - vport->overflow_promisc_flags & - HNAE3_OVERFLOW_MPE; - } - bc_en = vport->vf_info.request_bc_en > 0; +static void hclge_sync_promisc_mode(struct hclge_dev *hdev) +{ + struct hclge_vport *vport; + int ret; + u16 i; - ret = hclge_cmd_set_promisc_mode(hdev, vport->vport_id, uc_en, - mc_en, bc_en); - if (ret) { - set_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, - &vport->state); + for (i = 0; i < hdev->num_alloc_vport; i++) { + vport = &hdev->vport[i]; + + ret = hclge_sync_vport_promisc_mode(vport); + if (ret) return; - } - hclge_set_vport_vlan_fltr_change(vport); } } -- cgit From fcbb408a1aaf426f88d8fb3b4c14e3625745b02f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 22 Dec 2022 13:39:58 -0800 Subject: selftests/bpf: Add host-tools to gitignore Shows up when cross-compiling: HOST_SCRATCH_DIR := 
$(OUTPUT)/host-tools vs SCRATCH_DIR := $(OUTPUT)/tools HOST_SCRATCH_DIR := $(SCRATCH_DIR) Reported-by: John Sperbeck Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20221222213958.2302320-1-sdf@google.com --- tools/testing/selftests/bpf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 07d2d0a8c5cb..401a75844cc0 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -36,6 +36,7 @@ test_cpp *.lskel.h /no_alu32 /bpf_gcc +/host-tools /tools /runqslower /bench -- cgit From 523dfa96add75e60cfe6bf5a1c8f713635cd6b73 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 15 Dec 2022 17:34:49 +0100 Subject: drm/tests: reduce drm_mm_test stack usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check_reserve_boundaries function uses a lot of kernel stack, and it gets inlined by clang, which makes __drm_test_mm_reserve use even more of it, to the point of hitting the warning limit: drivers/gpu/drm/tests/drm_mm_test.c:344:12: error: stack frame size (1048) exceeds limit (1024) in '__drm_test_mm_reserve' [-Werror,-Wframe-larger-than] When building with gcc, this does not happen, but the structleak plugin can similarly increase the stack usage and needs to be disabled, as we do for all other kunit users. Signed-off-by: Arnd Bergmann Reviewed-by: Maíra Canal Reviewed-by: Nathan Chancellor Signed-off-by: Maíra Canal Link: https://patchwork.freedesktop.org/patch/msgid/20221215163511.266214-1-arnd@kernel.org --- drivers/gpu/drm/tests/Makefile | 2 ++ drivers/gpu/drm/tests/drm_mm_test.c | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/tests/Makefile b/drivers/gpu/drm/tests/Makefile index b29ef1085cad..f896ef85c2f2 100644 --- a/drivers/gpu/drm/tests/Makefile +++ b/drivers/gpu/drm/tests/Makefile @@ -12,3 +12,5 @@ obj-$(CONFIG_DRM_KUNIT_TEST) += \ drm_mm_test.o \ drm_plane_helper_test.o \ drm_rect_test.o + +CFLAGS_drm_mm_test.o := $(DISABLE_STRUCTLEAK_PLUGIN) diff --git a/drivers/gpu/drm/tests/drm_mm_test.c b/drivers/gpu/drm/tests/drm_mm_test.c index 89f12d3b4a21..186b28dc7038 100644 --- a/drivers/gpu/drm/tests/drm_mm_test.c +++ b/drivers/gpu/drm/tests/drm_mm_test.c @@ -298,9 +298,9 @@ static bool expect_reserve_fail(struct kunit *test, struct drm_mm *mm, struct dr return false; } -static bool check_reserve_boundaries(struct kunit *test, struct drm_mm *mm, - unsigned int count, - u64 size) +static bool noinline_for_stack check_reserve_boundaries(struct kunit *test, struct drm_mm *mm, + unsigned int count, + u64 size) { const struct boundary { u64 start, size; -- cgit From 8508fa2e7472f673edbeedf1b1d2b7a6bb898ecc Mon Sep 17 00:00:00 2001 From: Artem Egorkine Date: Sun, 25 Dec 2022 12:57:27 +0200 Subject: ALSA: line6: correct midi status byte when receiving data from podxt A PODxt device sends 0xb2, 0xc2 or 0xf2 as a status byte for MIDI messages over USB that should otherwise have a 0xb0, 0xc0 or 0xf0 status byte. This is usually corrected by the driver on other OSes. This fixes MIDI sysex messages sent by PODxt. 
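The correction amounts to forcing the status byte's low nibble back to zero for those three commands. Below is a standalone C sketch of the same fix-up, for illustration only; in the driver the equivalent logic lives in line6_midibuf_read() and only runs for received (RX) data. The sample bytes mirror the PODxt version-request sysex header quoted in the patch, with 0xf7 as the standard sysex terminator.

#include <stdio.h>

/*
 * PODxt sends 0xb2/0xc2/0xf2 where plain 0xb0 (control change, channel 1),
 * 0xc0 (program change, channel 1) and 0xf0 (sysex start) are meant.
 * Clearing the low nibble restores the intended status byte.
 */
static unsigned char fixup_podxt_status(unsigned char status)
{
	if (status == 0xb2 || status == 0xc2 || status == 0xf2)
		return status & 0xf0;
	return status;
}

int main(void)
{
	unsigned char raw[] = { 0xf2, 0x7e, 0x7f, 0x06, 0x02, 0xf7 };

	raw[0] = fixup_podxt_status(raw[0]);	/* 0xf2 -> 0xf0, a valid sysex start */
	for (size_t i = 0; i < sizeof(raw); i++)
		printf("%02x ", raw[i]);
	printf("\n");
	return 0;
}
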
[ tiwai: fixed white spaces ] Signed-off-by: Artem Egorkine Cc: Link: https://lore.kernel.org/r/20221225105728.1153989-1-arteme@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/line6/driver.c | 3 ++- sound/usb/line6/midi.c | 3 ++- sound/usb/line6/midibuf.c | 25 +++++++++++++++++-------- sound/usb/line6/midibuf.h | 5 ++++- sound/usb/line6/pod.c | 3 ++- 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/sound/usb/line6/driver.c b/sound/usb/line6/driver.c index 59faa5a9a714..b67617b68e50 100644 --- a/sound/usb/line6/driver.c +++ b/sound/usb/line6/driver.c @@ -304,7 +304,8 @@ static void line6_data_received(struct urb *urb) for (;;) { done = line6_midibuf_read(mb, line6->buffer_message, - LINE6_MIDI_MESSAGE_MAXLEN); + LINE6_MIDI_MESSAGE_MAXLEN, + LINE6_MIDIBUF_READ_RX); if (done <= 0) break; diff --git a/sound/usb/line6/midi.c b/sound/usb/line6/midi.c index ba0e2b7e8fe1..d52355de2bbc 100644 --- a/sound/usb/line6/midi.c +++ b/sound/usb/line6/midi.c @@ -56,7 +56,8 @@ static void line6_midi_transmit(struct snd_rawmidi_substream *substream) for (;;) { done = line6_midibuf_read(mb, chunk, - LINE6_FALLBACK_MAXPACKETSIZE); + LINE6_FALLBACK_MAXPACKETSIZE, + LINE6_MIDIBUF_READ_TX); if (done == 0) break; diff --git a/sound/usb/line6/midibuf.c b/sound/usb/line6/midibuf.c index 6a70463f82c4..e7f830f7526c 100644 --- a/sound/usb/line6/midibuf.c +++ b/sound/usb/line6/midibuf.c @@ -9,6 +9,7 @@ #include "midibuf.h" + static int midibuf_message_length(unsigned char code) { int message_length; @@ -20,12 +21,7 @@ static int midibuf_message_length(unsigned char code) message_length = length[(code >> 4) - 8]; } else { - /* - Note that according to the MIDI specification 0xf2 is - the "Song Position Pointer", but this is used by Line 6 - to send sysex messages to the host. 
- */ - static const int length[] = { -1, 2, -1, 2, -1, -1, 1, 1, 1, 1, + static const int length[] = { -1, 2, 2, 2, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1 }; message_length = length[code & 0x0f]; @@ -125,7 +121,7 @@ int line6_midibuf_write(struct midi_buffer *this, unsigned char *data, } int line6_midibuf_read(struct midi_buffer *this, unsigned char *data, - int length) + int length, int read_type) { int bytes_used; int length1, length2; @@ -148,9 +144,22 @@ int line6_midibuf_read(struct midi_buffer *this, unsigned char *data, length1 = this->size - this->pos_read; - /* check MIDI command length */ command = this->buf[this->pos_read]; + /* + PODxt always has status byte lower nibble set to 0010, + when it means to send 0000, so we correct if here so + that control/program changes come on channel 1 and + sysex message status byte is correct + */ + if (read_type == LINE6_MIDIBUF_READ_RX) { + if (command == 0xb2 || command == 0xc2 || command == 0xf2) { + unsigned char fixed = command & 0xf0; + this->buf[this->pos_read] = fixed; + command = fixed; + } + } + /* check MIDI command length */ if (command & 0x80) { midi_length = midibuf_message_length(command); this->command_prev = command; diff --git a/sound/usb/line6/midibuf.h b/sound/usb/line6/midibuf.h index 124a8f9f7e96..542e8d836f87 100644 --- a/sound/usb/line6/midibuf.h +++ b/sound/usb/line6/midibuf.h @@ -8,6 +8,9 @@ #ifndef MIDIBUF_H #define MIDIBUF_H +#define LINE6_MIDIBUF_READ_TX 0 +#define LINE6_MIDIBUF_READ_RX 1 + struct midi_buffer { unsigned char *buf; int size; @@ -23,7 +26,7 @@ extern void line6_midibuf_destroy(struct midi_buffer *mb); extern int line6_midibuf_ignore(struct midi_buffer *mb, int length); extern int line6_midibuf_init(struct midi_buffer *mb, int size, int split); extern int line6_midibuf_read(struct midi_buffer *mb, unsigned char *data, - int length); + int length, int read_type); extern void line6_midibuf_reset(struct midi_buffer *mb); extern int line6_midibuf_write(struct midi_buffer *mb, unsigned char *data, int length); diff --git a/sound/usb/line6/pod.c b/sound/usb/line6/pod.c index cd41aa7f0385..d173971e5f02 100644 --- a/sound/usb/line6/pod.c +++ b/sound/usb/line6/pod.c @@ -159,8 +159,9 @@ static struct line6_pcm_properties pod_pcm_properties = { .bytes_per_channel = 3 /* SNDRV_PCM_FMTBIT_S24_3LE */ }; + static const char pod_version_header[] = { - 0xf2, 0x7e, 0x7f, 0x06, 0x02 + 0xf0, 0x7e, 0x7f, 0x06, 0x02 }; static char *pod_alloc_sysex_buffer(struct usb_line6_pod *pod, int code, -- cgit From b8800d324abb50160560c636bfafe2c81001b66c Mon Sep 17 00:00:00 2001 From: Artem Egorkine Date: Sun, 25 Dec 2022 12:57:28 +0200 Subject: ALSA: line6: fix stack overflow in line6_midi_transmit Correctly calculate available space including the size of the chunk buffer. This fixes a buffer overflow when multiple MIDI sysex messages are sent to a PODxt device. 
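The overflow and its fix are easy to see in isolation: the old code clamped the request to the ring's free space and the device's max packet size, but not to the size of the on-stack chunk buffer the data is peeked into. Here is a small standalone C sketch of the corrected clamping; the names and sizes are illustrative, not the driver code.

#include <stdio.h>
#include <string.h>

#define CHUNK_SIZE 16	/* stand-in for LINE6_FALLBACK_MAXPACKETSIZE */

static size_t min3(size_t a, size_t b, size_t c)
{
	size_t m = a < b ? a : b;

	return m < c ? m : c;
}

/* Copy at most 'req' bytes into chunk[]; 'req' must never exceed CHUNK_SIZE. */
static size_t transmit(unsigned char *chunk, size_t midibuf_free,
		       size_t max_packet_size, const unsigned char *src,
		       size_t src_len)
{
	/*
	 * The bug: req = min(midibuf_free, max_packet_size) can exceed
	 * CHUNK_SIZE, so the copy below would overflow chunk[].
	 * The fix: also bound the request by the destination buffer size.
	 */
	size_t req = min3(midibuf_free, max_packet_size, CHUNK_SIZE);
	size_t n = src_len < req ? src_len : req;

	memcpy(chunk, src, n);
	return n;
}

int main(void)
{
	unsigned char chunk[CHUNK_SIZE];
	unsigned char sysex[64] = { 0xf0, /* ... */ 0xf7 };

	size_t done = transmit(chunk, 256, 64, sysex, sizeof(sysex));

	printf("copied %zu bytes (never more than %d)\n", done, CHUNK_SIZE);
	return 0;
}
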
Signed-off-by: Artem Egorkine Cc: Link: https://lore.kernel.org/r/20221225105728.1153989-2-arteme@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/line6/midi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/usb/line6/midi.c b/sound/usb/line6/midi.c index d52355de2bbc..0838632c788e 100644 --- a/sound/usb/line6/midi.c +++ b/sound/usb/line6/midi.c @@ -44,7 +44,8 @@ static void line6_midi_transmit(struct snd_rawmidi_substream *substream) int req, done; for (;;) { - req = min(line6_midibuf_bytes_free(mb), line6->max_packet_size); + req = min3(line6_midibuf_bytes_free(mb), line6->max_packet_size, + LINE6_FALLBACK_MAXPACKETSIZE); done = snd_rawmidi_transmit_peek(substream, chunk, req); if (done == 0) -- cgit From 399ab7fe0fa0d846881685fd4e57e9a8ef7559f7 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Thu, 22 Dec 2022 11:51:19 +0800 Subject: net: sched: fix memory leak in tcindex_set_parms Syzkaller reports a memory leak as follows: ==================================== BUG: memory leak unreferenced object 0xffff88810c287f00 (size 256): comm "syz-executor105", pid 3600, jiffies 4294943292 (age 12.990s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmalloc_trace+0x20/0x90 mm/slab_common.c:1046 [] kmalloc include/linux/slab.h:576 [inline] [] kmalloc_array include/linux/slab.h:627 [inline] [] kcalloc include/linux/slab.h:659 [inline] [] tcf_exts_init include/net/pkt_cls.h:250 [inline] [] tcindex_set_parms+0xa7/0xbe0 net/sched/cls_tcindex.c:342 [] tcindex_change+0xdf/0x120 net/sched/cls_tcindex.c:553 [] tc_new_tfilter+0x4f2/0x1100 net/sched/cls_api.c:2147 [] rtnetlink_rcv_msg+0x4dc/0x5d0 net/core/rtnetlink.c:6082 [] netlink_rcv_skb+0x87/0x1d0 net/netlink/af_netlink.c:2540 [] netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] [] netlink_unicast+0x397/0x4c0 net/netlink/af_netlink.c:1345 [] netlink_sendmsg+0x396/0x710 net/netlink/af_netlink.c:1921 [] sock_sendmsg_nosec net/socket.c:714 [inline] [] sock_sendmsg+0x56/0x80 net/socket.c:734 [] ____sys_sendmsg+0x178/0x410 net/socket.c:2482 [] ___sys_sendmsg+0xa8/0x110 net/socket.c:2536 [] __sys_sendmmsg+0x105/0x330 net/socket.c:2622 [] __do_sys_sendmmsg net/socket.c:2651 [inline] [] __se_sys_sendmmsg net/socket.c:2648 [inline] [] __x64_sys_sendmmsg+0x24/0x30 net/socket.c:2648 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd ==================================== Kernel uses tcindex_change() to change an existing filter properties. Yet the problem is that, during the process of changing, if `old_r` is retrieved from `p->perfect`, then kernel uses tcindex_alloc_perfect_hash() to newly allocate filter results, uses tcindex_filter_result_init() to clear the old filter result, without destroying its tcf_exts structure, which triggers the above memory leak. To be more specific, there are only two source for the `old_r`, according to the tcindex_lookup(). `old_r` is retrieved from `p->perfect`, or `old_r` is retrieved from `p->h`. * If `old_r` is retrieved from `p->perfect`, kernel uses tcindex_alloc_perfect_hash() to newly allocate the filter results. Then `r` is assigned with `cp->perfect + handle`, which is newly allocated. 
So condition `old_r && old_r != r` is true in this situation, and kernel uses tcindex_filter_result_init() to clear the old filter result, without destroying its tcf_exts structure * If `old_r` is retrieved from `p->h`, then `p->perfect` is NULL according to the tcindex_lookup(). Considering that `cp->h` is directly copied from `p->h` and `p->perfect` is NULL, `r` is assigned with `tcindex_lookup(cp, handle)`, whose value should be the same as `old_r`, so condition `old_r && old_r != r` is false in this situation, kernel ignores using tcindex_filter_result_init() to clear the old filter result. So only when `old_r` is retrieved from `p->perfect` does kernel use tcindex_filter_result_init() to clear the old filter result, which triggers the above memory leak. Considering that there already exists a tc_filter_wq workqueue to destroy the old tcindex_data by tcindex_partial_destroy_work() at the end of tcindex_set_parms(), this patch solves this memory leak bug by removing this old filter result clearing part and delegating it to the tc_filter_wq workqueue. Note that this patch doesn't introduce any other issues. If `old_r` is retrieved from `p->perfect`, this patch just delegates old filter result clearing part to the tc_filter_wq workqueue; If `old_r` is retrieved from `p->h`, kernel doesn't reach the old filter result clearing part, so removing this part has no effect. [Thanks to the suggestion from Jakub Kicinski, Cong Wang, Paolo Abeni and Dmitry Vyukov] Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()") Link: https://lore.kernel.org/all/0000000000001de5c505ebc9ec59@google.com/ Reported-by: syzbot+232ebdbd36706c965ebf@syzkaller.appspotmail.com Tested-by: syzbot+232ebdbd36706c965ebf@syzkaller.appspotmail.com Cc: Cong Wang Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Dmitry Vyukov Acked-by: Paolo Abeni Signed-off-by: Hawkins Jiawei Signed-off-by: David S. 
Miller --- net/sched/cls_tcindex.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index eb0e9458e722..ee2a050c887b 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -333,7 +333,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcindex_filter_result *r, struct nlattr **tb, struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { - struct tcindex_filter_result new_filter_result, *old_r = r; + struct tcindex_filter_result new_filter_result; struct tcindex_data *cp = NULL, *oldp; struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_result cr = {}; @@ -402,7 +402,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = tcindex_filter_result_init(&new_filter_result, cp, net); if (err < 0) goto errout_alloc; - if (old_r) + if (r) cr = r->res; err = -EBUSY; @@ -479,14 +479,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, tcf_bind_filter(tp, &cr, base); } - if (old_r && old_r != r) { - err = tcindex_filter_result_init(old_r, cp, net); - if (err < 0) { - kfree(f); - goto errout_alloc; - } - } - oldp = p; r->res = cr; tcf_exts_change(&r->exts, &e); -- cgit From 13a7c8964afcd8ca43c0b6001ebb0127baa95362 Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Thu, 22 Dec 2022 14:52:28 +0300 Subject: qlcnic: prevent ->dcb use-after-free on qlcnic_dcb_enable() failure adapter->dcb would get silently freed inside qlcnic_dcb_enable() in case qlcnic_dcb_attach() would return an error, which always happens under OOM conditions. This would lead to use-after-free because both of the existing callers invoke qlcnic_dcb_get_info() on the obtained pointer, which is potentially freed at that point. Propagate errors from qlcnic_dcb_enable(), and instead free the dcb pointer at callsite using qlcnic_dcb_free(). This also removes the now unused qlcnic_clear_dcb_ops() helper, which was a simple wrapper around kfree() also causing memory leaks for partially initialized dcb. Found by Linux Verification Center (linuxtesting.org) with the SVACE static analysis tool. Fixes: 3c44bba1d270 ("qlcnic: Disable DCB operations from SR-IOV VFs") Reviewed-by: Michal Swiatkowski Signed-off-by: Daniil Tatianin Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c | 8 +++++++- drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h | 10 ++-------- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 8 +++++++- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c index dbb800769cb6..c95d56e56c59 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c @@ -2505,7 +2505,13 @@ int qlcnic_83xx_init(struct qlcnic_adapter *adapter) goto disable_mbx_intr; qlcnic_83xx_clear_function_resources(adapter); - qlcnic_dcb_enable(adapter->dcb); + + err = qlcnic_dcb_enable(adapter->dcb); + if (err) { + qlcnic_dcb_free(adapter->dcb); + goto disable_mbx_intr; + } + qlcnic_83xx_initialize_nic(adapter, 1); qlcnic_dcb_get_info(adapter->dcb); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h index 7519773eaca6..22afa2be85fd 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h @@ -41,11 +41,6 @@ struct qlcnic_dcb { unsigned long state; }; -static inline void qlcnic_clear_dcb_ops(struct qlcnic_dcb *dcb) -{ - kfree(dcb); -} - static inline int qlcnic_dcb_get_hw_capability(struct qlcnic_dcb *dcb) { if (dcb && dcb->ops->get_hw_capability) @@ -112,9 +107,8 @@ static inline void qlcnic_dcb_init_dcbnl_ops(struct qlcnic_dcb *dcb) dcb->ops->init_dcbnl_ops(dcb); } -static inline void qlcnic_dcb_enable(struct qlcnic_dcb *dcb) +static inline int qlcnic_dcb_enable(struct qlcnic_dcb *dcb) { - if (dcb && qlcnic_dcb_attach(dcb)) - qlcnic_clear_dcb_ops(dcb); + return dcb ? qlcnic_dcb_attach(dcb) : 0; } #endif diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 28476b982bab..44dac3c0908e 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -2599,7 +2599,13 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) "Device does not support MSI interrupts\n"); if (qlcnic_82xx_check(adapter)) { - qlcnic_dcb_enable(adapter->dcb); + err = qlcnic_dcb_enable(adapter->dcb); + if (err) { + qlcnic_dcb_free(adapter->dcb); + dev_err(&pdev->dev, "Failed to enable DCB\n"); + goto err_out_free_hw; + } + qlcnic_dcb_get_info(adapter->dcb); err = qlcnic_setup_intr(adapter); -- cgit From 30e725537546248bddc12eaac2fe0a258917f190 Mon Sep 17 00:00:00 2001 From: "Johnny S. Lee" Date: Thu, 22 Dec 2022 22:34:05 +0800 Subject: net: dsa: mv88e6xxx: depend on PTP conditionally PTP hardware timestamping related objects are not linked when PTP support for MV88E6xxx (NET_DSA_MV88E6XXX_PTP) is disabled, therefore NET_DSA_MV88E6XXX should not depend on PTP_1588_CLOCK_OPTIONAL regardless of NET_DSA_MV88E6XXX_PTP. Instead, condition more strictly on how NET_DSA_MV88E6XXX_PTP's dependencies are met, making sure that it cannot be enabled when NET_DSA_MV88E6XXX=y and PTP_1588_CLOCK=m. In other words, this commit allows NET_DSA_MV88E6XXX to be built-in while PTP_1588_CLOCK is a module, as long as NET_DSA_MV88E6XXX_PTP is prevented from being enabled. Fixes: e5f31552674e ("ethernet: fix PTP_1588_CLOCK dependencies") Signed-off-by: Johnny S. Lee Signed-off-by: David S. 
Miller --- drivers/net/dsa/mv88e6xxx/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/Kconfig b/drivers/net/dsa/mv88e6xxx/Kconfig index 7a2445a34eb7..e3181d5471df 100644 --- a/drivers/net/dsa/mv88e6xxx/Kconfig +++ b/drivers/net/dsa/mv88e6xxx/Kconfig @@ -2,7 +2,6 @@ config NET_DSA_MV88E6XXX tristate "Marvell 88E6xxx Ethernet switch fabric support" depends on NET_DSA - depends on PTP_1588_CLOCK_OPTIONAL select IRQ_DOMAIN select NET_DSA_TAG_EDSA select NET_DSA_TAG_DSA @@ -13,7 +12,8 @@ config NET_DSA_MV88E6XXX config NET_DSA_MV88E6XXX_PTP bool "PTP support for Marvell 88E6xxx" default n - depends on NET_DSA_MV88E6XXX && PTP_1588_CLOCK + depends on (NET_DSA_MV88E6XXX = y && PTP_1588_CLOCK = y) || \ + (NET_DSA_MV88E6XXX = m && PTP_1588_CLOCK) help Say Y to enable PTP hardware timestamping on Marvell 88E6xxx switch chips that support it. -- cgit From df49908f3c52d211aea5e2a14a93bbe67a2cb3af Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 23 Dec 2022 11:37:18 +0400 Subject: nfc: Fix potential resource leaks nfc_get_device() take reference for the device, add missing nfc_put_device() to release it when not need anymore. Also fix the style warnning by use error EOPNOTSUPP instead of ENOTSUPP. Fixes: 5ce3f32b5264 ("NFC: netlink: SE API implementation") Fixes: 29e76924cf08 ("nfc: netlink: Add capability to reply to vendor_cmd with data") Signed-off-by: Miaoqian Lin Signed-off-by: David S. Miller --- net/nfc/netlink.c | 52 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 9d91087b9399..1fc339084d89 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1497,6 +1497,7 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) u32 dev_idx, se_idx; u8 *apdu; size_t apdu_len; + int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX] || @@ -1510,25 +1511,37 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) if (!dev) return -ENODEV; - if (!dev->ops || !dev->ops->se_io) - return -ENOTSUPP; + if (!dev->ops || !dev->ops->se_io) { + rc = -EOPNOTSUPP; + goto put_dev; + } apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]); - if (apdu_len == 0) - return -EINVAL; + if (apdu_len == 0) { + rc = -EINVAL; + goto put_dev; + } apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]); - if (!apdu) - return -EINVAL; + if (!apdu) { + rc = -EINVAL; + goto put_dev; + } ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; + if (!ctx) { + rc = -ENOMEM; + goto put_dev; + } ctx->dev_idx = dev_idx; ctx->se_idx = se_idx; - return nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); + rc = nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); + +put_dev: + nfc_put_device(dev); + return rc; } static int nfc_genl_vendor_cmd(struct sk_buff *skb, @@ -1551,14 +1564,21 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); dev = nfc_get_device(dev_idx); - if (!dev || !dev->vendor_cmds || !dev->n_vendor_cmds) + if (!dev) return -ENODEV; + if (!dev->vendor_cmds || !dev->n_vendor_cmds) { + err = -ENODEV; + goto put_dev; + } + if (info->attrs[NFC_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); - if (data_len == 0) - return -EINVAL; + if (data_len == 0) { + err = -EINVAL; + goto put_dev; + } } else { data = NULL; data_len = 0; @@ -1573,10 +1593,14 
@@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, dev->cur_cmd_info = info; err = cmd->doit(dev, data, data_len); dev->cur_cmd_info = NULL; - return err; + goto put_dev; } - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + +put_dev: + nfc_put_device(dev); + return err; } /* message building helper */ -- cgit From d3805695fe1e7383517903715cefc9bbdcffdc90 Mon Sep 17 00:00:00 2001 From: Anuradha Weeraman Date: Sun, 25 Dec 2022 23:12:22 +0530 Subject: net: ethernet: marvell: octeontx2: Fix uninitialized variable warning Fix for uninitialized variable warning. Addresses-Coverity: ("Uninitialized scalar variable") Signed-off-by: Anuradha Weeraman Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c index fa8029a94068..eb25e458266c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c @@ -589,7 +589,7 @@ int rvu_mbox_handler_mcs_free_resources(struct rvu *rvu, u16 pcifunc = req->hdr.pcifunc; struct mcs_rsrc_map *map; struct mcs *mcs; - int rc; + int rc = 0; if (req->mcs_id >= rvu->mcs_blk_cnt) return MCS_AF_ERR_INVALID_MCSID; -- cgit From a4517c4f3423c7c448f2c359218f97c1173523a1 Mon Sep 17 00:00:00 2001 From: Chris Chiu Date: Mon, 26 Dec 2022 19:43:03 +0800 Subject: ALSA: hda/realtek: Apply dual codec fixup for Dell Latitude laptops The Dell Latiture 3340/3440/3540 laptops with Realtek ALC3204 have dual codecs and need the ALC1220_FIXUP_GB_DUAL_CODECS to fix the conflicts of Master controls. The existing headset mic fixup for Dell is also required to enable the jack sense and the headset mic. Introduce a new fixup to fix the dual codec and headset mic issues for particular Dell laptops since other old Dell laptops with the same codec configuration are already well handled by the fixup in alc269_fallback_pin_fixup_tbl[]. 
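For readers unfamiliar with how such entries take effect, a minimal standalone sketch of the chained-fixup idea follows (the table contents and names are invented for illustration and are not taken from patch_realtek.c): one fixup is applied, then chain_id is followed to the next.

/* Standalone sketch of the chained-fixup pattern, not the HDA driver. */
#include <stdio.h>

struct fixup {
        const char *name;
        int chained;     /* nonzero: continue with chain_id afterwards */
        int chain_id;
};

static const struct fixup fixups[] = {
        [0] = { "dual-codec master fix", 1, 1 },
        [1] = { "headset mic pin fix",   0, 0 },
};

static void apply_fixups(int id)
{
        for (;;) {
                printf("applying: %s\n", fixups[id].name);
                if (!fixups[id].chained)
                        break;
                id = fixups[id].chain_id;
        }
}

int main(void)
{
        apply_fixups(0);   /* runs both fixups, mirroring .chained = true */
        return 0;
}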
Signed-off-by: Chris Chiu Cc: Link: https://lore.kernel.org/r/20221226114303.4027500-1-chris.chiu@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e443d88f627f..3794b522c222 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7175,6 +7175,7 @@ enum { ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK, ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN, ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS, + ALC236_FIXUP_DELL_DUAL_CODECS, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -9130,6 +9131,12 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, }, + [ALC236_FIXUP_DELL_DUAL_CODECS] = { + .type = HDA_FIXUP_PINS, + .v.func = alc1220_fixup_gb_dual_codecs, + .chained = true, + .chain_id = ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -9232,6 +9239,12 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0b1a, "Dell Precision 5570", ALC289_FIXUP_DUAL_SPK), SND_PCI_QUIRK(0x1028, 0x0b37, "Dell Inspiron 16 Plus 7620 2-in-1", ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS), SND_PCI_QUIRK(0x1028, 0x0b71, "Dell Inspiron 16 Plus 7620", ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS), + SND_PCI_QUIRK(0x1028, 0x0c19, "Dell Precision 3340", ALC236_FIXUP_DELL_DUAL_CODECS), + SND_PCI_QUIRK(0x1028, 0x0c1a, "Dell Precision 3340", ALC236_FIXUP_DELL_DUAL_CODECS), + SND_PCI_QUIRK(0x1028, 0x0c1b, "Dell Precision 3440", ALC236_FIXUP_DELL_DUAL_CODECS), + SND_PCI_QUIRK(0x1028, 0x0c1c, "Dell Precision 3540", ALC236_FIXUP_DELL_DUAL_CODECS), + SND_PCI_QUIRK(0x1028, 0x0c1d, "Dell Precision 3440", ALC236_FIXUP_DELL_DUAL_CODECS), + SND_PCI_QUIRK(0x1028, 0x0c1e, "Dell Precision 3540", ALC236_FIXUP_DELL_DUAL_CODECS), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), -- cgit From 246cf66e300b76099b5dbd3fdd39e9a5dbc53f02 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Dec 2022 11:06:05 +0800 Subject: block, bfq: fix uaf for bfqq in bfq_exit_icq_bfqq Commit 64dc8c732f5c ("block, bfq: fix possible uaf for 'bfqq->bic'") will access 'bic->bfqq' in bic_set_bfqq(), however, bfq_exit_icq_bfqq() can free bfqq first, and then call bic_set_bfqq(), which will cause uaf. Fix the problem by moving bfq_exit_bfqq() behind bic_set_bfqq(). 
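A minimal standalone sketch of the ordering rule behind the fix (generic types, not the bfq code itself): every reference to an object has to be dropped before the call that may free it, never after.

/* Standalone illustration of "detach before free". */
#include <stdio.h>
#include <stdlib.h>

struct queue { int busy; };
struct ctx   { struct queue *q; };

static void exit_queue(struct queue **qp)
{
        free(*qp);       /* teardown path that may free the queue */
        *qp = NULL;
}

static void detach(struct ctx *c)
{
        /* Touching c->q here is only safe while the queue is still alive. */
        if (c->q)
                c->q->busy = 0;
        c->q = NULL;
}

int main(void)
{
        struct queue *q = calloc(1, sizeof(*q));
        struct ctx c = { .q = q };

        detach(&c);      /* correct order: drop the reference first ...   */
        exit_queue(&q);  /* ... then let the teardown path free the queue */
        return 0;
}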
Fixes: 64dc8c732f5c ("block, bfq: fix possible uaf for 'bfqq->bic'") Reported-by: Yi Zhang Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20221226030605.1437081-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 16f43bbc575a..ccf2204477a5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5317,8 +5317,8 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) unsigned long flags; spin_lock_irqsave(&bfqd->lock, flags); - bfq_exit_bfqq(bfqd, bfqq); bic_set_bfqq(bic, NULL, is_sync); + bfq_exit_bfqq(bfqd, bfqq); spin_unlock_irqrestore(&bfqd->lock, flags); } } -- cgit From 33b93727ce90c8db916fb071ed13e90106339754 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 25 Dec 2022 11:32:31 +0100 Subject: nvme: fix setting the queue depth in nvme_alloc_io_tag_set While the CAP.MQES field in NVMe is a 0s based filed with a natural one off, we also need to account for the queue wrap condition and fix undo the one off again in nvme_alloc_io_tag_set. This was never properly done by the fabrics drivers, but they don't seem to care because there is no actual physical queue that can wrap around, but it became a problem when converting over the PCIe driver. Also add back the BLK_MQ_MAX_DEPTH check that was lost in the same commit. Fixes: 0da7feaa5913 ("nvme-pci: use the tagset alloc/free helpers") Reported-by: Hugh Dickins Signed-off-by: Christoph Hellwig Tested-by: Hugh Dickins Link: https://lore.kernel.org/r/20221225103234.226794-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e26b085a007a..cda1361e6d4f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4897,7 +4897,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, memset(set, 0, sizeof(*set)); set->ops = ops; - set->queue_depth = ctrl->sqsize + 1; + set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - 1); /* * Some Apple controllers requires tags to be unique across admin and * the (only) I/O queue, so reserve the first 32 tags of the I/O queue. -- cgit From 88d356ca41ba1c3effc2d4208dfbd4392f58cd6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 25 Dec 2022 11:32:32 +0100 Subject: nvme-pci: update sqsize when adjusting the queue depth Update the core sqsize field in addition to the PCIe-specific q_depth field as the core tagset allocation helpers rely on it. 
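A small standalone sketch of the 0's based arithmetic involved (the constants are illustrative only, and MAX_DEPTH below is a local stand-in, not the block layer's real BLK_MQ_MAX_DEPTH definition):

/* Standalone illustration of the MQES / q_depth / sqsize relationship. */
#include <stdio.h>

#define MAX_DEPTH 10240u   /* stand-in for the block layer's tag set limit */

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned int mqes = 1023;            /* CAP.MQES, 0's based         */
        unsigned int q_depth = mqes + 1;     /* usable queue depth          */
        unsigned int sqsize = q_depth - 1;   /* back to the 0's based field */

        /* Tag set depth derived from sqsize, leaving one slot free for the
         * queue-wrap condition and respecting the block layer maximum. */
        unsigned int tags = min_u(sqsize, MAX_DEPTH - 1);

        printf("q_depth=%u sqsize=%u tag set depth=%u\n",
               q_depth, sqsize, tags);
        return 0;
}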
Fixes: 0da7feaa5913 ("nvme-pci: use the tagset alloc/free helpers") Signed-off-by: Christoph Hellwig Acked-by: Hugh Dickins Link: https://lore.kernel.org/r/20221225103234.226794-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 804b6a6cb43a..b13baccedb4a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2333,10 +2333,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (dev->cmb_use_sqes) { result = nvme_cmb_qdepth(dev, nr_io_queues, sizeof(struct nvme_command)); - if (result > 0) + if (result > 0) { dev->q_depth = result; - else + dev->ctrl.sqsize = result - 1; + } else { dev->cmb_use_sqes = false; + } } do { @@ -2537,7 +2539,6 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); - dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->dbs = dev->bar + 4096; @@ -2578,7 +2579,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n", dev->q_depth); } - + dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ nvme_map_cmb(dev); -- cgit From 93ef83050e597634d2c7dc838a28caf5137b9404 Mon Sep 17 00:00:00 2001 From: "YoungJun.park" Date: Fri, 28 Oct 2022 07:42:41 -0700 Subject: kunit: alloc_string_stream_fragment error handling bug fix When it fails to allocate fragment, it does not free and return error. And check the pointer inappropriately. Fixed merge conflicts with commit 618887768bb7 ("kunit: update NULL vs IS_ERR() tests") Shuah Khan Signed-off-by: YoungJun.park Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/string-stream.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/kunit/string-stream.c b/lib/kunit/string-stream.c index f5f51166d8c2..cc32743c1171 100644 --- a/lib/kunit/string-stream.c +++ b/lib/kunit/string-stream.c @@ -23,8 +23,10 @@ static struct string_stream_fragment *alloc_string_stream_fragment( return ERR_PTR(-ENOMEM); frag->fragment = kunit_kmalloc(test, len, gfp); - if (!frag->fragment) + if (!frag->fragment) { + kunit_kfree(test, frag); return ERR_PTR(-ENOMEM); + } return frag; } -- cgit From 37e14e4f3715428b809e4df9a9958baa64c77d51 Mon Sep 17 00:00:00 2001 From: Adam Vodopjan Date: Fri, 9 Dec 2022 09:26:34 +0000 Subject: ata: ahci: Fix PCS quirk application for suspend Since kernel 5.3.4 my laptop (ICH8M controller) does not see Kingston SV300S37A60G SSD disk connected into a SATA connector on wake from suspend. The problem was introduced in c312ef176399 ("libata/ahci: Drop PCS quirk for Denverton and beyond"): the quirk is not applied on wake from suspend as it originally was. It is worth to mention the commit contained another bug: the quirk is not applied at all to controllers which require it. The fix commit 09d6ac8dc51a ("libata/ahci: Fix PCS quirk application") landed in 5.3.8. So testing my patch anywhere between commits c312ef176399 and 09d6ac8dc51a is pointless. Not all disks trigger the problem. For example nothing bad happens with Western Digital WD5000LPCX HDD. Test hardware: - Acer 5920G with ICH8M SATA controller - sda: some SATA HDD connnected into the DVD drive IDE port with a SATA-IDE caddy. 
It is a boot disk - sdb: Kingston SV300S37A60G SSD connected into the only SATA port Sample "dmesg --notime | grep -E '^(sd |ata)'" output on wake: sd 0:0:0:0: [sda] Starting disk sd 2:0:0:0: [sdb] Starting disk ata4: SATA link down (SStatus 4 SControl 300) ata3: SATA link down (SStatus 4 SControl 300) ata1.00: ACPI cmd ef/03:0c:00:00:00:a0 (SET FEATURES) filtered out ata1.00: ACPI cmd ef/03:42:00:00:00:a0 (SET FEATURES) filtered out ata1: FORCE: cable set to 80c ata5: SATA link down (SStatus 0 SControl 300) ata3: SATA link down (SStatus 4 SControl 300) ata3: SATA link down (SStatus 4 SControl 300) ata3.00: disabled sd 2:0:0:0: rejecting I/O to offline device ata3.00: detaching (SCSI 2:0:0:0) sd 2:0:0:0: [sdb] Start/Stop Unit failed: Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK sd 2:0:0:0: [sdb] Synchronizing SCSI cache sd 2:0:0:0: [sdb] Synchronize Cache(10) failed: Result: hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK sd 2:0:0:0: [sdb] Stopping disk sd 2:0:0:0: [sdb] Start/Stop Unit failed: Result: hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK Commit c312ef176399 dropped ahci_pci_reset_controller() which internally calls ahci_reset_controller() and applies the PCS quirk if needed after that. It was called each time a reset was required instead of just ahci_reset_controller(). This patch puts the function back in place. Fixes: c312ef176399 ("libata/ahci: Drop PCS quirk for Denverton and beyond") Signed-off-by: Adam Vodopjan Signed-off-by: Damien Le Moal --- drivers/ata/ahci.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 0cfd0ec6229b..14a1c0d14916 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -83,6 +83,7 @@ enum board_ids { static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent); static void ahci_remove_one(struct pci_dev *dev); static void ahci_shutdown_one(struct pci_dev *dev); +static void ahci_intel_pcs_quirk(struct pci_dev *pdev, struct ahci_host_priv *hpriv); static int ahci_vt8251_hardreset(struct ata_link *link, unsigned int *class, unsigned long deadline); static int ahci_avn_hardreset(struct ata_link *link, unsigned int *class, @@ -676,6 +677,25 @@ static void ahci_pci_save_initial_config(struct pci_dev *pdev, ahci_save_initial_config(&pdev->dev, hpriv); } +static int ahci_pci_reset_controller(struct ata_host *host) +{ + struct pci_dev *pdev = to_pci_dev(host->dev); + struct ahci_host_priv *hpriv = host->private_data; + int rc; + + rc = ahci_reset_controller(host); + if (rc) + return rc; + + /* + * If platform firmware failed to enable ports, try to enable + * them here. 
+ */ + ahci_intel_pcs_quirk(pdev, hpriv); + + return 0; +} + static void ahci_pci_init_controller(struct ata_host *host) { struct ahci_host_priv *hpriv = host->private_data; @@ -870,7 +890,7 @@ static int ahci_pci_device_runtime_resume(struct device *dev) struct ata_host *host = pci_get_drvdata(pdev); int rc; - rc = ahci_reset_controller(host); + rc = ahci_pci_reset_controller(host); if (rc) return rc; ahci_pci_init_controller(host); @@ -906,7 +926,7 @@ static int ahci_pci_device_resume(struct device *dev) ahci_mcp89_apple_enable(pdev); if (pdev->dev.power.power_state.event == PM_EVENT_SUSPEND) { - rc = ahci_reset_controller(host); + rc = ahci_pci_reset_controller(host); if (rc) return rc; @@ -1784,12 +1804,6 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* save initial config */ ahci_pci_save_initial_config(pdev, hpriv); - /* - * If platform firmware failed to enable ports, try to enable - * them here. - */ - ahci_intel_pcs_quirk(pdev, hpriv); - /* prepare host */ if (hpriv->cap & HOST_CAP_NCQ) { pi.flags |= ATA_FLAG_NCQ; @@ -1899,7 +1913,7 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (rc) return rc; - rc = ahci_reset_controller(host); + rc = ahci_pci_reset_controller(host); if (rc) return rc; -- cgit From e2d371484653ac83b970d3ebcf343383f39f8b6b Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Fri, 18 Nov 2022 10:45:39 +0530 Subject: perf core: Return error pointer if inherit_event() fails to find pmu_ctx inherit_event() returns NULL only when it finds orphaned events otherwise it returns either valid child_event pointer or an error pointer. Follow the same when it fails to find pmu_ctx. Fixes: bd2756811766 ("perf: Rewrite core context handling") Reported-by: Dan Carpenter Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221118051539.820-1-ravi.bangoria@amd.com --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index eacc3702654d..4bd2434251f0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -13231,7 +13231,7 @@ inherit_event(struct perf_event *parent_event, pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); - return NULL; + return ERR_CAST(pmu_ctx); } child_event->pmu_ctx = pmu_ctx; -- cgit From f841b682baef90ee144df8b12e2c76aa460717c1 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 7 Dec 2022 20:40:23 +0800 Subject: perf/core: Fix cgroup events tracking We encounter perf warnings when using cgroup events like: cd /sys/fs/cgroup mkdir test perf stat -e cycles -a -G test Which then triggers: WARNING: CPU: 0 PID: 690 at kernel/events/core.c:849 perf_cgroup_switch+0xb2/0xc0 Call Trace: __schedule+0x4ae/0x9f0 ? _raw_spin_unlock_irqrestore+0x23/0x40 ? __cond_resched+0x18/0x20 preempt_schedule_common+0x2d/0x70 __cond_resched+0x18/0x20 wait_for_completion+0x2f/0x160 ? cpu_stop_queue_work+0x9e/0x130 affine_move_task+0x18a/0x4f0 WARNING: CPU: 0 PID: 690 at kernel/events/core.c:829 ctx_sched_in+0x1cf/0x1e0 Call Trace: ? ctx_sched_out+0xb7/0x1b0 perf_cgroup_switch+0x88/0xc0 __schedule+0x4ae/0x9f0 ? _raw_spin_unlock_irqrestore+0x23/0x40 ? __cond_resched+0x18/0x20 preempt_schedule_common+0x2d/0x70 __cond_resched+0x18/0x20 wait_for_completion+0x2f/0x160 ? 
cpu_stop_queue_work+0x9e/0x130 affine_move_task+0x18a/0x4f0 The above two warnings are not complete here since I remove other unimportant information. The problem is caused by the perf cgroup events tracking: CPU0 CPU1 perf_event_open() perf_event_alloc() account_event() account_event_cpu() atomic_inc(perf_cgroup_events) __perf_event_task_sched_out() if (atomic_read(perf_cgroup_events)) perf_cgroup_switch() // kernel/events/core.c:849 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0) if (READ_ONCE(cpuctx->cgrp) == cgrp) // false return perf_ctx_lock() ctx_sched_out() cpuctx->cgrp = cgrp ctx_sched_in() perf_cgroup_set_timestamp() // kernel/events/core.c:829 WARN_ON_ONCE(!ctx->nr_cgroups) perf_ctx_unlock() perf_install_in_context() cpu_function_call() __perf_install_in_context() add_event_to_ctx() list_add_event() perf_cgroup_event_enable() ctx->nr_cgroups++ cpuctx->cgrp = X We can see from above that we wrongly use percpu atomic perf_cgroup_events to check if we need to perf_cgroup_switch(), which should only be used when we know this CPU has cgroup events enabled. The commit bd2756811766 ("perf: Rewrite core context handling") change to have only one context per-CPU, so we can just use cpuctx->cgrp to check if this CPU has cgroup events enabled. So percpu atomic perf_cgroup_events is not needed. Fixes: bd2756811766 ("perf: Rewrite core context handling") Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ravi Bangoria Link: https://lkml.kernel.org/r/20221207124023.66252-1-zhouchengming@bytedance.com --- kernel/events/core.c | 42 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4bd2434251f0..37c0f04d7a00 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -380,7 +380,6 @@ enum event_type_t { /* * perf_sched_events : >0 events exist - * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ static void perf_sched_delayed(struct work_struct *work); @@ -389,7 +388,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; -static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -844,9 +842,16 @@ static void perf_cgroup_switch(struct task_struct *task) struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; - cgrp = perf_cgroup_from_task(task, NULL); + /* + * cpuctx->cgrp is set when the first cgroup event enabled, + * and is cleared when the last cgroup event disabled. + */ + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + + cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; @@ -3631,8 +3636,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * to check if we have to switch out PMU state. 
* cgroup event are system-wide mode only */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_switch(next); + perf_cgroup_switch(next); } static bool perf_less_group_idx(const void *l, const void *r) @@ -4974,15 +4978,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event) detach_sb_event(event); } -static void unaccount_event_cpu(struct perf_event *event, int cpu) -{ - if (event->parent) - return; - - if (is_cgroup_event(event)) - atomic_dec(&per_cpu(perf_cgroup_events, cpu)); -} - #ifdef CONFIG_NO_HZ_FULL static DEFINE_SPINLOCK(nr_freq_lock); #endif @@ -5048,8 +5043,6 @@ static void unaccount_event(struct perf_event *event) schedule_delayed_work(&perf_sched_work, HZ); } - unaccount_event_cpu(event, event->cpu); - unaccount_pmu_sb_event(event); } @@ -11679,15 +11672,6 @@ static void account_pmu_sb_event(struct perf_event *event) attach_sb_event(event); } -static void account_event_cpu(struct perf_event *event, int cpu) -{ - if (event->parent) - return; - - if (is_cgroup_event(event)) - atomic_inc(&per_cpu(perf_cgroup_events, cpu)); -} - /* Freq events need the tick to stay alive (see perf_event_task_tick). */ static void account_freq_event_nohz(void) { @@ -11775,8 +11759,6 @@ static void account_event(struct perf_event *event) } enabled: - account_event_cpu(event, event->cpu); - account_pmu_sb_event(event); } @@ -12822,13 +12804,11 @@ static void __perf_pmu_remove(struct perf_event_context *ctx, perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); - unaccount_event_cpu(event, cpu); put_pmu_ctx(event->pmu_ctx); list_add(&event->migrate_entry, events); for_each_sibling_event(sibling, event) { perf_remove_from_context(sibling, 0); - unaccount_event_cpu(sibling, cpu); put_pmu_ctx(sibling->pmu_ctx); list_add(&sibling->migrate_entry, events); } @@ -12847,7 +12827,6 @@ static void __perf_pmu_install_event(struct pmu *pmu, if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, cpu); perf_install_in_context(ctx, event, cpu); } @@ -13742,8 +13721,7 @@ static int __perf_cgroup_move(void *info) struct task_struct *task = info; preempt_disable(); - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_switch(task); + perf_cgroup_switch(task); preempt_enable(); return 0; -- cgit From 08245672cdc6505550d1a5020603b0a8d4a6dcc7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 2 Dec 2022 13:51:49 +0000 Subject: perf/x86/amd: fix potential integer overflow on shift of a int The left shift of int 32 bit integer constant 1 is evaluated using 32 bit arithmetic and then passed as a 64 bit function argument. In the case where i is 32 or more this can lead to an overflow. Avoid this by shifting using the BIT_ULL macro instead. Fixes: 471af006a747 ("perf/x86/amd: Constrain Large Increment per Cycle events") Signed-off-by: Colin Ian King Signed-off-by: Peter Zijlstra (Intel) Acked-by: Ian Rogers Acked-by: Kim Phillips Link: https://lore.kernel.org/r/20221202135149.1797974-1-colin.i.king@gmail.com --- arch/x86/events/amd/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index d6f3703e4119..4386b10682ce 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1387,7 +1387,7 @@ static int __init amd_core_pmu_init(void) * numbered counter following it. 
*/ for (i = 0; i < x86_pmu.num_counters - 1; i += 2) - even_ctr_mask |= 1 << i; + even_ctr_mask |= BIT_ULL(i); pair_constraint = (struct event_constraint) __EVENT_CONSTRAINT(0, even_ctr_mask, 0, -- cgit From a551844e345ba2a1c533dee4b55cb0efddb1bcda Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Dec 2022 15:40:04 +0100 Subject: perf: Fix use-after-free in error path The syscall error path has a use-after-free; put_pmu_ctx() will reference ctx, therefore we must ensure ctx is destroyed after pmu_ctx is. Fixes: bd2756811766 ("perf: Rewrite core context handling") Reported-by: syzbot+b8e8c01c8ade4fe6e48f@syzkaller.appspotmail.com Signed-off-by: Peter Zijlstra (Intel) Tested-by: Chengming Zhou Link: https://lkml.kernel.org/r/Y6B3xEgkbmFUCeni@hirez.programming.kicks-ass.net --- kernel/events/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 37c0f04d7a00..63d674c9b70e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12671,7 +12671,8 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; err_context: - /* event->pmu_ctx freed by free_event() */ + put_pmu_ctx(event->pmu_ctx); + event->pmu_ctx = NULL; /* _free_event() */ err_locked: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); @@ -12784,6 +12785,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, err_pmu_ctx: put_pmu_ctx(pmu_ctx); + event->pmu_ctx = NULL; /* _free_event() */ err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); -- cgit From 0a041ebca4956292cadfb14a63ace3a9c1dcb0a3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Dec 2022 14:31:40 -0800 Subject: perf/core: Call LSM hook after copying perf_event_attr It passes the attr struct to the security_perf_event_open() but it's not initialized yet. Fixes: da97e18458fb ("perf_event: Add support for LSM and SELinux checks") Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Joel Fernandes (Google) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20221220223140.4020470-1-namhyung@kernel.org --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 63d674c9b70e..d56328e5080e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12321,12 +12321,12 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; - /* Do we allow access to perf_event_open(2) ? */ - err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; - err = perf_copy_attr(attr_uptr, &attr); + /* Do we allow access to perf_event_open(2) ? 
*/ + err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); if (err) return err; -- cgit From ade8c20847fcc3f4de08b35f730ec04ef29ddbdc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 15 Dec 2022 17:43:23 +0100 Subject: x86/calldepth: Fix incorrect init section references The addition of callthunks_translate_call_dest means that skip_addr() and patch_dest() can no longer be discarded as part of the __init section freeing: WARNING: modpost: vmlinux.o: section mismatch in reference: callthunks_translate_call_dest.cold (section: .text.unlikely) -> skip_addr (section: .init.text) WARNING: modpost: vmlinux.o: section mismatch in reference: callthunks_translate_call_dest.cold (section: .text.unlikely) -> patch_dest (section: .init.text) WARNING: modpost: vmlinux.o: section mismatch in reference: is_callthunk.cold (section: .text.unlikely) -> skip_addr (section: .init.text) ERROR: modpost: Section mismatches detected. Set CONFIG_SECTION_MISMATCH_WARN_ONLY=y to allow them. Fixes: b2e9dfe54be4 ("x86/bpf: Emit call depth accounting if required") Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221215164334.968863-1-arnd@kernel.org --- arch/x86/kernel/callthunks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 7d2c75ec9a8c..ffea98f9064b 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -119,7 +119,7 @@ static bool is_coretext(const struct core_text *ct, void *addr) return within_module_coretext(addr); } -static __init_or_module bool skip_addr(void *dest) +static bool skip_addr(void *dest) { if (dest == error_entry) return true; @@ -181,7 +181,7 @@ static const u8 nops[] = { 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, }; -static __init_or_module void *patch_dest(void *dest, bool direct) +static void *patch_dest(void *dest, bool direct) { unsigned int tsize = SKL_TMPL_SIZE; u8 *pad = dest - tsize; -- cgit From 1993bf97992df2d560287f3c4120eda57426843d Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 19 Dec 2022 23:35:10 +0900 Subject: x86/kprobes: Fix kprobes instruction boudary check with CONFIG_RETHUNK Since the CONFIG_RETHUNK and CONFIG_SLS will use INT3 for stopping speculative execution after RET instruction, kprobes always failes to check the probed instruction boundary by decoding the function body if the probed address is after such sequence. (Note that some conditional code blocks will be placed after function return, if compiler decides it is not on the hot path.) This is because kprobes expects kgdb puts the INT3 as a software breakpoint and it will replace the original instruction. But these INT3 are not such purpose, it doesn't need to recover the original instruction. To avoid this issue, kprobes checks whether the INT3 is owned by kgdb or not, and if so, stop decoding and make it fail. The other INT3 will come from CONFIG_RETHUNK/CONFIG_SLS and those can be treated as a one-byte instruction. 
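The decision this patch adds can be sketched in plain C as follows; kgdb_owns_break() is a stand-in predicate invented for the example (the kernel uses kgdb_has_hit_break()), and the byte-wise loop stands in for the real instruction decoder.

/* Standalone illustration: INT3 only blocks probing when a debugger owns it. */
#include <stdbool.h>
#include <stdio.h>

#define INT3_OPCODE 0xcc

static bool kgdb_owns_break(unsigned long addr)
{
        (void)addr;
        return false;            /* pretend kgdb has no breakpoint here */
}

static bool can_probe_range(const unsigned char *insn, int len)
{
        for (int i = 0; i < len; ) {
                if (insn[i] == INT3_OPCODE) {
                        if (kgdb_owns_break((unsigned long)(insn + i)))
                                return false;   /* unrecoverable breakpoint */
                        i += 1;                 /* RETHUNK/SLS padding byte */
                        continue;
                }
                i += 1;  /* a real decoder would advance by the insn length */
        }
        return true;
}

int main(void)
{
        const unsigned char body[] = { 0x90, 0xcc, 0x90 }; /* nop, int3, nop */
        printf("probe allowed: %d\n", can_probe_range(body, sizeof(body)));
        return 0;
}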
Fixes: e463a09af2f0 ("x86: Add straight-line-speculation mitigation") Suggested-by: Peter Zijlstra Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/167146051026.1374301.392728975473572291.stgit@devnote3 --- arch/x86/kernel/kprobes/core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 66299682b6b7..b36f3c367cb2 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -281,12 +282,15 @@ static int can_probe(unsigned long paddr) if (ret < 0) return 0; +#ifdef CONFIG_KGDB /* - * Another debugging subsystem might insert this breakpoint. - * In that case, we can't recover it. + * If there is a dynamically installed kgdb sw breakpoint, + * this function should not be probed. */ - if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && + kgdb_has_hit_break(addr)) return 0; +#endif addr += insn.length; } -- cgit From 63dc6325ff41ee9e570bde705ac34a39c5dbeb44 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 19 Dec 2022 23:35:19 +0900 Subject: x86/kprobes: Fix optprobe optimization check with CONFIG_RETHUNK Since the CONFIG_RETHUNK and CONFIG_SLS will use INT3 for stopping speculative execution after function return, kprobe jump optimization always fails on the functions with such INT3 inside the function body. (It already checks the INT3 padding between functions, but not inside the function) To avoid this issue, as same as kprobes, check whether the INT3 comes from kgdb or not, and if so, stop decoding and make it fail. The other INT3 will come from CONFIG_RETHUNK/CONFIG_SLS and those can be treated as a one-byte instruction. Fixes: e463a09af2f0 ("x86: Add straight-line-speculation mitigation") Suggested-by: Peter Zijlstra Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/167146051929.1374301.7419382929328081706.stgit@devnote3 --- arch/x86/kernel/kprobes/opt.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index e6b8c5362b94..e57e07b0edb6 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -279,19 +280,6 @@ static int insn_is_indirect_jump(struct insn *insn) return ret; } -static bool is_padding_int3(unsigned long addr, unsigned long eaddr) -{ - unsigned char ops; - - for (; addr < eaddr; addr++) { - if (get_kernel_nofault(ops, (void *)addr) < 0 || - ops != INT3_INSN_OPCODE) - return false; - } - - return true; -} - /* Decode whole function to ensure any instructions don't jump into target */ static int can_optimize(unsigned long paddr) { @@ -334,15 +322,15 @@ static int can_optimize(unsigned long paddr) ret = insn_decode_kernel(&insn, (void *)recovered_insn); if (ret < 0) return 0; - +#ifdef CONFIG_KGDB /* - * In the case of detecting unknown breakpoint, this could be - * a padding INT3 between functions. Let's check that all the - * rest of the bytes are also INT3. + * If there is a dynamically installed kgdb sw breakpoint, + * this function should not be probed. 
*/ - if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) - return is_padding_int3(addr, paddr - offset + size) ? 1 : 0; - + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && + kgdb_has_hit_break(addr)) + return 0; +#endif /* Recover address */ insn.kaddr = (void *)addr; insn.next_byte = (void *)(addr + insn.length); -- cgit From 94cd8fa09f5f1ebdd4e90964b08b7f2cc4b36c43 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 14 Dec 2022 17:20:08 -0500 Subject: futex: Fix futex_waitv() hrtimer debug object leak on kcalloc error In a scenario where kcalloc() fails to allocate memory, the futex_waitv system call immediately returns -ENOMEM without invoking destroy_hrtimer_on_stack(). When CONFIG_DEBUG_OBJECTS_TIMERS=y, this results in leaking a timer debug object. Fixes: bf69bad38cf6 ("futex: Implement sys_futex_waitv()") Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Cc: stable@vger.kernel.org Cc: stable@vger.kernel.org # v5.16+ Link: https://lore.kernel.org/r/20221214222008.200393-1-mathieu.desnoyers@efficios.com --- kernel/futex/syscalls.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 086a22d1adb7..a8074079b09e 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -286,19 +286,22 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, } futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); - if (!futexv) - return -ENOMEM; + if (!futexv) { + ret = -ENOMEM; + goto destroy_timer; + } ret = futex_parse_waitv(futexv, waiters, nr_futexes); if (!ret) ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); + kfree(futexv); + +destroy_timer: if (timeout) { hrtimer_cancel(&to.timer); destroy_hrtimer_on_stack(&to.timer); } - - kfree(futexv); return ret; } -- cgit From 9eb803402a2a83400c6c6afd900e3b7c87c06816 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 16 Nov 2022 21:25:24 +0100 Subject: uapi:io_uring.h: allow linux/time_types.h to be skipped include/uapi/linux/io_uring.h is synced 1:1 into liburing:src/include/liburing/io_uring.h. liburing has a configure check to detect the need for linux/time_types.h. 
It can opt-out by defining UAPI_LINUX_IO_URING_H_SKIP_LINUX_TIME_TYPES_H Fixes: 78a861b94959 ("io_uring: add sync cancelation API through io_uring_register()") Link: https://github.com/axboe/liburing/issues/708 Link: https://github.com/axboe/liburing/pull/709 Link: https://lore.kernel.org/io-uring/20221115212614.1308132-1-ammar.faizi@intel.com/T/#m9f5dd571cd4f6a5dee84452dbbca3b92ba7a4091 CC: Jens Axboe Cc: Ammar Faizi Signed-off-by: Stefan Metzmacher Reviewed-by: Ammar Faizi Link: https://lore.kernel.org/r/7071a0a1d751221538b20b63f9160094fc7e06f4.1668630247.git.metze@samba.org Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 9d4c4078e8d0..2780bce62faf 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -10,7 +10,15 @@ #include #include +/* + * this file is shared with liburing and that has to autodetect + * if linux/time_types.h is available or not, it can + * define UAPI_LINUX_IO_URING_H_SKIP_LINUX_TIME_TYPES_H + * if linux/time_types.h is not available + */ +#ifndef UAPI_LINUX_IO_URING_H_SKIP_LINUX_TIME_TYPES_H #include +#endif #ifdef __cplusplus extern "C" { -- cgit From 9d8b5376cc2848ca22314fdec9a7a45b1bf69189 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 25 Nov 2022 16:04:01 -0800 Subject: fbdev: make offb driver tristate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the offb (Open Firmware frame buffer) driver tristate, i.e., so that it can be built as a loadable module. However, it still depends on the setting of DRM_OFDRM so that both of these drivers cannot be builtin at the same time nor can one be builtin and the other one a loadable module. Build-tested successfully with all combination of DRM_OFDRM and FB_OF. This fixes a build issue that Michal reported when FB_OF=y and DRM_OFDRM=m: powerpc64-linux-ld: drivers/video/fbdev/offb.o:(.data.rel.ro+0x58): undefined reference to `cfb_fillrect' powerpc64-linux-ld: drivers/video/fbdev/offb.o:(.data.rel.ro+0x60): undefined reference to `cfb_copyarea' powerpc64-linux-ld: drivers/video/fbdev/offb.o:(.data.rel.ro+0x68): undefined reference to `cfb_imageblit' Signed-off-by: Randy Dunlap Suggested-by: Arnd Bergmann Cc: Masahiro Yamada Cc: Thomas Zimmermann Cc: Michal Suchánek Cc: linuxppc-dev@lists.ozlabs.org Cc: Daniel Vetter Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Acked-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index df6e09f7d242..b2bed599e6c6 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -456,8 +456,8 @@ config FB_ATARI chipset found in Ataris. config FB_OF - bool "Open Firmware frame buffer device support" - depends on (FB = y) && PPC && (!PPC_PSERIES || PCI) + tristate "Open Firmware frame buffer device support" + depends on FB && PPC && (!PPC_PSERIES || PCI) depends on !DRM_OFDRM select APERTURE_HELPERS select FB_CFB_FILLRECT -- cgit From 8d8cf163c8d8c93bccf0c70a133309693af9bf61 Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Wed, 28 Dec 2022 09:40:01 +0800 Subject: fbdev: omapfb: use strscpy() to instead of strncpy() The implementation of strscpy() is more robust and safer. That's now the recommended way to copy NUL-terminated strings. 
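To make the difference concrete, a simplified userspace stand-in for strscpy() is sketched below; bounded_copy() is invented for the example and only mimics the semantics (the kernel's strscpy() reports truncation with -E2BIG), it is not the real implementation.

/* Standalone illustration: strncpy() can leave the destination unterminated,
 * a strscpy()-style helper always terminates and reports truncation. */
#include <stdio.h>
#include <string.h>

static long bounded_copy(char *dst, const char *src, size_t size)
{
        const char *nul = memchr(src, '\0', size);
        size_t len = nul ? (size_t)(nul - src) : size;

        if (!size)
                return -1;              /* nothing fits, report an error   */
        if (len == size)
                len = size - 1;         /* truncate, keep room for the NUL */
        memcpy(dst, src, len);
        dst[len] = '\0';
        return nul ? (long)len : -1;    /* -1 stands in for -E2BIG here    */
}

int main(void)
{
        char fix_id[8];

        strncpy(fix_id, "omapfb_main", sizeof(fix_id));      /* no NUL added */
        bounded_copy(fix_id, "omapfb_main", sizeof(fix_id)); /* terminated   */
        printf("%s\n", fix_id);
        return 0;
}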
Signed-off-by: Xu Panda Signed-off-by: Yang Yang Signed-off-by: Helge Deller --- drivers/video/fbdev/omap/omapfb_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/video/fbdev/omap/omapfb_main.c b/drivers/video/fbdev/omap/omapfb_main.c index 17cda5765683..1f3df2055ff0 100644 --- a/drivers/video/fbdev/omap/omapfb_main.c +++ b/drivers/video/fbdev/omap/omapfb_main.c @@ -1447,7 +1447,7 @@ static int fbinfo_init(struct omapfb_device *fbdev, struct fb_info *info) info->fbops = &omapfb_ops; info->flags = FBINFO_FLAG_DEFAULT; - strncpy(fix->id, MODULE_NAME, sizeof(fix->id)); + strscpy(fix->id, MODULE_NAME, sizeof(fix->id)); info->pseudo_palette = fbdev->pseudo_palette; @@ -1573,8 +1573,7 @@ static int omapfb_find_ctrl(struct omapfb_device *fbdev) fbdev->ctrl = NULL; - strncpy(name, conf->lcd.ctrl_name, sizeof(name) - 1); - name[sizeof(name) - 1] = '\0'; + strscpy(name, conf->lcd.ctrl_name, sizeof(name)); if (strcmp(name, "internal") == 0) { fbdev->ctrl = fbdev->int_ctrl; -- cgit From 6b90032c73405cd4da29ab914df11fd1be960b99 Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Wed, 28 Dec 2022 09:44:11 +0800 Subject: fbdev: atyfb: use strscpy() to instead of strncpy() The implementation of strscpy() is more robust and safer. That's now the recommended way to copy NUL-terminated strings. Signed-off-by: Xu Panda Signed-off-by: Yang Yang Signed-off-by: Helge Deller --- drivers/video/fbdev/aty/atyfb_base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/video/fbdev/aty/atyfb_base.c b/drivers/video/fbdev/aty/atyfb_base.c index 0ccf5d401ecb..d59215a4992e 100644 --- a/drivers/video/fbdev/aty/atyfb_base.c +++ b/drivers/video/fbdev/aty/atyfb_base.c @@ -3192,8 +3192,7 @@ static void aty_init_lcd(struct atyfb_par *par, u32 bios_base) * which we print to the screen. */ id = *(u8 *)par->lcd_table; - strncpy(model, (char *)par->lcd_table+1, 24); - model[23] = 0; + strscpy(model, (char *)par->lcd_table+1, sizeof(model)); width = par->lcd_width = *(u16 *)(par->lcd_table+25); height = par->lcd_height = *(u16 *)(par->lcd_table+27); -- cgit From 0e50d999903c009b6a9cd2277c82d6798d982e31 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Dec 2022 14:49:00 +0000 Subject: rxrpc: Fix a couple of potential use-after-frees At the end of rxrpc_recvmsg(), if a call is found, the call is put and then a trace line is emitted referencing that call in a couple of places - but the call may have been deallocated by the time those traces happen. Fix this by stashing the call debug_id in a variable and passing that to the tracepoint rather than the call pointer. Fixes: 849979051cbc ("rxrpc: Add a tracepoint to follow what recvmsg does") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. 
Miller --- include/trace/events/rxrpc.h | 6 +++--- net/rxrpc/recvmsg.c | 14 ++++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index c6cfed00d0c6..5f9dd7389536 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1062,10 +1062,10 @@ TRACE_EVENT(rxrpc_receive, ); TRACE_EVENT(rxrpc_recvmsg, - TP_PROTO(struct rxrpc_call *call, enum rxrpc_recvmsg_trace why, + TP_PROTO(unsigned int call_debug_id, enum rxrpc_recvmsg_trace why, int ret), - TP_ARGS(call, why, ret), + TP_ARGS(call_debug_id, why, ret), TP_STRUCT__entry( __field(unsigned int, call ) @@ -1074,7 +1074,7 @@ TRACE_EVENT(rxrpc_recvmsg, ), TP_fast_assign( - __entry->call = call ? call->debug_id : 0; + __entry->call = call_debug_id; __entry->why = why; __entry->ret = ret; ), diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 36b25d003cf0..6ebd6440a2b7 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -388,13 +388,14 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct list_head *l; + unsigned int call_debug_id = 0; size_t copied = 0; long timeo; int ret; DEFINE_WAIT(wait); - trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0); + trace_rxrpc_recvmsg(0, rxrpc_recvmsg_enter, 0); if (flags & (MSG_OOB | MSG_TRUNC)) return -EOPNOTSUPP; @@ -431,7 +432,7 @@ try_again: if (list_empty(&rx->recvmsg_q)) { if (signal_pending(current)) goto wait_interrupted; - trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, 0); + trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0); timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(&rx->sk), &wait); @@ -450,7 +451,8 @@ try_again: rxrpc_get_call(call, rxrpc_call_get_recvmsg); write_unlock(&rx->recvmsg_lock); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0); + call_debug_id = call->debug_id; + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0); /* We're going to drop the socket lock, so we need to lock the call * against interference by sendmsg. @@ -531,7 +533,7 @@ try_again: error_unlock_call: mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_recvmsg); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret); + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_return, ret); return ret; error_requeue_call: @@ -539,14 +541,14 @@ error_requeue_call: write_lock(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); write_unlock(&rx->recvmsg_lock); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0); + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put_recvmsg); } error_no_call: release_sock(&rx->sk); error_trace: - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret); + trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_return, ret); return ret; wait_interrupted: -- cgit From f4ef681115f822daf7f36f8b1892d9f1e1a26fbf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 22 Dec 2022 11:22:47 -0800 Subject: docs: netdev: reshuffle sections in prep for de-FAQization Subsequent changes will reformat the doc away from FAQ. To make that more readable perform the pure section moves now. Reviewed-by: Randy Dunlap Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- Documentation/process/maintainer-netdev.rst | 186 ++++++++++++++-------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/Documentation/process/maintainer-netdev.rst b/Documentation/process/maintainer-netdev.rst index 1fa5ab8754d3..8f22f8a3dcd1 100644 --- a/Documentation/process/maintainer-netdev.rst +++ b/Documentation/process/maintainer-netdev.rst @@ -44,17 +44,6 @@ for the future release. You can find the trees here: - https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git - https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git -How do I indicate which tree (net vs. net-next) my patch should be in? ----------------------------------------------------------------------- -To help maintainers and CI bots you should explicitly mark which tree -your patch is targeting. Assuming that you use git, use the prefix -flag:: - - git format-patch --subject-prefix='PATCH net-next' start..finish - -Use ``net`` instead of ``net-next`` (always lower case) in the above for -bug-fix ``net`` content. - How often do changes from these trees make it to the mainline Linus tree? ------------------------------------------------------------------------- To understand this, you need to know a bit of background information on @@ -127,15 +116,6 @@ patch. Patches are indexed by the ``Message-ID`` header of the emails which carried them so if you have trouble finding your patch append the value of ``Message-ID`` to the URL above. -How long before my patch is accepted? -------------------------------------- -Generally speaking, the patches get triaged quickly (in less than -48h). But be patient, if your patch is active in patchwork (i.e. it's -listed on the project's patch list) the chances it was missed are close to zero. -Asking the maintainer for status updates on your -patch is a good way to ensure your patch is ignored or pushed to the -bottom of the priority list. - Should I directly update patchwork state of my own patches? ----------------------------------------------------------- It may be tempting to help the maintainers and update the state of your @@ -145,19 +125,14 @@ it to the maintainer to figure out what is the most recent and current version that should be applied. If there is any doubt, the maintainer will reply and ask what should be done. -How do I divide my work into patches? +How long before my patch is accepted? ------------------------------------- - -Put yourself in the shoes of the reviewer. Each patch is read separately -and therefore should constitute a comprehensible step towards your stated -goal. - -Avoid sending series longer than 15 patches. Larger series takes longer -to review as reviewers will defer looking at it until they find a large -chunk of time. A small series can be reviewed in a short time, so Maintainers -just do it. As a result, a sequence of smaller series gets merged quicker and -with better review coverage. Re-posting large series also increases the mailing -list traffic. +Generally speaking, the patches get triaged quickly (in less than +48h). But be patient, if your patch is active in patchwork (i.e. it's +listed on the project's patch list) the chances it was missed are close to zero. +Asking the maintainer for status updates on your +patch is a good way to ensure your patch is ignored or pushed to the +bottom of the priority list. I made changes to only a few patches in a patch series should I resend only those changed? 
------------------------------------------------------------------------------------------ @@ -165,17 +140,6 @@ No, please resend the entire patch series and make sure you do number your patches such that it is clear this is the latest and greatest set of patches that can be applied. -I have received review feedback, when should I post a revised version of the patches? -------------------------------------------------------------------------------------- -Allow at least 24 hours to pass between postings. This will ensure reviewers -from all geographical locations have a chance to chime in. Do not wait -too long (weeks) between postings either as it will make it harder for reviewers -to recall all the context. - -Make sure you address all the feedback in your new posting. Do not post a new -version of the code if the discussion about the previous version is still -ongoing, unless directly instructed by a reviewer. - I submitted multiple versions of a patch series and it looks like a version other than the last one has been accepted, what should I do? ---------------------------------------------------------------------------------------------------------------------------------------- There is no revert possible, once it is pushed out, it stays like that. @@ -191,6 +155,82 @@ the case today. Please follow the standard stable rules in :ref:`Documentation/process/stable-kernel-rules.rst `, and make sure you include appropriate Fixes tags! +I found a bug that might have possible security implications or similar. Should I mail the main netdev maintainer off-list? +--------------------------------------------------------------------------------------------------------------------------- +No. The current netdev maintainer has consistently requested that +people use the mailing lists and not reach out directly. If you aren't +OK with that, then perhaps consider mailing security@kernel.org or +reading about http://oss-security.openwall.org/wiki/mailing-lists/distros +as possible alternative mechanisms. + +How do I post corresponding changes to user space components? +------------------------------------------------------------- +User space code exercising kernel features should be posted +alongside kernel patches. This gives reviewers a chance to see +how any new interface is used and how well it works. + +When user space tools reside in the kernel repo itself all changes +should generally come as one series. If series becomes too large +or the user space project is not reviewed on netdev include a link +to a public repo where user space patches can be seen. + +In case user space tooling lives in a separate repository but is +reviewed on netdev (e.g. patches to ``iproute2`` tools) kernel and +user space patches should form separate series (threads) when posted +to the mailing list, e.g.:: + + [PATCH net-next 0/3] net: some feature cover letter + └─ [PATCH net-next 1/3] net: some feature prep + └─ [PATCH net-next 2/3] net: some feature do it + └─ [PATCH net-next 3/3] selftest: net: some feature + + [PATCH iproute2-next] ip: add support for some feature + +Posting as one thread is discouraged because it confuses patchwork +(as of patchwork 2.2.2). + +Any other tips to help ensure my net/net-next patch gets OK'd? +-------------------------------------------------------------- +Attention to detail. Re-read your own work as if you were the +reviewer. You can start with using ``checkpatch.pl``, perhaps even with +the ``--strict`` flag. But do not be mindlessly robotic in doing so. 
+If your change is a bug fix, make sure your commit log indicates the +end-user visible symptom, the underlying reason as to why it happens, +and then if necessary, explain why the fix proposed is the best way to +get things done. Don't mangle whitespace, and as is common, don't +mis-indent function arguments that span multiple lines. If it is your +first patch, mail it to yourself so you can test apply it to an +unpatched tree to confirm infrastructure didn't mangle it. + +Finally, go back and read +:ref:`Documentation/process/submitting-patches.rst ` +to be sure you are not repeating some common mistake documented there. + +How do I indicate which tree (net vs. net-next) my patch should be in? +---------------------------------------------------------------------- +To help maintainers and CI bots you should explicitly mark which tree +your patch is targeting. Assuming that you use git, use the prefix +flag:: + + git format-patch --subject-prefix='PATCH net-next' start..finish + +Use ``net`` instead of ``net-next`` (always lower case) in the above for +bug-fix ``net`` content. + +How do I divide my work into patches? +------------------------------------- + +Put yourself in the shoes of the reviewer. Each patch is read separately +and therefore should constitute a comprehensible step towards your stated +goal. + +Avoid sending series longer than 15 patches. Larger series takes longer +to review as reviewers will defer looking at it until they find a large +chunk of time. A small series can be reviewed in a short time, so Maintainers +just do it. As a result, a sequence of smaller series gets merged quicker and +with better review coverage. Re-posting large series also increases the mailing +list traffic. + Is the comment style convention different for the networking content? --------------------------------------------------------------------- Yes, in a largely trivial way. Instead of this:: @@ -224,13 +264,16 @@ I am working in existing code which uses non-standard formatting. Which formatti Make your code follow the most recent guidelines, so that eventually all code in the domain of netdev is in the preferred format. -I found a bug that might have possible security implications or similar. Should I mail the main netdev maintainer off-list? ---------------------------------------------------------------------------------------------------------------------------- -No. The current netdev maintainer has consistently requested that -people use the mailing lists and not reach out directly. If you aren't -OK with that, then perhaps consider mailing security@kernel.org or -reading about http://oss-security.openwall.org/wiki/mailing-lists/distros -as possible alternative mechanisms. +I have received review feedback, when should I post a revised version of the patches? +------------------------------------------------------------------------------------- +Allow at least 24 hours to pass between postings. This will ensure reviewers +from all geographical locations have a chance to chime in. Do not wait +too long (weeks) between postings either as it will make it harder for reviewers +to recall all the context. + +Make sure you address all the feedback in your new posting. Do not post a new +version of the code if the discussion about the previous version is still +ongoing, unless directly instructed by a reviewer. What level of testing is expected before I submit my change? 
------------------------------------------------------------ @@ -244,32 +287,6 @@ and the patch series contains a set of kernel selftest for You are expected to test your changes on top of the relevant networking tree (``net`` or ``net-next``) and not e.g. a stable tree or ``linux-next``. -How do I post corresponding changes to user space components? -------------------------------------------------------------- -User space code exercising kernel features should be posted -alongside kernel patches. This gives reviewers a chance to see -how any new interface is used and how well it works. - -When user space tools reside in the kernel repo itself all changes -should generally come as one series. If series becomes too large -or the user space project is not reviewed on netdev include a link -to a public repo where user space patches can be seen. - -In case user space tooling lives in a separate repository but is -reviewed on netdev (e.g. patches to ``iproute2`` tools) kernel and -user space patches should form separate series (threads) when posted -to the mailing list, e.g.:: - - [PATCH net-next 0/3] net: some feature cover letter - └─ [PATCH net-next 1/3] net: some feature prep - └─ [PATCH net-next 2/3] net: some feature do it - └─ [PATCH net-next 3/3] selftest: net: some feature - - [PATCH iproute2-next] ip: add support for some feature - -Posting as one thread is discouraged because it confuses patchwork -(as of patchwork 2.2.2). - Can I reproduce the checks from patchwork on my local machine? -------------------------------------------------------------- @@ -303,23 +320,6 @@ it has a real, in-tree user. Mock-ups and tests based on ``netdevsim`` are strongly encouraged when adding new APIs, but ``netdevsim`` in itself is **not** considered a use case/user. -Any other tips to help ensure my net/net-next patch gets OK'd? --------------------------------------------------------------- -Attention to detail. Re-read your own work as if you were the -reviewer. You can start with using ``checkpatch.pl``, perhaps even with -the ``--strict`` flag. But do not be mindlessly robotic in doing so. -If your change is a bug fix, make sure your commit log indicates the -end-user visible symptom, the underlying reason as to why it happens, -and then if necessary, explain why the fix proposed is the best way to -get things done. Don't mangle whitespace, and as is common, don't -mis-indent function arguments that span multiple lines. If it is your -first patch, mail it to yourself so you can test apply it to an -unpatched tree to confirm infrastructure didn't mangle it. - -Finally, go back and read -:ref:`Documentation/process/submitting-patches.rst ` -to be sure you are not repeating some common mistake documented there. - My company uses peer feedback in employee performance reviews. Can I ask netdev maintainers for feedback? --------------------------------------------------------------------------------------------------------- -- cgit From ff249be5cca9f982e58936847ba6c30104abbcad Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 22 Dec 2022 11:22:48 -0800 Subject: docs: netdev: convert to a non-FAQ document The netdev-FAQ document has grown over the years to the point where finding information in it is somewhat challenging. The length of the questions prevents readers from locating content that's relevant at a glance. Convert to a more standard documentation format with sections and sub-sections rather than questions and answers. 
The content edits are limited to what's necessary to change the format, and very minor clarifications. Reviewed-by: Randy Dunlap Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/process/maintainer-netdev.rst | 221 ++++++++++++++++------------ 1 file changed, 125 insertions(+), 96 deletions(-) diff --git a/Documentation/process/maintainer-netdev.rst b/Documentation/process/maintainer-netdev.rst index 8f22f8a3dcd1..4a75686d35ab 100644 --- a/Documentation/process/maintainer-netdev.rst +++ b/Documentation/process/maintainer-netdev.rst @@ -2,9 +2,9 @@ .. _netdev-FAQ: -========== -netdev FAQ -========== +============================= +Networking subsystem (netdev) +============================= tl;dr ----- @@ -15,14 +15,15 @@ tl;dr - don't repost your patches within one 24h period - reverse xmas tree -What is netdev? ---------------- -It is a mailing list for all network-related Linux stuff. This +netdev +------ + +netdev is a mailing list for all network-related Linux stuff. This includes anything found under net/ (i.e. core code like IPv6) and drivers/net (i.e. hardware specific drivers) in the Linux source tree. Note that some subsystems (e.g. wireless drivers) which have a high -volume of traffic have their own specific mailing lists. +volume of traffic have their own specific mailing lists and trees. The netdev list is managed (like many other Linux mailing lists) through VGER (http://vger.kernel.org/) with archives available at @@ -32,21 +33,10 @@ Aside from subsystems like those mentioned above, all network-related Linux development (i.e. RFC, review, comments, etc.) takes place on netdev. -How do the changes posted to netdev make their way into Linux? --------------------------------------------------------------- -There are always two trees (git repositories) in play. Both are -driven by David Miller, the main network maintainer. There is the -``net`` tree, and the ``net-next`` tree. As you can probably guess from -the names, the ``net`` tree is for fixes to existing code already in the -mainline tree from Linus, and ``net-next`` is where the new code goes -for the future release. You can find the trees here: - -- https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git -- https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git +Development cycle +----------------- -How often do changes from these trees make it to the mainline Linus tree? -------------------------------------------------------------------------- -To understand this, you need to know a bit of background information on +Here is a bit of background information on the cadence of Linux development. Each new release starts off with a two week "merge window" where the main maintainers feed their new stuff to Linus for merging into the mainline tree. After the two weeks, the @@ -58,9 +48,33 @@ rc2 is released. This repeats on a roughly weekly basis until rc7 state of churn), and a week after the last vX.Y-rcN was done, the official vX.Y is released. -Relating that to netdev: At the beginning of the 2-week merge window, -the ``net-next`` tree will be closed - no new changes/features. The -accumulated new content of the past ~10 weeks will be passed onto +To find out where we are now in the cycle - load the mainline (Linus) +page here: + + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git + +and note the top of the "tags" section. If it is rc1, it is early in +the dev cycle. 
If it was tagged rc7 a week ago, then a release is +probably imminent. If the most recent tag is a final release tag +(without an ``-rcN`` suffix) - we are most likely in a merge window +and ``net-next`` is closed. + +git trees and patch flow +------------------------ + +There are two networking trees (git repositories) in play. Both are +driven by David Miller, the main network maintainer. There is the +``net`` tree, and the ``net-next`` tree. As you can probably guess from +the names, the ``net`` tree is for fixes to existing code already in the +mainline tree from Linus, and ``net-next`` is where the new code goes +for the future release. You can find the trees here: + +- https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git +- https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git + +Relating that to kernel development: At the beginning of the 2-week +merge window, the ``net-next`` tree will be closed - no new changes/features. +The accumulated new content of the past ~10 weeks will be passed onto mainline/Linus via a pull request for vX.Y -- at the same time, the ``net`` tree will start accumulating fixes for this pulled content relating to vX.Y @@ -92,22 +106,14 @@ focus for ``net`` is on stabilization and bug fixes. Finally, the vX.Y gets released, and the whole cycle starts over. -So where are we now in this cycle? ----------------------------------- +netdev patch review +------------------- -Load the mainline (Linus) page here: +Patch status +~~~~~~~~~~~~ - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git - -and note the top of the "tags" section. If it is rc1, it is early in -the dev cycle. If it was tagged rc7 a week ago, then a release is -probably imminent. If the most recent tag is a final release tag -(without an ``-rcN`` suffix) - we are most likely in a merge window -and ``net-next`` is closed. - -How can I tell the status of a patch I've sent? ------------------------------------------------ -Start by looking at the main patchworks queue for netdev: +Status of a patch can be checked by looking at the main patchwork +queue for netdev: https://patchwork.kernel.org/project/netdevbpf/list/ @@ -116,17 +122,20 @@ patch. Patches are indexed by the ``Message-ID`` header of the emails which carried them so if you have trouble finding your patch append the value of ``Message-ID`` to the URL above. -Should I directly update patchwork state of my own patches? ------------------------------------------------------------ +Updating patch status +~~~~~~~~~~~~~~~~~~~~~ + It may be tempting to help the maintainers and update the state of your -own patches when you post a new version or spot a bug. Please do not do that. +own patches when you post a new version or spot a bug. Please **do not** +do that. Interfering with the patch status on patchwork will only cause confusion. Leave it to the maintainer to figure out what is the most recent and current version that should be applied. If there is any doubt, the maintainer will reply and ask what should be done. -How long before my patch is accepted? -------------------------------------- +Review timelines +~~~~~~~~~~~~~~~~ + Generally speaking, the patches get triaged quickly (in less than 48h). But be patient, if your patch is active in patchwork (i.e. it's listed on the project's patch list) the chances it was missed are close to zero. @@ -134,37 +143,47 @@ Asking the maintainer for status updates on your patch is a good way to ensure your patch is ignored or pushed to the bottom of the priority list. 
-I made changes to only a few patches in a patch series should I resend only those changed? ------------------------------------------------------------------------------------------- -No, please resend the entire patch series and make sure you do number your +Partial resends +~~~~~~~~~~~~~~~ + +Please always resend the entire patch series and make sure you do number your patches such that it is clear this is the latest and greatest set of patches -that can be applied. +that can be applied. Do not try to resend just the patches which changed. + +Handling misapplied patches +~~~~~~~~~~~~~~~~~~~~~~~~~~~ -I submitted multiple versions of a patch series and it looks like a version other than the last one has been accepted, what should I do? ----------------------------------------------------------------------------------------------------------------------------------------- +Occasionally a patch series gets applied before receiving critical feedback, +or the wrong version of a series gets applied. There is no revert possible, once it is pushed out, it stays like that. Please send incremental versions on top of what has been merged in order to fix the patches the way they would look like if your latest patch series was to be merged. -Are there special rules regarding stable submissions on netdev? ---------------------------------------------------------------- +Stable tree +~~~~~~~~~~~ + While it used to be the case that netdev submissions were not supposed to carry explicit ``CC: stable@vger.kernel.org`` tags that is no longer the case today. Please follow the standard stable rules in :ref:`Documentation/process/stable-kernel-rules.rst `, and make sure you include appropriate Fixes tags! -I found a bug that might have possible security implications or similar. Should I mail the main netdev maintainer off-list? ---------------------------------------------------------------------------------------------------------------------------- -No. The current netdev maintainer has consistently requested that +Security fixes +~~~~~~~~~~~~~~ + +Do not email netdev maintainers directly if you think you discovered +a bug that might have possible security implications. +The current netdev maintainer has consistently requested that people use the mailing lists and not reach out directly. If you aren't OK with that, then perhaps consider mailing security@kernel.org or reading about http://oss-security.openwall.org/wiki/mailing-lists/distros as possible alternative mechanisms. -How do I post corresponding changes to user space components? -------------------------------------------------------------- + +Co-posting changes to user space components +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + User space code exercising kernel features should be posted alongside kernel patches. This gives reviewers a chance to see how any new interface is used and how well it works. @@ -189,9 +208,10 @@ to the mailing list, e.g.:: Posting as one thread is discouraged because it confuses patchwork (as of patchwork 2.2.2). -Any other tips to help ensure my net/net-next patch gets OK'd? --------------------------------------------------------------- -Attention to detail. Re-read your own work as if you were the +Preparing changes +----------------- + +Attention to detail is important. Re-read your own work as if you were the reviewer. You can start with using ``checkpatch.pl``, perhaps even with the ``--strict`` flag. But do not be mindlessly robotic in doing so. 
If your change is a bug fix, make sure your commit log indicates the @@ -206,8 +226,9 @@ Finally, go back and read :ref:`Documentation/process/submitting-patches.rst ` to be sure you are not repeating some common mistake documented there. -How do I indicate which tree (net vs. net-next) my patch should be in? ----------------------------------------------------------------------- +Indicating target tree +~~~~~~~~~~~~~~~~~~~~~~ + To help maintainers and CI bots you should explicitly mark which tree your patch is targeting. Assuming that you use git, use the prefix flag:: @@ -217,8 +238,8 @@ flag:: Use ``net`` instead of ``net-next`` (always lower case) in the above for bug-fix ``net`` content. -How do I divide my work into patches? -------------------------------------- +Dividing work into patches +~~~~~~~~~~~~~~~~~~~~~~~~~~ Put yourself in the shoes of the reviewer. Each patch is read separately and therefore should constitute a comprehensible step towards your stated @@ -231,9 +252,11 @@ just do it. As a result, a sequence of smaller series gets merged quicker and with better review coverage. Re-posting large series also increases the mailing list traffic. -Is the comment style convention different for the networking content? ---------------------------------------------------------------------- -Yes, in a largely trivial way. Instead of this:: +Multi-line comments +~~~~~~~~~~~~~~~~~~~ + +Comment style convention is slightly different for networking and most of +the tree. Instead of this:: /* * foobar blah blah blah @@ -246,8 +269,8 @@ it is requested that you make it look like this:: * another line of text */ -What is "reverse xmas tree"? ----------------------------- +Local variable ordering ("reverse xmas tree", "RCS") +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Netdev has a convention for ordering local variables in functions. Order the variable declaration lines longest to shortest, e.g.:: @@ -259,13 +282,16 @@ Order the variable declaration lines longest to shortest, e.g.:: If there are dependencies between the variables preventing the ordering move the initialization out of line. -I am working in existing code which uses non-standard formatting. Which formatting should I use? ------------------------------------------------------------------------------------------------- -Make your code follow the most recent guidelines, so that eventually all code +Format precedence +~~~~~~~~~~~~~~~~~ + +When working in existing code which uses nonstandard formatting make +your code follow the most recent guidelines, so that eventually all code in the domain of netdev is in the preferred format. -I have received review feedback, when should I post a revised version of the patches? -------------------------------------------------------------------------------------- +Resending after review +~~~~~~~~~~~~~~~~~~~~~~ + Allow at least 24 hours to pass between postings. This will ensure reviewers from all geographical locations have a chance to chime in. Do not wait too long (weeks) between postings either as it will make it harder for reviewers @@ -275,8 +301,12 @@ Make sure you address all the feedback in your new posting. Do not post a new version of the code if the discussion about the previous version is still ongoing, unless directly instructed by a reviewer. -What level of testing is expected before I submit my change? 
------------------------------------------------------------- +Testing +------- + +Expected level of testing +~~~~~~~~~~~~~~~~~~~~~~~~~ + At the very minimum your changes must survive an ``allyesconfig`` and an ``allmodconfig`` build with ``W=1`` set without new warnings or failures. @@ -287,43 +317,42 @@ and the patch series contains a set of kernel selftest for You are expected to test your changes on top of the relevant networking tree (``net`` or ``net-next``) and not e.g. a stable tree or ``linux-next``. -Can I reproduce the checks from patchwork on my local machine? --------------------------------------------------------------- +patchwork checks +~~~~~~~~~~~~~~~~ Checks in patchwork are mostly simple wrappers around existing kernel scripts, the sources are available at: https://github.com/kuba-moo/nipa/tree/master/tests -Running all the builds and checks locally is a pain, can I post my patches and have the patchwork bot validate them? --------------------------------------------------------------------------------------------------------------------- - -No, you must ensure that your patches are ready by testing them locally +**Do not** post your patches just to run them through the checks. +You must ensure that your patches are ready by testing them locally before posting to the mailing list. The patchwork build bot instance gets overloaded very easily and netdev@vger really doesn't need more traffic if we can help it. -netdevsim is great, can I extend it for my out-of-tree tests? -------------------------------------------------------------- +netdevsim +~~~~~~~~~ -No, ``netdevsim`` is a test vehicle solely for upstream tests. -(Please add your tests under ``tools/testing/selftests/``.) +``netdevsim`` is a test driver which can be used to exercise driver +configuration APIs without requiring capable hardware. +Mock-ups and tests based on ``netdevsim`` are strongly encouraged when +adding new APIs, but ``netdevsim`` in itself is **not** considered +a use case/user. You must also implement the new APIs in a real driver. -We also give no guarantees that ``netdevsim`` won't change in the future +We give no guarantees that ``netdevsim`` won't change in the future in a way which would break what would normally be considered uAPI. -Is netdevsim considered a "user" of an API? -------------------------------------------- - -Linux kernel has a long standing rule that no API should be added unless -it has a real, in-tree user. Mock-ups and tests based on ``netdevsim`` are -strongly encouraged when adding new APIs, but ``netdevsim`` in itself -is **not** considered a use case/user. +``netdevsim`` is reserved for use by upstream tests only, so any +new ``netdevsim`` features must be accompanied by selftests under +``tools/testing/selftests/``. -My company uses peer feedback in employee performance reviews. Can I ask netdev maintainers for feedback? ---------------------------------------------------------------------------------------------------------- +Testimonials / feedback +----------------------- -Yes, especially if you spend significant amount of time reviewing code +Some companies use peer feedback in employee performance reviews. +Please feel free to request feedback from netdev maintainers, +especially if you spend significant amount of time reviewing code and go out of your way to improve shared infrastructure. 
The feedback must be requested by you, the contributor, and will always -- cgit From b9e05399d9273c8c066e73db1e6e85364003030c Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Mon, 10 Oct 2022 10:27:03 -0700 Subject: vdpa: merge functionally duplicated dev_features attributes We can merge VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES with VDPA_ATTR_DEV_FEATURES which is functionally equivalent. While at it, tweak the comment in header file to make user-provisioned device features distinguished from those supported by the parent mgmtdev device: the former of which can be inherited as a whole from the latter, or can be a subset of the latter if explicitly specified. Signed-off-by: Si-Wei Liu Message-Id: <1665422823-18364-1-git-send-email-si-wei.liu@oracle.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa.c | 2 +- include/uapi/linux/vdpa.h | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index febdc99b51a7..41ed56362992 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -855,7 +855,7 @@ static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *ms features_device = vdev->config->get_device_features(vdev); - if (nla_put_u64_64bit(msg, VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES, features_device, + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h index 9bd79235c875..54b649ab0f22 100644 --- a/include/uapi/linux/vdpa.h +++ b/include/uapi/linux/vdpa.h @@ -53,11 +53,9 @@ enum vdpa_attr { VDPA_ATTR_DEV_VENDOR_ATTR_NAME, /* string */ VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, /* u64 */ + /* virtio features that are provisioned to the vDPA device */ VDPA_ATTR_DEV_FEATURES, /* u64 */ - /* virtio features that are supported by the vDPA device */ - VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES, /* u64 */ - /* new attributes must be added above here */ VDPA_ATTR_MAX, }; -- cgit From c262f75cb6bb5a63828e72ce3b8fe808e5029479 Mon Sep 17 00:00:00 2001 From: Ricardo Cañuelo Date: Wed, 12 Oct 2022 08:29:49 +0200 Subject: tools/virtio: initialize spinlocks in vring_test.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The virtio_device vqs_list spinlocks must be initialized before use to prevent functions that manipulate the device virtualqueues, such as vring_new_virtqueue(), from blocking indefinitely. Signed-off-by: Ricardo Cañuelo Message-Id: <20221012062949.1526176-1-ricardo.canuelo@collabora.com> Signed-off-by: Michael S.
Tsirkin Reviewed-by: Xuan Zhuo --- tools/virtio/vringh_test.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c index fa87b58bd5fa..98ff808d6f0c 100644 --- a/tools/virtio/vringh_test.c +++ b/tools/virtio/vringh_test.c @@ -308,6 +308,7 @@ static int parallel_test(u64 features, gvdev.vdev.features = features; INIT_LIST_HEAD(&gvdev.vdev.vqs); + spin_lock_init(&gvdev.vdev.vqs_list_lock); gvdev.to_host_fd = to_host[1]; gvdev.notifies = 0; @@ -455,6 +456,7 @@ int main(int argc, char *argv[]) getrange = getrange_iov; vdev.features = 0; INIT_LIST_HEAD(&vdev.vqs); + spin_lock_init(&vdev.vqs_list_lock); while (argv[1]) { if (strcmp(argv[1], "--indirect") == 0) -- cgit From 258896fcc786b4e7db238eba26f6dd080e0ff41e Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Sat, 15 Oct 2022 23:41:26 -0400 Subject: virtio-blk: use a helper to handle request queuing errors Define a new helper function, virtblk_fail_to_queue(), to clean up the error handling code in virtio_queue_rq(). Signed-off-by: Dmitry Fomichev Message-Id: <20221016034127.330942-2-dmitry.fomichev@wdc.com> Signed-off-by: Michael S. Tsirkin --- drivers/block/virtio_blk.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 68bd2f7961b3..271a9878fa8b 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -315,6 +315,19 @@ static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx) virtqueue_notify(vq->vq); } +static blk_status_t virtblk_fail_to_queue(struct request *req, int rc) +{ + virtblk_cleanup_cmd(req); + switch (rc) { + case -ENOSPC: + return BLK_STS_DEV_RESOURCE; + case -ENOMEM: + return BLK_STS_RESOURCE; + default: + return BLK_STS_IOERR; + } +} + static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx, struct virtio_blk *vblk, struct request *req, @@ -327,10 +340,8 @@ static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx, return status; vbr->sg_table.nents = virtblk_map_data(hctx, req, vbr); - if (unlikely(vbr->sg_table.nents < 0)) { - virtblk_cleanup_cmd(req); - return BLK_STS_RESOURCE; - } + if (unlikely(vbr->sg_table.nents < 0)) + return virtblk_fail_to_queue(req, -ENOMEM); blk_mq_start_request(req); @@ -364,15 +375,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_stop_hw_queue(hctx); spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); virtblk_unmap_data(req, vbr); - virtblk_cleanup_cmd(req); - switch (err) { - case -ENOSPC: - return BLK_STS_DEV_RESOURCE; - case -ENOMEM: - return BLK_STS_RESOURCE; - default: - return BLK_STS_IOERR; - } + return virtblk_fail_to_queue(req, err); } if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) -- cgit From 8e6a8d7a3dd93e93645be061692cb4ee6702dff0 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 26 Dec 2022 16:13:27 +0900 Subject: net: ethernet: renesas: rswitch: Fix error path in renesas_eth_sw_probe() If rswitch_init() returns non-zero and this driver is re-probed, the following error happens: renesas_eth_sw e6880000.ethernet: Unbalanced pm_runtime_enable! So, fix error path in renesas_eth_sw_probe(). Fixes: 3590918b5d07 ("net: ethernet: renesas: Add support for "Ethernet Switch"") Signed-off-by: Yoshihiro Shimoda Signed-off-by: David S. 
Miller --- drivers/net/ethernet/renesas/rswitch.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index e42ceaa0099f..473d86bdf97d 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1786,6 +1786,11 @@ static int renesas_eth_sw_probe(struct platform_device *pdev) pm_runtime_get_sync(&pdev->dev); ret = rswitch_init(priv); + if (ret < 0) { + pm_runtime_put(&pdev->dev); + pm_runtime_disable(&pdev->dev); + return ret; + } device_set_wakeup_capable(&pdev->dev, 1); -- cgit From bd2adfe3b3b863c883309bcc915f13c831ca88da Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 26 Dec 2022 16:13:28 +0900 Subject: net: ethernet: renesas: rswitch: Fix getting mac address from device tree To get mac address from device tree which is from each ethernet-port, fix the first argument of of_get_ethdev_address(). Fixes: 3590918b5d07 ("net: ethernet: renesas: Add support for "Ethernet Switch"") Signed-off-by: Yoshihiro Shimoda Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/rswitch.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 473d86bdf97d..6441892636db 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1578,6 +1578,7 @@ static int rswitch_device_alloc(struct rswitch_private *priv, int index) { struct platform_device *pdev = priv->pdev; struct rswitch_device *rdev; + struct device_node *port; struct net_device *ndev; int err; @@ -1606,7 +1607,9 @@ static int rswitch_device_alloc(struct rswitch_private *priv, int index) netif_napi_add(ndev, &rdev->napi, rswitch_poll); - err = of_get_ethdev_address(pdev->dev.of_node, ndev); + port = rswitch_get_port_node(rdev); + err = of_get_ethdev_address(port, ndev); + of_node_put(port); if (err) { if (is_valid_ether_addr(rdev->etha->mac_addr)) eth_hw_addr_set(ndev, rdev->etha->mac_addr); -- cgit From 0020ae2a4aa81becd182231bf48acd66c86c86dd Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Mon, 26 Dec 2022 22:19:36 -0500 Subject: bnxt_en: fix devlink port registration to netdev We don't register a devlink port in case of a VF so avoid setting the devlink pointer to netdev. Also, SET_NETDEV_DEVLINK_PORT has to be moved so that we determine whether the device is PF/VF first. This fixes the NULL pointer dereference of devlink_port->devlink when creating VFs: BUG: kernel NULL pointer dereference, address: 0000000000000160 PGD 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 14 PID: 388 Comm: kworker/14:1 Kdump: loaded Not tainted 6.1.0-rc8 #5 Hardware name: Dell Inc. 
PowerEdge R750/06V45N, BIOS 1.3.8 08/31/2021 Workqueue: events work_for_cpu_fn RIP: 0010:devlink_nl_port_handle_size+0xb/0x50 Code: 83 c4 10 5b 5d c3 cc cc cc cc b8 a6 ff ff ff eb de e8 c9 59 21 00 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 53 48 8b 47 20 <48> 8b a8 60 01 00 00 48 8b 45 60 48 8b 38 e8 92 90 1a 00 48 8b 7d RSP: 0018:ff4fe5394846fcd8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000794 RCX: 0000000000000000 RDX: ff1f129683a30a40 RSI: 0000000000000008 RDI: ff1f1296bb496188 RBP: 0000000000000334 R08: 0000000000000cc0 R09: 0000000000000000 R10: ff1f1296bb494298 R11: ffffffffffffffc0 R12: 0000000000000000 R13: 0000000000000000 R14: ff1f1296bb494000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ff1f129e5fa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000160 CR3: 000000131f610006 CR4: 0000000000771ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: if_nlmsg_size+0x14a/0x220 rtmsg_ifinfo_build_skb+0x3c/0x100 rtmsg_ifinfo+0x9c/0xc0 register_netdevice+0x59d/0x670 register_netdev+0x1c/0x40 bnxt_init_one+0x674/0xa60 [bnxt_en] local_pci_probe+0x42/0x80 work_for_cpu_fn+0x13/0x20 process_one_work+0x1e2/0x3b0 ? rescuer_thread+0x390/0x390 worker_thread+0x1c4/0x3a0 ? rescuer_thread+0x390/0x390 kthread+0xd6/0x100 ? kthread_complete_and_exit+0x20/0x20 Fixes: ac73d4bf2cda ("net: make drivers to use SET_NETDEV_DEVLINK_PORT to set devlink_port") Cc: Jiri Pirko Signed-off-by: Vikas Gupta Reviewed-by: Andy Gospodarek Reviewed-by: Kalesh Anakkur Purayil Reviewed-by: Damodharam Ammepalli Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 4c7d07c684c4..93d32b333007 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -13591,7 +13591,6 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) return -ENOMEM; bp = netdev_priv(dev); - SET_NETDEV_DEVLINK_PORT(dev, &bp->dl_port); bp->board_idx = ent->driver_data; bp->msg_enable = BNXT_DEF_MSG_ENABLE; bnxt_set_max_func_irqs(bp, max_irqs); @@ -13599,6 +13598,10 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (bnxt_vf_pciid(bp->board_idx)) bp->flags |= BNXT_FLAG_VF; + /* No devlink port registration in case of a VF */ + if (BNXT_PF(bp)) + SET_NETDEV_DEVLINK_PORT(dev, &bp->dl_port); + if (pdev->msix_cap) bp->flags |= BNXT_FLAG_MSIX_CAP; -- cgit From bbfc17e50ba2ed18dfef46b1c433d50a58566bf1 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 26 Dec 2022 22:19:37 -0500 Subject: bnxt_en: Simplify bnxt_xdp_buff_init() bnxt_xdp_buff_init() does not modify the data_ptr or the len parameters, so no need to pass in the addresses of these parameters. Fixes: b231c3f3414c ("bnxt: refactor bnxt_rx_xdp to separate xdp_init_buff/xdp_prepare_buff") Reviewed-by: Andy Gospodarek Reviewed-by: Somnath Kotur Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 6 +++--- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 93d32b333007..b8639b7e6b2b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1925,7 +1925,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, dma_addr = rx_buf->mapping; if (bnxt_xdp_attached(bp, rxr)) { - bnxt_xdp_buff_init(bp, rxr, cons, &data_ptr, &len, &xdp); + bnxt_xdp_buff_init(bp, rxr, cons, data_ptr, len, &xdp); if (agg_bufs) { u32 frag_len = bnxt_rx_agg_pages_xdp(bp, cpr, &xdp, cp_cons, agg_bufs, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index c3065ec0a479..1847f191577d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -177,7 +177,7 @@ bool bnxt_xdp_attached(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) } void bnxt_xdp_buff_init(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, - u16 cons, u8 **data_ptr, unsigned int *len, + u16 cons, u8 *data_ptr, unsigned int len, struct xdp_buff *xdp) { struct bnxt_sw_rx_bd *rx_buf; @@ -191,13 +191,13 @@ void bnxt_xdp_buff_init(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, offset = bp->rx_offset; mapping = rx_buf->mapping - bp->rx_dma_offset; - dma_sync_single_for_cpu(&pdev->dev, mapping + offset, *len, bp->rx_dir); + dma_sync_single_for_cpu(&pdev->dev, mapping + offset, len, bp->rx_dir); if (bp->xdp_has_frags) buflen = BNXT_PAGE_MODE_BUF_SIZE + offset; xdp_init_buff(xdp, buflen, &rxr->xdp_rxq); - xdp_prepare_buff(xdp, *data_ptr - offset, offset, *len, false); + xdp_prepare_buff(xdp, data_ptr - offset, offset, len, false); } void bnxt_xdp_buff_frags_free(struct bnxt_rx_ring_info *rxr, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h index 505911ae095d..2bbdb8e7c506 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h @@ -27,7 +27,7 @@ int bnxt_xdp_xmit(struct net_device *dev, int num_frames, bool bnxt_xdp_attached(struct bnxt *bp, struct bnxt_rx_ring_info *rxr); void bnxt_xdp_buff_init(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, - u16 cons, u8 **data_ptr, unsigned int *len, + u16 cons, u8 *data_ptr, unsigned int len, struct xdp_buff *xdp); void bnxt_xdp_buff_frags_free(struct bnxt_rx_ring_info *rxr, struct xdp_buff *xdp); -- cgit From 9b3e607871ea5ee90f10f5be3965fc07f2aa3ef7 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 26 Dec 2022 22:19:38 -0500 Subject: bnxt_en: Fix XDP RX path The XDP program can change the starting address of the RX data buffer and this information needs to be passed back from bnxt_rx_xdp() to bnxt_rx_pkt() for the XDP_PASS case so that the SKB can point correctly to the modified buffer address. Add back the data_ptr parameter to bnxt_rx_xdp() to make this work. Fixes: b231c3f3414c ("bnxt: refactor bnxt_rx_xdp to separate xdp_init_buff/xdp_prepare_buff") Reviewed-by: Andy Gospodarek Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 7 +++++-- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index b8639b7e6b2b..1acabfe26db1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1940,7 +1940,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, } if (xdp_active) { - if (bnxt_rx_xdp(bp, rxr, cons, xdp, data, &len, event)) { + if (bnxt_rx_xdp(bp, rxr, cons, xdp, data, &data_ptr, &len, event)) { rc = 1; goto next_rx; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 1847f191577d..2ceeaa818c1c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -222,7 +222,8 @@ void bnxt_xdp_buff_frags_free(struct bnxt_rx_ring_info *rxr, * false - packet should be passed to the stack. */ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, - struct xdp_buff xdp, struct page *page, unsigned int *len, u8 *event) + struct xdp_buff xdp, struct page *page, u8 **data_ptr, + unsigned int *len, u8 *event) { struct bpf_prog *xdp_prog = READ_ONCE(rxr->xdp_prog); struct bnxt_tx_ring_info *txr; @@ -255,8 +256,10 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, *event &= ~BNXT_RX_EVENT; *len = xdp.data_end - xdp.data; - if (orig_data != xdp.data) + if (orig_data != xdp.data) { offset = xdp.data - xdp.data_hard_start; + *data_ptr = xdp.data_hard_start + offset; + } switch (act) { case XDP_PASS: diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h index 2bbdb8e7c506..ea430d6961df 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h @@ -18,8 +18,8 @@ struct bnxt_sw_tx_bd *bnxt_xmit_bd(struct bnxt *bp, struct xdp_buff *xdp); void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts); bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, - struct xdp_buff xdp, struct page *page, unsigned int *len, - u8 *event); + struct xdp_buff xdp, struct page *page, u8 **data_ptr, + unsigned int *len, u8 *event); int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp); int bnxt_xdp_xmit(struct net_device *dev, int num_frames, struct xdp_frame **frames, u32 flags); -- cgit From 1abeacc1979fa4a756695f5030791d8f0fa934b9 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 26 Dec 2022 22:19:39 -0500 Subject: bnxt_en: Fix first buffer size calculations for XDP multi-buffer The size of the first buffer is always page size, and the useable space is the page size minus the offset and the skb_shared_info size. Make sure SKB and XDP buf sizes match so that the skb_shared_info is at the same offset seen from the SKB and XDP_BUF. build_skb() should be passed PAGE_SIZE. xdp_init_buff() should be passed PAGE_SIZE as well. xdp_get_shared_info_from_buff() will automatically deduct the skb_shared_info size if the XDP buffer has frags. There is no need to keep bp->xdp_has_frags. Change BNXT_PAGE_MODE_BUF_SIZE to BNXT_MAX_PAGE_MODE_MTU_SBUF since this constant is really the MTU with ethernet header size subtracted. Also fix the BNXT_MAX_PAGE_MODE_MTU macro with proper parentheses. 
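[Editorial illustration, not part of the patch: a minimal user-space sketch of the first-buffer arithmetic described above. The numeric values are assumptions standing in for PAGE_SIZE, XDP_PACKET_HEADROOM and SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); the driver derives them from the real kernel macros rather than these constants.]

	#include <stdio.h>

	/* Assumed stand-in values; see note above. */
	#define PAGE_SIZE_BYTES  4096u  /* PAGE_SIZE */
	#define HEADROOM_BYTES    256u  /* XDP_PACKET_HEADROOM */
	#define SHINFO_BYTES      320u  /* aligned skb_shared_info size */

	int main(void)
	{
		/* The whole page is handed to build_skb()/xdp_init_buff(). */
		unsigned int buf_len = PAGE_SIZE_BYTES;

		/* Usable packet space = page - headroom - tail skb_shared_info. */
		unsigned int rx_size = buf_len - HEADROOM_BYTES - SHINFO_BYTES;

		printf("first buffer: %u bytes total, %u usable for packet data\n",
		       buf_len, rx_size);
		return 0;
	}

With the assumed 4 KiB page this yields 3520 usable bytes, which is why the SKB and the XDP buffer must agree on where skb_shared_info begins.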
Fixes: 32861236190b ("bnxt: change receive ring space parameters") Reviewed-by: Somnath Kotur Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 +++++---- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 15 +++++++++++---- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 7 +------ 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1acabfe26db1..a21c6829e301 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -991,8 +991,7 @@ static struct sk_buff *bnxt_rx_multi_page_skb(struct bnxt *bp, dma_addr -= bp->rx_dma_offset; dma_unmap_page_attrs(&bp->pdev->dev, dma_addr, PAGE_SIZE, bp->rx_dir, DMA_ATTR_WEAK_ORDERING); - skb = build_skb(page_address(page), BNXT_PAGE_MODE_BUF_SIZE + - bp->rx_dma_offset); + skb = build_skb(page_address(page), PAGE_SIZE); if (!skb) { __free_page(page); return NULL; @@ -3969,8 +3968,10 @@ void bnxt_set_ring_params(struct bnxt *bp) bp->rx_agg_ring_mask = (bp->rx_agg_nr_pages * RX_DESC_CNT) - 1; if (BNXT_RX_PAGE_MODE(bp)) { - rx_space = BNXT_PAGE_MODE_BUF_SIZE; - rx_size = BNXT_MAX_PAGE_MODE_MTU; + rx_space = PAGE_SIZE; + rx_size = PAGE_SIZE - + ALIGN(max(NET_SKB_PAD, XDP_PACKET_HEADROOM), 8) - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); } else { rx_size = SKB_DATA_ALIGN(BNXT_RX_COPY_THRESH + NET_IP_ALIGN); rx_space = rx_size + NET_SKB_PAD + diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 41c6dd0ae447..5163ef4a49ea 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -591,12 +591,20 @@ struct nqe_cn { #define BNXT_RX_PAGE_SIZE (1 << BNXT_RX_PAGE_SHIFT) #define BNXT_MAX_MTU 9500 -#define BNXT_PAGE_MODE_BUF_SIZE \ + +/* First RX buffer page in XDP multi-buf mode + * + * +-------------------------------------------------------------------------+ + * | XDP_PACKET_HEADROOM | bp->rx_buf_use_size | skb_shared_info| + * | (bp->rx_dma_offset) | | | + * +-------------------------------------------------------------------------+ + */ +#define BNXT_MAX_PAGE_MODE_MTU_SBUF \ ((unsigned int)PAGE_SIZE - VLAN_ETH_HLEN - NET_IP_ALIGN - \ XDP_PACKET_HEADROOM) #define BNXT_MAX_PAGE_MODE_MTU \ - BNXT_PAGE_MODE_BUF_SIZE - \ - SKB_DATA_ALIGN((unsigned int)sizeof(struct skb_shared_info)) + (BNXT_MAX_PAGE_MODE_MTU_SBUF - \ + SKB_DATA_ALIGN((unsigned int)sizeof(struct skb_shared_info))) #define BNXT_MIN_PKT_SIZE 52 @@ -2134,7 +2142,6 @@ struct bnxt { #define BNXT_DUMP_CRASH 1 struct bpf_prog *xdp_prog; - u8 xdp_has_frags; struct bnxt_ptp_cfg *ptp_cfg; u8 ptp_all_rx_tstamp; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 2ceeaa818c1c..36d5202c0aee 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -193,9 +193,6 @@ void bnxt_xdp_buff_init(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, mapping = rx_buf->mapping - bp->rx_dma_offset; dma_sync_single_for_cpu(&pdev->dev, mapping + offset, len, bp->rx_dir); - if (bp->xdp_has_frags) - buflen = BNXT_PAGE_MODE_BUF_SIZE + offset; - xdp_init_buff(xdp, buflen, &rxr->xdp_rxq); xdp_prepare_buff(xdp, data_ptr - offset, offset, len, false); } @@ -404,10 +401,8 @@ static int bnxt_xdp_set(struct bnxt *bp, struct bpf_prog *prog) netdev_warn(dev, "ethtool rx/tx channels must be combined to 
support XDP.\n"); return -EOPNOTSUPP; } - if (prog) { + if (prog) tx_xdp = bp->rx_nr_rings; - bp->xdp_has_frags = prog->aux->xdp_has_frags; - } tc = netdev_get_num_tc(dev); if (!tc) -- cgit From a056ebcc30e2f78451d66f615d2f6bdada3e6438 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 26 Dec 2022 22:19:40 -0500 Subject: bnxt_en: Fix HDS and jumbo thresholds for RX packets The recent XDP multi-buffer feature has introduced regressions in the setting of HDS and jumbo thresholds. HDS was accidentally disabled in the normal mode without XDP. This patch restores jumbo HDS placement when not in XDP mode. In XDP multi-buffer mode, HDS should be disabled and the jumbo threshold should be set to the usable page size in the first page buffer. Fixes: 32861236190b ("bnxt: change receive ring space parameters") Reviewed-by: Mohammad Shuab Siddique Reviewed-by: Ajit Khaparde Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a21c6829e301..16ce7a90610c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5399,15 +5399,16 @@ static int bnxt_hwrm_vnic_set_hds(struct bnxt *bp, u16 vnic_id) req->flags = cpu_to_le32(VNIC_PLCMODES_CFG_REQ_FLAGS_JUMBO_PLACEMENT); req->enables = cpu_to_le32(VNIC_PLCMODES_CFG_REQ_ENABLES_JUMBO_THRESH_VALID); - if (BNXT_RX_PAGE_MODE(bp) && !BNXT_RX_JUMBO_MODE(bp)) { + if (BNXT_RX_PAGE_MODE(bp)) { + req->jumbo_thresh = cpu_to_le16(bp->rx_buf_use_size); + } else { req->flags |= cpu_to_le32(VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV4 | VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV6); req->enables |= cpu_to_le32(VNIC_PLCMODES_CFG_REQ_ENABLES_HDS_THRESHOLD_VALID); + req->jumbo_thresh = cpu_to_le16(bp->rx_copy_thresh); + req->hds_threshold = cpu_to_le16(bp->rx_copy_thresh); } - /* thresholds not implemented in firmware yet */ - req->jumbo_thresh = cpu_to_le16(bp->rx_copy_thresh); - req->hds_threshold = cpu_to_le16(bp->rx_copy_thresh); req->vnic_id = cpu_to_le32(vnic->fw_vnic_id); return hwrm_req_send(bp, req); } -- cgit From b659b613cea2ae39746ca8bd2b69d1985dd9d770 Mon Sep 17 00:00:00 2001 From: Ferry Toth Date: Thu, 22 Dec 2022 21:53:02 +0100 Subject: Revert "usb: ulpi: defer ulpi_register on ulpi_read_id timeout" This reverts commit 8a7b31d545d3a15f0e6f5984ae16f0ca4fd76aac. This patch results in some qemu test failures, specifically xilinx-zynq-a9 machine and zynq-zc702 as well as zynq-zed devicetree files, when trying to boot from USB drive.
Link: https://lore.kernel.org/lkml/20221220194334.GA942039@roeck-us.net/ Fixes: 8a7b31d545d3 ("usb: ulpi: defer ulpi_register on ulpi_read_id timeout") Cc: stable@vger.kernel.org Reported-by: Guenter Roeck Signed-off-by: Ferry Toth Link: https://lore.kernel.org/r/20221222205302.45761-1-ftoth@exalondelft.nl Signed-off-by: Greg Kroah-Hartman --- drivers/usb/common/ulpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/common/ulpi.c b/drivers/usb/common/ulpi.c index 60e8174686a1..d7c8461976ce 100644 --- a/drivers/usb/common/ulpi.c +++ b/drivers/usb/common/ulpi.c @@ -207,7 +207,7 @@ static int ulpi_read_id(struct ulpi *ulpi) /* Test the interface */ ret = ulpi_write(ulpi, ULPI_SCRATCH, 0xaa); if (ret < 0) - return ret; + goto err; ret = ulpi_read(ulpi, ULPI_SCRATCH); if (ret < 0) -- cgit From 2de5bba5890f6604a997c75e754df8082386c9f7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 15 Dec 2022 17:57:20 +0100 Subject: usb: fotg210: fix OTG-only build The fotg210 module combines the HCD and OTG drivers, which then fails to build when only the USB gadget support is enabled in the kernel but host support is not: aarch64-linux-ld: drivers/usb/fotg210/fotg210-core.o: in function `fotg210_init': fotg210-core.c:(.init.text+0xc): undefined reference to `usb_disabled' Move the check for usb_disabled() after the check for the HCD module, and let the OTG driver still be probed in this configuration. A nicer approach might be to have the common portion built as a library module, with the two other platform files registering their own platform_driver instances separately. Fixes: ddacd6ef44ca ("usb: fotg210: Fix Kconfig for USB host modules") Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20221215165728.2062984-1-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/fotg210/fotg210-core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/usb/fotg210/fotg210-core.c b/drivers/usb/fotg210/fotg210-core.c index 8a54edf921ac..ee740a6da463 100644 --- a/drivers/usb/fotg210/fotg210-core.c +++ b/drivers/usb/fotg210/fotg210-core.c @@ -144,10 +144,7 @@ static struct platform_driver fotg210_driver = { static int __init fotg210_init(void) { - if (usb_disabled()) - return -ENODEV; - - if (IS_ENABLED(CONFIG_USB_FOTG210_HCD)) + if (IS_ENABLED(CONFIG_USB_FOTG210_HCD) && !usb_disabled()) fotg210_hcd_init(); return platform_driver_register(&fotg210_driver); } -- cgit From a6ce72c0fb6041f9871f880b2d02b294f7f49cb4 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Nov 2022 15:17:52 +0200 Subject: vdpa/mlx5: Fix rule forwarding VLAN to TIR Set the VLAN id to the header values field instead of overwriting the headers criteria field. Before this fix, VLAN filtering would not really work and tagged packets would be forwarded unfiltered to the TIR. Fixes: baf2ad3f6a98 ("vdpa/mlx5: Add RX MAC VLAN filter support") Acked-by: Jason Wang Signed-off-by: Eli Cohen Message-Id: <20221114131759.57883-2-elic@nvidia.com> Signed-off-by: Michael S.
Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 90913365def4..3fb06dcee943 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1468,11 +1468,13 @@ static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac, dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16); eth_broadcast_addr(dmac_c); ether_addr_copy(dmac_v, mac); - MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1); + if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, first_vid); + } if (tagged) { MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1); - MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, first_vid); - MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_vid, vid); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, vid); } flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; -- cgit From 5aec804936bbff182081f1cdc271fcb76af1a4ff Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Nov 2022 15:17:53 +0200 Subject: vdpa/mlx5: Return error on vlan ctrl commands if not supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check if VIRTIO_NET_F_CTRL_VLAN is negotiated and return error if control VQ command is received. Signed-off-by: Eli Cohen Message-Id: <20221114131759.57883-3-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Acked-by: Eugenio Pérez --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 3fb06dcee943..01da229d22da 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1823,6 +1823,9 @@ static virtio_net_ctrl_ack handle_ctrl_vlan(struct mlx5_vdpa_dev *mvdev, u8 cmd) size_t read; u16 id; + if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN))) + return status; + switch (cmd) { case VIRTIO_NET_CTRL_VLAN_ADD: read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan)); -- cgit From 1ab53760d322c82fb4cb5e81b5817065801e3ec4 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Nov 2022 15:17:54 +0200 Subject: vdpa/mlx5: Fix wrong mac address deletion Delete the old MAC from the table and not the new one which is not there yet. Fixes: baf2ad3f6a98 ("vdpa/mlx5: Add RX MAC VLAN filter support") Acked-by: Jason Wang Signed-off-by: Eli Cohen Message-Id: <20221114131759.57883-4-elic@nvidia.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 01da229d22da..b06260a37680 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1686,7 +1686,7 @@ static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd) /* Need recreate the flow table entry, so that the packet could forward back */ - mac_vlan_del(ndev, ndev->config.mac, 0, false); + mac_vlan_del(ndev, mac_back, 0, false); if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) { mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n"); -- cgit From 0dbc1b4ae07d003b2e88ba9d4142846320f8e349 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Nov 2022 15:17:55 +0200 Subject: vdpa/mlx5: Avoid using reslock in event_handler event_handler runs under atomic context and may not acquire reslock. We can still guarantee that the handler won't be called after suspend by clearing nb_registered, unregistering the handler and flushing the workqueue. Signed-off-by: Eli Cohen Message-Id: <20221114131759.57883-5-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index b06260a37680..98dd8ce8af26 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2845,8 +2845,8 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev) int i; down_write(&ndev->reslock); - mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); ndev->nb_registered = false; + mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); flush_workqueue(ndev->mvdev.wq); for (i = 0; i < ndev->cur_num_vqs; i++) { mvq = &ndev->vqs[i]; @@ -3024,7 +3024,7 @@ static void update_carrier(struct work_struct *work) else ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP); - if (ndev->config_cb.callback) + if (ndev->nb_registered && ndev->config_cb.callback) ndev->config_cb.callback(ndev->config_cb.private); kfree(wqent); @@ -3041,21 +3041,13 @@ static int event_handler(struct notifier_block *nb, unsigned long event, void *p switch (eqe->sub_type) { case MLX5_PORT_CHANGE_SUBTYPE_DOWN: case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: - down_read(&ndev->reslock); - if (!ndev->nb_registered) { - up_read(&ndev->reslock); - return NOTIFY_DONE; - } wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC); - if (!wqent) { - up_read(&ndev->reslock); + if (!wqent) return NOTIFY_DONE; - } wqent->mvdev = &ndev->mvdev; INIT_WORK(&wqent->work, update_carrier); queue_work(ndev->mvdev.wq, &wqent->work); - up_read(&ndev->reslock); ret = NOTIFY_OK; break; default: @@ -3242,8 +3234,8 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * struct workqueue_struct *wq; if (ndev->nb_registered) { - mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); ndev->nb_registered = false; + mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); } wq = mvdev->wq; mvdev->wq = NULL; -- cgit From 38fc462f57ef4e5dc722bab6824854b105de8aa2 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Nov 2022 15:17:56 +0200 Subject: vdpa/mlx5: Avoid overwriting CVQ iotlb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When qemu uses different address spaces for data and control virtqueues, the current code would overwrite the control virtqueue iotlb through the dup_iotlb 
call. Fix this by referring to the address space identifier and the group to asid mapping to determine which mapping needs to be updated. We also move the address space logic from mlx5 net to core directory. Reported-by: Eugenio Pérez Signed-off-by: Eli Cohen Message-Id: <20221114131759.57883-6-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Acked-by: Eugenio Pérez --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 5 ++-- drivers/vdpa/mlx5/core/mr.c | 44 ++++++++++++++++++++-------------- drivers/vdpa/mlx5/net/mlx5_vnet.c | 49 ++++++++------------------------------ 3 files changed, 39 insertions(+), 59 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 6af9fdbb86b7..058fbe28107e 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -116,8 +116,9 @@ int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 *mkey, u32 *in, int inlen); int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey); int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, - bool *change_map); -int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb); + bool *change_map, unsigned int asid); +int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, + unsigned int asid); void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev); #define mlx5_vdpa_warn(__dev, format, ...) \ diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index a639b9208d41..a4d7ee2339fa 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -511,7 +511,8 @@ out: mutex_unlock(&mr->mkey_mtx); } -static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, + struct vhost_iotlb *iotlb, unsigned int asid) { struct mlx5_vdpa_mr *mr = &mvdev->mr; int err; @@ -519,42 +520,49 @@ static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb if (mr->initialized) return 0; - if (iotlb) - err = create_user_mr(mvdev, iotlb); - else - err = create_dma_mr(mvdev, mr); + if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) { + if (iotlb) + err = create_user_mr(mvdev, iotlb); + else + err = create_dma_mr(mvdev, mr); - if (err) - return err; + if (err) + return err; + } - err = dup_iotlb(mvdev, iotlb); - if (err) - goto out_err; + if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid) { + err = dup_iotlb(mvdev, iotlb); + if (err) + goto out_err; + } mr->initialized = true; return 0; out_err: - if (iotlb) - destroy_user_mr(mvdev, mr); - else - destroy_dma_mr(mvdev, mr); + if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) { + if (iotlb) + destroy_user_mr(mvdev, mr); + else + destroy_dma_mr(mvdev, mr); + } return err; } -int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, + unsigned int asid) { int err; mutex_lock(&mvdev->mr.mkey_mtx); - err = _mlx5_vdpa_create_mr(mvdev, iotlb); + err = _mlx5_vdpa_create_mr(mvdev, iotlb, asid); mutex_unlock(&mvdev->mr.mkey_mtx); return err; } int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, - bool *change_map) + bool *change_map, unsigned int asid) { struct mlx5_vdpa_mr *mr = &mvdev->mr; int err = 0; @@ -566,7 +574,7 @@ int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *io *change_map = true; } if (!*change_map) - err = 
_mlx5_vdpa_create_mr(mvdev, iotlb); + err = _mlx5_vdpa_create_mr(mvdev, iotlb, asid); mutex_unlock(&mr->mkey_mtx); return err; diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 98dd8ce8af26..3a6dbbc6440d 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2394,7 +2394,8 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev) } } -static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, + struct vhost_iotlb *iotlb, unsigned int asid) { struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); int err; @@ -2406,7 +2407,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb teardown_driver(ndev); mlx5_vdpa_destroy_mr(mvdev); - err = mlx5_vdpa_create_mr(mvdev, iotlb); + err = mlx5_vdpa_create_mr(mvdev, iotlb, asid); if (err) goto err_mr; @@ -2587,7 +2588,7 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev) ++mvdev->generation; if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { - if (mlx5_vdpa_create_mr(mvdev, NULL)) + if (mlx5_vdpa_create_mr(mvdev, NULL, 0)) mlx5_vdpa_warn(mvdev, "create MR failed\n"); } up_write(&ndev->reslock); @@ -2623,41 +2624,20 @@ static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev) return mvdev->generation; } -static int set_map_control(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) -{ - u64 start = 0ULL, last = 0ULL - 1; - struct vhost_iotlb_map *map; - int err = 0; - - spin_lock(&mvdev->cvq.iommu_lock); - vhost_iotlb_reset(mvdev->cvq.iotlb); - - for (map = vhost_iotlb_itree_first(iotlb, start, last); map; - map = vhost_iotlb_itree_next(map, start, last)) { - err = vhost_iotlb_add_range(mvdev->cvq.iotlb, map->start, - map->last, map->addr, map->perm); - if (err) - goto out; - } - -out: - spin_unlock(&mvdev->cvq.iommu_lock); - return err; -} - -static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, + unsigned int asid) { bool change_map; int err; - err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map); + err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map, asid); if (err) { mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err); return err; } if (change_map) - err = mlx5_vdpa_change_map(mvdev, iotlb); + err = mlx5_vdpa_change_map(mvdev, iotlb, asid); return err; } @@ -2670,16 +2650,7 @@ static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid, int err = -EINVAL; down_write(&ndev->reslock); - if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) { - err = set_map_data(mvdev, iotlb); - if (err) - goto out; - } - - if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid) - err = set_map_control(mvdev, iotlb); - -out: + err = set_map_data(mvdev, iotlb, asid); up_write(&ndev->reslock); return err; } @@ -3182,7 +3153,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, goto err_mpfs; if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { - err = mlx5_vdpa_create_mr(mvdev, NULL); + err = mlx5_vdpa_create_mr(mvdev, NULL, 0); if (err) goto err_res; } -- cgit From 344686136d73501a18a9621de690ff7824a3d129 Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Thu, 20 Oct 2022 23:27:33 -0700 Subject: virtio_pci: use helper function is_power_of_2() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use helper function is_power_of_2() to check if num is power of two. 
Minor readability improvement. Signed-off-by: Shaoqin Huang Message-Id: <20221021062734.228881-2-shaoqin.huang@intel.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Xuan Zhuo --- drivers/virtio/virtio_pci_modern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index c3b9f2761849..207294bd7b9d 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -310,7 +310,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, if (!num || vp_modern_get_queue_enable(mdev, index)) return ERR_PTR(-ENOENT); - if (num & (num - 1)) { + if (!is_power_of_2(num)) { dev_warn(&vp_dev->pci_dev->dev, "bad queue size %u", num); return ERR_PTR(-EINVAL); } -- cgit From b9d978a89296c57fbbbd8ea647c303ce4d37028f Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Thu, 20 Oct 2022 23:27:34 -0700 Subject: virtio_ring: use helper function is_power_of_2() Use helper function is_power_of_2() to check if num is power of two. Minor readability improvement. Signed-off-by: Shaoqin Huang Message-Id: <20221021062734.228881-3-shaoqin.huang@intel.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo --- drivers/virtio/virtio_ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 2e7689bb933b..723c4e29e1d3 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1052,7 +1052,7 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, dma_addr_t dma_addr; /* We assume num is a power of 2. */ - if (num & (num - 1)) { + if (!is_power_of_2(num)) { dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num); return -EINVAL; } -- cgit From a9f0a19ff7700cc8a30db2496f40d18490dcb9df Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 24 Oct 2022 14:37:56 +0100 Subject: RDMA/mlx5: remove variable i Variable i is just being incremented and it's never used anywhere else. The variable and the increment are redundant so remove them. Signed-off-by: Colin Ian King Message-Id: <20221024133756.2158497-1-colin.i.king@gmail.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/core/mr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index a4d7ee2339fa..0a1e0b0dc37e 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -311,7 +311,6 @@ static int add_direct_chain(struct mlx5_vdpa_dev *mvdev, u64 start, u64 size, u8 u64 st; u64 sz; int err; - int i = 0; st = start; while (size) { @@ -336,7 +335,6 @@ static int add_direct_chain(struct mlx5_vdpa_dev *mvdev, u64 start, u64 size, u8 mr->num_directs++; mr->num_klms++; st += sz; - i++; } list_splice_tail(&tmp, &mr->head); return 0; -- cgit From b66ead2d0ecac00c3a06a6218af5411cb5fcb5d5 Mon Sep 17 00:00:00 2001 From: Angus Chen Date: Tue, 1 Nov 2022 19:16:54 +0800 Subject: virtio_pci: modify ENOENT to EINVAL Virtio_crypto uses max_data_queues+1 to set up vqs; we use vp_modern_get_num_queues to protect the vq range in setup_vq. We could enter index >= vp_modern_get_num_queues(mdev) in setup_vq if common->num_queues is not set well, and it returns -ENOENT. It is better to use -EINVAL instead. Signed-off-by: Angus Chen Message-Id: <20221101111655.1947-1-angus.chen@jaguarmicro.com> Signed-off-by: Michael S.
Tsirkin --- drivers/virtio/virtio_pci_modern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 207294bd7b9d..9e496e288cfa 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -303,7 +303,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, int err; if (index >= vp_modern_get_num_queues(mdev)) - return ERR_PTR(-ENOENT); + return ERR_PTR(-EINVAL); /* Check if queue is either not available or already active. */ num = vp_modern_get_queue_size(mdev, index); -- cgit From 75e4ab9735a5a70612dd06461ca372b897bf371c Mon Sep 17 00:00:00 2001 From: Shaomin Deng Date: Sat, 5 Nov 2022 11:51:51 -0400 Subject: tools: Delete the unneeded semicolon after curly braces Unneeded semicolon after curly braces, so delete it. Signed-off-by: Shaomin Deng Message-Id: <20221105155151.12155-1-dengshaomin@cdjrlc.com> Signed-off-by: Michael S. Tsirkin --- tools/virtio/virtio-trace/trace-agent-ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/virtio/virtio-trace/trace-agent-ctl.c b/tools/virtio/virtio-trace/trace-agent-ctl.c index 73d253d4b559..39860be6e2d8 100644 --- a/tools/virtio/virtio-trace/trace-agent-ctl.c +++ b/tools/virtio/virtio-trace/trace-agent-ctl.c @@ -75,7 +75,7 @@ static int wait_order(int ctl_fd) if (ret) break; - }; + } return ret; -- cgit From aeca7ff254843d49a8739f07f7dab1341450111d Mon Sep 17 00:00:00 2001 From: ruanjinjie Date: Thu, 10 Nov 2022 16:23:48 +0800 Subject: vdpa_sim: fix possible memory leak in vdpasim_net_init() and vdpasim_blk_init() Inject fault while probing module, if device_register() fails in vdpasim_net_init() or vdpasim_blk_init(), but the refcount of kobject is not decreased to 0, the name allocated in dev_set_name() is leaked. Fix this by calling put_device(), so that name can be freed in callback function kobject_cleanup(). (vdpa_sim_net) unreferenced object 0xffff88807eebc370 (size 16): comm "modprobe", pid 3848, jiffies 4362982860 (age 18.153s) hex dump (first 16 bytes): 76 64 70 61 73 69 6d 5f 6e 65 74 00 6b 6b 6b a5 vdpasim_net.kkk. backtrace: [] __kmalloc_node_track_caller+0x4e/0x150 [] kstrdup+0x33/0x60 [] kobject_set_name_vargs+0x41/0x110 [] dev_set_name+0xab/0xe0 [] device_add+0xe3/0x1a80 [] 0xffffffffa0270013 [] do_one_initcall+0x87/0x2e0 [] do_init_module+0x1ab/0x640 [] load_module+0x5d00/0x77f0 [] __do_sys_finit_module+0x110/0x1b0 [] do_syscall_64+0x35/0x80 [] entry_SYSCALL_64_after_hwframe+0x46/0xb0 (vdpa_sim_blk) unreferenced object 0xffff8881070c1250 (size 16): comm "modprobe", pid 6844, jiffies 4364069319 (age 17.572s) hex dump (first 16 bytes): 76 64 70 61 73 69 6d 5f 62 6c 6b 00 6b 6b 6b a5 vdpasim_blk.kkk. backtrace: [] __kmalloc_node_track_caller+0x4e/0x150 [] kstrdup+0x33/0x60 [] kobject_set_name_vargs+0x41/0x110 [] dev_set_name+0xab/0xe0 [] device_add+0xe3/0x1a80 [] 0xffffffffa0220013 [] do_one_initcall+0x87/0x2e0 [] do_init_module+0x1ab/0x640 [] load_module+0x5d00/0x77f0 [] __do_sys_finit_module+0x110/0x1b0 [] do_syscall_64+0x35/0x80 [] entry_SYSCALL_64_after_hwframe+0x46/0xb0 Fixes: 899c4d187f6a ("vdpa_sim_blk: add support for vdpa management tool") Fixes: a3c06ae158dd ("vdpa_sim_net: Add support for user supported devices") Signed-off-by: ruanjinjie Reviewed-by: Stefano Garzarella Message-Id: <20221110082348.4105476-1-ruanjinjie@huawei.com> Signed-off-by: Michael S. 
Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 4 +++- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index c6db1a1baf76..f745926237a8 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -427,8 +427,10 @@ static int __init vdpasim_blk_init(void) int ret; ret = device_register(&vdpasim_blk_mgmtdev); - if (ret) + if (ret) { + put_device(&vdpasim_blk_mgmtdev); return ret; + } ret = vdpa_mgmtdev_register(&mgmt_dev); if (ret) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index c3cb225ea469..11f5a121df24 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -305,8 +305,10 @@ static int __init vdpasim_net_init(void) int ret; ret = device_register(&vdpasim_net_mgmtdev); - if (ret) + if (ret) { + put_device(&vdpasim_net_mgmtdev); return ret; + } ret = vdpa_mgmtdev_register(&mgmt_dev); if (ret) -- cgit From 7a4efe182ca61fb3e5307e69b261c57cbf434cd4 Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Tue, 8 Nov 2022 10:17:05 +0000 Subject: vhost/vsock: Fix error handling in vhost_vsock_init() A problem about modprobe vhost_vsock failed is triggered with the following log given: modprobe: ERROR: could not insert 'vhost_vsock': Device or resource busy The reason is that vhost_vsock_init() returns misc_register() directly without checking its return value, if misc_register() failed, it returns without calling vsock_core_unregister() on vhost_transport, resulting the vhost_vsock can never be installed later. A simple call graph is shown as below: vhost_vsock_init() vsock_core_register() # register vhost_transport misc_register() device_create_with_groups() device_create_groups_vargs() dev = kzalloc(...) # OOM happened # return without unregister vhost_transport Fix by calling vsock_core_unregister() when misc_register() returns error. Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko") Signed-off-by: Yuan Can Message-Id: <20221108101705.45981-1-yuancan@huawei.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella Acked-by: Jason Wang --- drivers/vhost/vsock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index cd6f7776013a..a2b374372363 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -959,7 +959,14 @@ static int __init vhost_vsock_init(void) VSOCK_TRANSPORT_F_H2G); if (ret < 0) return ret; - return misc_register(&vhost_vsock_misc); + + ret = misc_register(&vhost_vsock_misc); + if (ret) { + vsock_core_unregister(&vhost_transport.transport); + return ret; + } + + return 0; }; static void __exit vhost_vsock_exit(void) -- cgit From f85efa9b0f5381874f727bd98f56787840313f0b Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 9 Nov 2022 11:25:02 +0100 Subject: vringh: fix range used in iotlb_translate() vhost_iotlb_itree_first() requires `start` and `last` parameters to search for a mapping that overlaps the range. In iotlb_translate() we cyclically call vhost_iotlb_itree_first(), incrementing `addr` by the amount already translated, so rightly we move the `start` parameter passed to vhost_iotlb_itree_first(), but we should hold the `last` parameter constant. Let's fix it by saving the `last` parameter value before incrementing `addr` in the loop. 
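A minimal standalone sketch of the corrected loop shape (illustrative only, not the vringh code; lookup_overlap() is a hypothetical stand-in for vhost_iotlb_itree_first(), and the struct is simplified): the end of the range is computed once, and only the cursor advances between lookups.

#include <stdint.h>

struct map_entry { uint64_t start, last; };

/* hypothetical helper standing in for vhost_iotlb_itree_first():
 * returns a mapping overlapping [start, last], or NULL if none */
extern struct map_entry *lookup_overlap(uint64_t start, uint64_t last);

static int translate_range(uint64_t addr, uint64_t len)
{
	uint64_t done = 0, last = addr + len - 1;	/* end of range, fixed up front */

	while (len > done) {
		struct map_entry *map = lookup_overlap(addr + done, last);

		if (!map || map->start > addr + done)
			return -1;	/* hole in the mapping */
		/* consume the piece of this entry that overlaps the range */
		done += (map->last < last ? map->last : last) - (addr + done) + 1;
	}
	return 0;
}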
Fixes: 9ad9c49cfe97 ("vringh: IOTLB support") Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Message-Id: <20221109102503.18816-2-sgarzare@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vringh.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index c9f5c8ea3afb..33eb941fcf15 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -1102,7 +1102,7 @@ static int iotlb_translate(const struct vringh *vrh, struct vhost_iotlb_map *map; struct vhost_iotlb *iotlb = vrh->iotlb; int ret = 0; - u64 s = 0; + u64 s = 0, last = addr + len - 1; spin_lock(vrh->iotlb_lock); @@ -1114,8 +1114,7 @@ static int iotlb_translate(const struct vringh *vrh, break; } - map = vhost_iotlb_itree_first(iotlb, addr, - addr + len - 1); + map = vhost_iotlb_itree_first(iotlb, addr, last); if (!map || map->start > addr) { ret = -EINVAL; break; -- cgit From 98047313cdb46828093894d0ac8b1183b8b317f9 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 9 Nov 2022 11:25:03 +0100 Subject: vhost: fix range used in translate_desc() vhost_iotlb_itree_first() requires `start` and `last` parameters to search for a mapping that overlaps the range. In translate_desc() we cyclically call vhost_iotlb_itree_first(), incrementing `addr` by the amount already translated, so rightly we move the `start` parameter passed to vhost_iotlb_itree_first(), but we should hold the `last` parameter constant. Let's fix it by saving the `last` parameter value before incrementing `addr` in the loop. Fixes: a9709d6874d5 ("vhost: convert pre sorted vhost memory array to interval tree") Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Message-Id: <20221109102503.18816-3-sgarzare@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 5c9fe3c9c364..cbe72bfd2f1f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2053,7 +2053,7 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, struct vhost_dev *dev = vq->dev; struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem; struct iovec *_iov; - u64 s = 0; + u64 s = 0, last = addr + len - 1; int ret = 0; while ((u64)len > s) { @@ -2063,7 +2063,7 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, break; } - map = vhost_iotlb_itree_first(umem, addr, addr + len - 1); + map = vhost_iotlb_itree_first(umem, addr, last); if (map == NULL || map->start > addr) { if (umem != dev->iotlb) { ret = -EFAULT; -- cgit From c070c1912a83432530cbb4271d5b9b11fa36b67a Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 9 Nov 2022 16:42:13 +0100 Subject: vhost-vdpa: fix an iotlb memory leak Before commit 3d5698793897 ("vhost-vdpa: introduce asid based IOTLB") we called vhost_vdpa_iotlb_unmap(v, iotlb, 0ULL, 0ULL - 1) during release to free all the resources allocated when processing user IOTLB messages through vhost_vdpa_process_iotlb_update(). That commit changed the handling of IOTLB a bit, and we accidentally removed some code called during the release. 
We partially fixed this with commit 037d4305569a ("vhost-vdpa: call vhost_vdpa_cleanup during the release") but a potential memory leak is still there as showed by kmemleak if the application does not send VHOST_IOTLB_INVALIDATE or crashes: unreferenced object 0xffff888007fbaa30 (size 16): comm "blkio-bench", pid 914, jiffies 4294993521 (age 885.500s) hex dump (first 16 bytes): 40 73 41 07 80 88 ff ff 00 00 00 00 00 00 00 00 @sA............. backtrace: [<0000000087736d2a>] kmem_cache_alloc_trace+0x142/0x1c0 [<0000000060740f50>] vhost_vdpa_process_iotlb_msg+0x68c/0x901 [vhost_vdpa] [<0000000083e8e205>] vhost_chr_write_iter+0xc0/0x4a0 [vhost] [<000000008f2f414a>] vhost_vdpa_chr_write_iter+0x18/0x20 [vhost_vdpa] [<00000000de1cd4a0>] vfs_write+0x216/0x4b0 [<00000000a2850200>] ksys_write+0x71/0xf0 [<00000000de8e720b>] __x64_sys_write+0x19/0x20 [<0000000018b12cbb>] do_syscall_64+0x3f/0x90 [<00000000986ec465>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Let's fix this calling vhost_vdpa_iotlb_unmap() on the whole range in vhost_vdpa_remove_as(). We move that call before vhost_dev_cleanup() since we need a valid v->vdev.mm in vhost_vdpa_pa_unmap(). vhost_iotlb_reset() call can be removed, since vhost_vdpa_iotlb_unmap() on the whole range removes all the entries. The kmemleak log reported was observed with a vDPA device that has `use_va` set to true (e.g. VDUSE). This patch has been tested with both types of devices. Fixes: 037d4305569a ("vhost-vdpa: call vhost_vdpa_cleanup during the release") Fixes: 3d5698793897 ("vhost-vdpa: introduce asid based IOTLB") Signed-off-by: Stefano Garzarella Message-Id: <20221109154213.146789-1-sgarzare@redhat.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vdpa.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 166044642fd5..b08e07fc7d1f 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -65,6 +65,10 @@ static DEFINE_IDA(vhost_vdpa_ida); static dev_t vhost_vdpa_major; +static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, + struct vhost_iotlb *iotlb, + u64 start, u64 last); + static inline u32 iotlb_to_asid(struct vhost_iotlb *iotlb) { struct vhost_vdpa_as *as = container_of(iotlb, struct @@ -135,7 +139,7 @@ static int vhost_vdpa_remove_as(struct vhost_vdpa *v, u32 asid) return -EINVAL; hlist_del(&as->hash_link); - vhost_iotlb_reset(&as->iotlb); + vhost_vdpa_iotlb_unmap(v, &as->iotlb, 0ULL, 0ULL - 1); kfree(as); return 0; @@ -1162,14 +1166,14 @@ static void vhost_vdpa_cleanup(struct vhost_vdpa *v) struct vhost_vdpa_as *as; u32 asid; - vhost_dev_cleanup(&v->vdev); - kfree(v->vdev.vqs); - for (asid = 0; asid < v->vdpa->nas; asid++) { as = asid_to_as(v, asid); if (as) vhost_vdpa_remove_as(v, asid); } + + vhost_dev_cleanup(&v->vdev); + kfree(v->vdev.vqs); } static int vhost_vdpa_open(struct inode *inode, struct file *filep) -- cgit From f4e468f708386ce5fa6878a7ef43a9818ceeaecf Mon Sep 17 00:00:00 2001 From: Angus Chen Date: Thu, 10 Nov 2022 11:01:23 +0800 Subject: virtio_blk: use UINT_MAX instead of -1U We use UINT_MAX to limit max_discard_sectors in virtblk_probe, we can use UINT_MAX to limit max_hw_sectors for consistencies. No functional change intended. Signed-off-by: Angus Chen Message-Id: <20221110030124.1986-1-angus.chen@jaguarmicro.com> Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Stefan Hajnoczi --- drivers/block/virtio_blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 271a9878fa8b..dcbf86cd2155 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -994,7 +994,7 @@ static int virtblk_probe(struct virtio_device *vdev) blk_queue_max_segments(q, sg_elems); /* No real sector limit. */ - blk_queue_max_hw_sectors(q, -1U); + blk_queue_max_hw_sectors(q, UINT_MAX); max_size = virtio_max_dma_size(vdev); -- cgit From 794ec498c9fa79e6bfd71b931410d5897a9c00d4 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 10 Nov 2022 15:13:35 +0100 Subject: vdpa_sim: fix vringh initialization in vdpasim_queue_ready() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we initialize vringh, we should pass the features and the number of elements in the virtqueue negotiated with the driver, otherwise operations with vringh may fail. This was discovered in a case where the driver sets a number of elements in the virtqueue different from the value returned by .get_vq_num_max(). In vdpasim_vq_reset() is safe to initialize the vringh with default values, since the virtqueue will not be used until vdpasim_queue_ready() is called again. Fixes: 2c53d0f64c06 ("vdpasim: vDPA device simulator") Signed-off-by: Stefano Garzarella Message-Id: <20221110141335.62171-1-sgarzare@redhat.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Acked-by: Eugenio Pérez --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index b071f0d842fb..b20689f8fe89 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -67,8 +67,7 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) { struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; - vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features, - VDPASIM_QUEUE_MAX, false, + vringh_init_iotlb(&vq->vring, vdpasim->features, vq->num, false, (struct vring_desc *)(uintptr_t)vq->desc_addr, (struct vring_avail *) (uintptr_t)vq->driver_addr, -- cgit From a4722f64f924a9992efc08d141c21b2da02b70f3 Mon Sep 17 00:00:00 2001 From: wangjianli Date: Sun, 13 Nov 2022 15:07:42 +0800 Subject: tools/virtio: Variable type completion Replace "unsigned" with "unsigned int" Signed-off-by: wangjianli Message-Id: <20221113070742.48271-1-wangjianli@cdjrlc.com> Signed-off-by: Michael S. Tsirkin --- tools/virtio/virtio_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index 86a410ddcedd..120062f94590 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -173,7 +173,7 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, long started = 0, completed = 0, next_reset = reset_n; long completed_before, started_before; int r, test = 1; - unsigned len; + unsigned int len; long long spurious = 0; const bool random_batch = batch == RANDOM_BATCH; -- cgit From b1d65f717cd6305a396a8738e022c6f7c65cfbe8 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 14 Nov 2022 11:07:40 +0000 Subject: virtio-crypto: fix memory leak in virtio_crypto_alg_skcipher_close_session() 'vc_ctrl_req' is alloced in virtio_crypto_alg_skcipher_close_session(), and should be freed in the invalid ctrl_status->status error handling case. 
Otherwise there is a memory leak. Fixes: 0756ad15b1fe ("virtio-crypto: use private buffer for control request") Signed-off-by: Wei Yongjun Message-Id: <20221114110740.537276-1-weiyongjun@huaweicloud.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Gonglei Acked-by: zhenwei pi Acked-by: Jason Wang --- drivers/crypto/virtio/virtio_crypto_skcipher_algs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/virtio/virtio_crypto_skcipher_algs.c b/drivers/crypto/virtio/virtio_crypto_skcipher_algs.c index e553ccadbcbc..e5876286828b 100644 --- a/drivers/crypto/virtio/virtio_crypto_skcipher_algs.c +++ b/drivers/crypto/virtio/virtio_crypto_skcipher_algs.c @@ -239,7 +239,8 @@ static int virtio_crypto_alg_skcipher_close_session( pr_err("virtio_crypto: Close session failed status: %u, session_id: 0x%llx\n", ctrl_status->status, destroy_session->session_id); - return -EINVAL; + err = -EINVAL; + goto out; } err = 0; -- cgit From c8e82e3877028381969779a86972d9a4f57a9ea0 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Fri, 25 Nov 2022 00:12:14 +0800 Subject: virtio: Implementing attribute show with sysfs_emit Replace sprintf with sysfs_emit or its variants for their built-in PAGE_SIZE awareness. Signed-off-by: Dawei Li Message-Id: Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 828ced060742..b9a80aedee1b 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -15,7 +15,7 @@ static ssize_t device_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); - return sprintf(buf, "0x%04x\n", dev->id.device); + return sysfs_emit(buf, "0x%04x\n", dev->id.device); } static DEVICE_ATTR_RO(device); @@ -23,7 +23,7 @@ static ssize_t vendor_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); - return sprintf(buf, "0x%04x\n", dev->id.vendor); + return sysfs_emit(buf, "0x%04x\n", dev->id.vendor); } static DEVICE_ATTR_RO(vendor); @@ -31,7 +31,7 @@ static ssize_t status_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); - return sprintf(buf, "0x%08x\n", dev->config->get_status(dev)); + return sysfs_emit(buf, "0x%08x\n", dev->config->get_status(dev)); } static DEVICE_ATTR_RO(status); @@ -39,7 +39,7 @@ static ssize_t modalias_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); - return sprintf(buf, "virtio:d%08Xv%08X\n", + return sysfs_emit(buf, "virtio:d%08Xv%08X\n", dev->id.device, dev->id.vendor); } static DEVICE_ATTR_RO(modalias); @@ -54,9 +54,9 @@ static ssize_t features_show(struct device *_d, /* We actually represent this as a bitstring, as it could be * arbitrary length in future. */ for (i = 0; i < sizeof(dev->features)*8; i++) - len += sprintf(buf+len, "%c", + len += sysfs_emit_at(buf, len, "%c", __virtio_test_bit(dev, i) ? '1' : '0'); - len += sprintf(buf+len, "\n"); + len += sysfs_emit_at(buf, len, "\n"); return len; } static DEVICE_ATTR_RO(features); -- cgit From e794070af224ade46db368271896b2685ff4f96b Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Mon, 19 Dec 2022 15:33:31 +0800 Subject: vhost_vdpa: fix the crash in unmap a large memory While testing in vIOMMU, sometimes Guest will unmap very large memory, which will cause the crash. 
To fix this, add a new function vhost_vdpa_general_unmap(). This function will only unmap the memory that saved in iotlb. Call Trace: [ 647.820144] ------------[ cut here ]------------ [ 647.820848] kernel BUG at drivers/iommu/intel/iommu.c:1174! [ 647.821486] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 647.822082] CPU: 10 PID: 1181 Comm: qemu-system-x86 Not tainted 6.0.0-rc1home_lulu_2452_lulu7_vhost+ #62 [ 647.823139] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-29-g6a62e0cb0dfe-prebuilt.qem4 [ 647.824365] RIP: 0010:domain_unmap+0x48/0x110 [ 647.825424] Code: 48 89 fb 8d 4c f6 1e 39 c1 0f 4f c8 83 e9 0c 83 f9 3f 7f 18 48 89 e8 48 d3 e8 48 85 c0 75 59 [ 647.828064] RSP: 0018:ffffae5340c0bbf0 EFLAGS: 00010202 [ 647.828973] RAX: 0000000000000001 RBX: ffff921793d10540 RCX: 000000000000001b [ 647.830083] RDX: 00000000080000ff RSI: 0000000000000001 RDI: ffff921793d10540 [ 647.831214] RBP: 0000000007fc0100 R08: ffffae5340c0bcd0 R09: 0000000000000003 [ 647.832388] R10: 0000007fc0100000 R11: 0000000000100000 R12: 00000000080000ff [ 647.833668] R13: ffffae5340c0bcd0 R14: ffff921793d10590 R15: 0000008000100000 [ 647.834782] FS: 00007f772ec90640(0000) GS:ffff921ce7a80000(0000) knlGS:0000000000000000 [ 647.836004] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 647.836990] CR2: 00007f02c27a3a20 CR3: 0000000101b0c006 CR4: 0000000000372ee0 [ 647.838107] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 647.839283] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 647.840666] Call Trace: [ 647.841437] [ 647.842107] intel_iommu_unmap_pages+0x93/0x140 [ 647.843112] __iommu_unmap+0x91/0x1b0 [ 647.844003] iommu_unmap+0x6a/0x95 [ 647.844885] vhost_vdpa_unmap+0x1de/0x1f0 [vhost_vdpa] [ 647.845985] vhost_vdpa_process_iotlb_msg+0xf0/0x90b [vhost_vdpa] [ 647.847235] ? _raw_spin_unlock+0x15/0x30 [ 647.848181] ? _copy_from_iter+0x8c/0x580 [ 647.849137] vhost_chr_write_iter+0xb3/0x430 [vhost] [ 647.850126] vfs_write+0x1e4/0x3a0 [ 647.850897] ksys_write+0x53/0xd0 [ 647.851688] do_syscall_64+0x3a/0x90 [ 647.852508] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 647.853457] RIP: 0033:0x7f7734ef9f4f [ 647.854408] Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 29 76 f8 ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c8 [ 647.857217] RSP: 002b:00007f772ec8f040 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [ 647.858486] RAX: ffffffffffffffda RBX: 00000000fef00000 RCX: 00007f7734ef9f4f [ 647.859713] RDX: 0000000000000048 RSI: 00007f772ec8f090 RDI: 0000000000000010 [ 647.860942] RBP: 00007f772ec8f1a0 R08: 0000000000000000 R09: 0000000000000000 [ 647.862206] R10: 0000000000000001 R11: 0000000000000293 R12: 0000000000000010 [ 647.863446] R13: 0000000000000002 R14: 0000000000000000 R15: ffffffff01100000 [ 647.864692] [ 647.865458] Modules linked in: rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs v] [ 647.874688] ---[ end trace 0000000000000000 ]--- Cc: stable@vger.kernel.org Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Cindy Lu Message-Id: <20221219073331.556140-1-lulu@redhat.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vdpa.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index b08e07fc7d1f..ec32f785dfde 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -66,8 +66,8 @@ static DEFINE_IDA(vhost_vdpa_ida); static dev_t vhost_vdpa_major; static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, - struct vhost_iotlb *iotlb, - u64 start, u64 last); + struct vhost_iotlb *iotlb, u64 start, + u64 last, u32 asid); static inline u32 iotlb_to_asid(struct vhost_iotlb *iotlb) { @@ -139,7 +139,7 @@ static int vhost_vdpa_remove_as(struct vhost_vdpa *v, u32 asid) return -EINVAL; hlist_del(&as->hash_link); - vhost_vdpa_iotlb_unmap(v, &as->iotlb, 0ULL, 0ULL - 1); + vhost_vdpa_iotlb_unmap(v, &as->iotlb, 0ULL, 0ULL - 1, asid); kfree(as); return 0; @@ -687,10 +687,20 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, mutex_unlock(&d->mutex); return r; } +static void vhost_vdpa_general_unmap(struct vhost_vdpa *v, + struct vhost_iotlb_map *map, u32 asid) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + if (ops->dma_map) { + ops->dma_unmap(vdpa, asid, map->start, map->size); + } else if (ops->set_map == NULL) { + iommu_unmap(v->domain, map->start, map->size); + } +} -static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, - struct vhost_iotlb *iotlb, - u64 start, u64 last) +static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, struct vhost_iotlb *iotlb, + u64 start, u64 last, u32 asid) { struct vhost_dev *dev = &v->vdev; struct vhost_iotlb_map *map; @@ -707,13 +717,13 @@ static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, unpin_user_page(page); } atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm); + vhost_vdpa_general_unmap(v, map, asid); vhost_iotlb_map_free(iotlb, map); } } -static void vhost_vdpa_va_unmap(struct vhost_vdpa *v, - struct vhost_iotlb *iotlb, - u64 start, u64 last) +static void vhost_vdpa_va_unmap(struct vhost_vdpa *v, struct vhost_iotlb *iotlb, + u64 start, u64 last, u32 asid) { struct vhost_iotlb_map *map; struct vdpa_map_file *map_file; @@ -722,20 +732,21 @@ static void vhost_vdpa_va_unmap(struct vhost_vdpa *v, map_file = (struct vdpa_map_file *)map->opaque; fput(map_file->file); kfree(map_file); + vhost_vdpa_general_unmap(v, map, asid); vhost_iotlb_map_free(iotlb, map); } } static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, - struct vhost_iotlb *iotlb, - u64 start, u64 last) + struct vhost_iotlb *iotlb, u64 start, + u64 last, u32 asid) { struct vdpa_device *vdpa = v->vdpa; if (vdpa->use_va) - return vhost_vdpa_va_unmap(v, iotlb, start, last); + return vhost_vdpa_va_unmap(v, iotlb, start, last, asid); - return vhost_vdpa_pa_unmap(v, iotlb, start, last); + return vhost_vdpa_pa_unmap(v, iotlb, start, last, asid); } static int perm_to_iommu_flags(u32 perm) @@ -802,17 +813,12 @@ static void vhost_vdpa_unmap(struct vhost_vdpa *v, const struct vdpa_config_ops *ops = vdpa->config; u32 asid = iotlb_to_asid(iotlb); - vhost_vdpa_iotlb_unmap(v, iotlb, iova, iova + size - 1); + vhost_vdpa_iotlb_unmap(v, iotlb, iova, iova + size - 1, asid); - if (ops->dma_map) { - ops->dma_unmap(vdpa, asid, iova, size); - } else if (ops->set_map) { + if (ops->set_map) { if (!v->in_batch) ops->set_map(vdpa, asid, iotlb); - } else { - iommu_unmap(v->domain, iova, size); } - /* If we are in the middle of batch processing, delay the free * of AS until BATCH_END. 
*/ -- cgit From 8aeac42d60936046a00e67cdf7d27b061df2962f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 27 Nov 2022 19:43:46 -0800 Subject: tools/virtio: remove stray characters __read_once_size() is not a macro, remove those '/'s. Signed-off-by: Davidlohr Bueso Message-Id: <20221128034347.990-2-dave@stgolabs.net> Signed-off-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo --- tools/virtio/ringtest/main.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/virtio/ringtest/main.h b/tools/virtio/ringtest/main.h index 6d1fccd3d86c..9ed09caa659e 100644 --- a/tools/virtio/ringtest/main.h +++ b/tools/virtio/ringtest/main.h @@ -149,16 +149,16 @@ static inline void busy_wait(void) static __always_inline void __read_once_size(const volatile void *p, void *res, int size) { - switch (size) { \ - case 1: *(unsigned char *)res = *(volatile unsigned char *)p; break; \ - case 2: *(unsigned short *)res = *(volatile unsigned short *)p; break; \ - case 4: *(unsigned int *)res = *(volatile unsigned int *)p; break; \ - case 8: *(unsigned long long *)res = *(volatile unsigned long long *)p; break; \ - default: \ - barrier(); \ - __builtin_memcpy((void *)res, (const void *)p, size); \ - barrier(); \ - } \ + switch (size) { + case 1: *(unsigned char *)res = *(volatile unsigned char *)p; break; + case 2: *(unsigned short *)res = *(volatile unsigned short *)p; break; + case 4: *(unsigned int *)res = *(volatile unsigned int *)p; break; + case 8: *(unsigned long long *)res = *(volatile unsigned long long *)p; break; + default: + barrier(); + __builtin_memcpy((void *)res, (const void *)p, size); + barrier(); + } } static __always_inline void __write_once_size(volatile void *p, void *res, int size) -- cgit From 81931012bd7dc52fadf2b720605fce8a7148d4a7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 27 Nov 2022 19:43:47 -0800 Subject: tools/virtio: remove smp_read_barrier_depends() This gets rid of the last references to smp_read_barrier_depends() which for the kernel side was removed in v5.9. The serialization required for Alpha is done inside READ_ONCE() instead of having users deal with it. Simply use a full barrier, the architecture does not have rmb in the first place. Signed-off-by: Davidlohr Bueso Message-Id: <20221128034347.990-3-dave@stgolabs.net> Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Xuan Zhuo --- tools/virtio/ringtest/main.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tools/virtio/ringtest/main.h b/tools/virtio/ringtest/main.h index 9ed09caa659e..b68920d52750 100644 --- a/tools/virtio/ringtest/main.h +++ b/tools/virtio/ringtest/main.h @@ -140,12 +140,6 @@ static inline void busy_wait(void) #define smp_wmb() smp_release() #endif -#ifdef __alpha__ -#define smp_read_barrier_depends() smp_acquire() -#else -#define smp_read_barrier_depends() do {} while(0) -#endif - static __always_inline void __read_once_size(const volatile void *p, void *res, int size) { @@ -175,13 +169,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s } } +#ifdef __alpha__ #define READ_ONCE(x) \ ({ \ union { typeof(x) __val; char __c[1]; } __u; \ __read_once_size(&(x), __u.__c, sizeof(x)); \ - smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ + smp_mb(); /* Enforce dependency ordering from x */ \ __u.__val; \ }) +#else +#define READ_ONCE(x) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u; \ + __read_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) +#endif #define WRITE_ONCE(x, val) \ ({ \ -- cgit From 937c783aa3d8d77963ec91918d3298edb45b9161 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 28 Nov 2022 07:57:15 -0800 Subject: vduse: Validate vq_num in vduse_validate_config() Add a limit to 'config->vq_num' which is user controlled data which comes from an vduse_ioctl to prevent large memory allocations. Micheal says - This limit is somewhat arbitrary. However, currently virtio pci and ccw are limited to a 16 bit vq number. While MMIO isn't it is also isn't used with lots of VQs due to current lack of support for per-vq interrupts. Thus, the 0xffff limit on number of VQs corresponding to a 16-bit VQ number seems sufficient for now. This is found using static analysis with smatch. Suggested-by: Michael S. Tsirkin Signed-off-by: Harshit Mogalapalli Message-Id: <20221128155717.2579992-1-harshit.m.mogalapalli@oracle.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa_user/vduse_dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 0dd3c1f291da..0c3b48616a9f 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1440,6 +1440,9 @@ static bool vduse_validate_config(struct vduse_dev_config *config) if (config->config_size > PAGE_SIZE) return false; + if (config->vq_num > 0xffff) + return false; + if (!device_is_allowed(config->device_id)) return false; -- cgit From ed843d6ed7310a27cf7c8ee0a82a482eed0cb4a6 Mon Sep 17 00:00:00 2001 From: Rong Wang Date: Wed, 7 Dec 2022 20:08:13 +0800 Subject: vdpa/vp_vdpa: fix kfree a wrong pointer in vp_vdpa_remove In vp_vdpa_remove(), the code kfree(&vp_vdpa_mgtdev->mgtdev.id_table) uses a reference of pointer as the argument of kfree, which is the wrong pointer and then may hit crash like this: Unable to handle kernel paging request at virtual address 00ffff003363e30c Internal error: Oops: 96000004 [#1] SMP Call trace: rb_next+0x20/0x5c ext4_readdir+0x494/0x5c4 [ext4] iterate_dir+0x168/0x1b4 __se_sys_getdents64+0x68/0x170 __arm64_sys_getdents64+0x24/0x30 el0_svc_common.constprop.0+0x7c/0x1bc do_el0_svc+0x2c/0x94 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb4 el0_sync+0x160/0x180 Code: 54000220 f9400441 b4000161 aa0103e0 (f9400821) SMP: stopping secondary CPUs Starting crashdump kernel... 
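A tiny userspace sketch of the pointer mistake (plain C; malloc/free stand in for the driver's allocation and kfree, and the struct and field names are made up for illustration): id_table points at a heap allocation, while &mgtdev.id_table is the address of the pointer field inside the containing object, so freeing the latter hands the allocator a bogus address.

#include <stdlib.h>

struct mgmt_dev { const int *id_table; };

int main(void)
{
	struct mgmt_dev mgtdev = { .id_table = malloc(4 * sizeof(int)) };

	free((void *)mgtdev.id_table);	/* correct: frees the heap allocation */
	/* free(&mgtdev.id_table); */	/* wrong: address of the pointer field itself */
	return 0;
}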
Fixes: ffbda8e9df10 ("vdpa/vp_vdpa : add vdpa tool support in vp_vdpa") Signed-off-by: Rong Wang Signed-off-by: Nanyong Sun Message-Id: <20221207120813.2837529-1-sunnanyong@huawei.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Cindy Lu Acked-by: Jason Wang --- drivers/vdpa/virtio_pci/vp_vdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index d448db0c4de3..8fe267ca3e76 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -647,7 +647,7 @@ static void vp_vdpa_remove(struct pci_dev *pdev) mdev = vp_vdpa_mgtdev->mdev; vp_modern_remove(mdev); vdpa_mgmtdev_unregister(&vp_vdpa_mgtdev->mgtdev); - kfree(&vp_vdpa_mgtdev->mgtdev.id_table); + kfree(vp_vdpa_mgtdev->mgtdev.id_table); kfree(mdev); kfree(vp_vdpa_mgtdev); } -- cgit From 1c96d5457f7251d1c62aacc04921557d56fc049a Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 7 Sep 2022 14:01:10 +0800 Subject: vdpa: conditionally fill max max queue pair for stats For the device without multiqueue feature, we will read 0 as max_virtqueue_pairs from the config. So if we fill VDPA_ATTR_DEV_NET_CFG_MAX_VQP with the value we read from the config we will confuse the user. Fixing this by only filling the value when multiqueue is offered by the device so userspace can assume 1 when the attr is not provided. Fixes: 13b00b135665c("vdpa: Add support for querying vendor statistics") Cc: Eli Cohen Signed-off-by: Jason Wang Message-Id: <20220907060110.4511-1-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen --- drivers/vdpa/vdpa.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 41ed56362992..8ef7aa1365cc 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -935,7 +935,6 @@ static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg, { struct virtio_net_config config = {}; u64 features; - u16 max_vqp; u8 status; int err; @@ -946,15 +945,15 @@ static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg, } vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); - max_vqp = __virtio16_to_cpu(true, config.max_virtqueue_pairs); - if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, max_vqp)) - return -EMSGSIZE; - features = vdev->config->get_driver_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features, VDPA_ATTR_PAD)) return -EMSGSIZE; + err = vdpa_dev_net_mq_config_fill(msg, features, &config); + if (err) + return err; + if (nla_put_u32(msg, VDPA_ATTR_DEV_QUEUE_INDEX, index)) return -EMSGSIZE; -- cgit From 0b7a04a30eef20e6b24926a45c0ce7906ae85bd6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 13 Dec 2022 17:07:17 +0800 Subject: vdpasim: fix memory leak when freeing IOTLBs After commit bda324fd037a ("vdpasim: control virtqueue support"), vdpasim->iommu became an array of IOTLB, so we should clean the mappings of each free one by one instead of just deleting the ranges in the first IOTLB which may leak maps. Fixes: bda324fd037a ("vdpasim: control virtqueue support") Cc: Gautam Dawar Signed-off-by: Jason Wang Message-Id: <20221213090717.61529-1-jasowang@redhat.com> Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Gautam Dawar --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index b20689f8fe89..cb88891b44a8 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -689,7 +689,9 @@ static void vdpasim_free(struct vdpa_device *vdpa) } kvfree(vdpasim->buffer); - vhost_iotlb_free(vdpasim->iommu); + for (i = 0; i < vdpasim->dev_attr.nas; i++) + vhost_iotlb_reset(&vdpasim->iommu[i]); + kfree(vdpasim->iommu); kfree(vdpasim->vqs); kfree(vdpasim->config); } -- cgit From 72455a1142527e607e1d69439f3ffa2ef6d09e26 Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Wed, 14 Dec 2022 13:43:06 +0800 Subject: vdpa_sim_net: should not drop the multicast/broadcast packet In the receive_filter(), should not drop the packet with the broadcast/multicast address. Add the check for this Signed-off-by: Cindy Lu Message-Id: <20221214054306.24145-1-lulu@redhat.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index 11f5a121df24..584b975a98a7 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -62,6 +62,9 @@ static bool receive_filter(struct vdpasim *vdpasim, size_t len) if (len < ETH_ALEN + hdr_len) return false; + if (is_broadcast_ether_addr(vdpasim->buffer + hdr_len) || + is_multicast_ether_addr(vdpasim->buffer + hdr_len)) + return true; if (!strncmp(vdpasim->buffer + hdr_len, vio_config->mac, ETH_ALEN)) return true; -- cgit From a26116c1e74028914f281851488546c91cbae57d Mon Sep 17 00:00:00 2001 From: Rafael Mendonca Date: Fri, 21 Oct 2022 17:41:26 -0300 Subject: virtio_blk: Fix signedness bug in virtblk_prep_rq() The virtblk_map_data() function returns negative error codes, however, the 'nents' field of vbr->sg_table is an unsigned int, which causes the error handling not to work correctly. Cc: stable@vger.kernel.org Fixes: 0e9911fa768f ("virtio-blk: support mq_ops->queue_rqs()") Signed-off-by: Rafael Mendonca Message-Id: <20221021204126.927603-1-rafaelmendsr@gmail.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella Reviewed-by: Suwan Kim Reviewed-by: Stefan Hajnoczi Acked-by: Jason Wang --- drivers/block/virtio_blk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index dcbf86cd2155..6a77fa917428 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -334,14 +334,16 @@ static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx, struct virtblk_req *vbr) { blk_status_t status; + int num; status = virtblk_setup_cmd(vblk->vdev, req, vbr); if (unlikely(status)) return status; - vbr->sg_table.nents = virtblk_map_data(hctx, req, vbr); - if (unlikely(vbr->sg_table.nents < 0)) + num = virtblk_map_data(hctx, req, vbr); + if (unlikely(num < 0)) return virtblk_fail_to_queue(req, -ENOMEM); + vbr->sg_table.nents = num; blk_mq_start_request(req); -- cgit From 9deb1e9fb88b1120a908676fa33bdf9e2eeaefce Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Mon, 26 Dec 2022 14:48:23 +0300 Subject: net/ethtool/ioctl: return -EOPNOTSUPP if we have no phy stats It's not very useful to copy back an empty ethtool_stats struct and return 0 if we didn't actually have any stats. 
This also allows for further simplification of this function in the future commits. Signed-off-by: Daniil Tatianin Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index c2f1a542e6fa..932fa8225b2f 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2099,7 +2099,8 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) return -ENOMEM; - WARN_ON_ONCE(!n_stats); + if (WARN_ON_ONCE(!n_stats)) + return -EOPNOTSUPP; if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; -- cgit From fd4778581d61d8848b532f8cdc9b325138748437 Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Mon, 26 Dec 2022 14:48:24 +0300 Subject: net/ethtool/ioctl: remove if n_stats checks from ethtool_get_phy_stats Now that we always early return if we don't have any stats we can remove these checks as they're no longer necessary. Signed-off-by: Daniil Tatianin Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 932fa8225b2f..85f0cffdcec8 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2107,28 +2107,24 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) stats.n_stats = n_stats; - if (n_stats) { - data = vzalloc(array_size(n_stats, sizeof(u64))); - if (!data) - return -ENOMEM; + data = vzalloc(array_size(n_stats, sizeof(u64))); + if (!data) + return -ENOMEM; - if (phydev && !ops->get_ethtool_phy_stats && - phy_ops && phy_ops->get_stats) { - ret = phy_ops->get_stats(phydev, &stats, data); - if (ret < 0) - goto out; - } else { - ops->get_ethtool_phy_stats(dev, &stats, data); - } + if (phydev && !ops->get_ethtool_phy_stats && + phy_ops && phy_ops->get_stats) { + ret = phy_ops->get_stats(phydev, &stats, data); + if (ret < 0) + goto out; } else { - data = NULL; + ops->get_ethtool_phy_stats(dev, &stats, data); } ret = -EFAULT; if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); - if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) + if (copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) goto out; ret = 0; -- cgit From 201ed315f9676809cd5b20a39206e964106d4f27 Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Mon, 26 Dec 2022 14:48:25 +0300 Subject: net/ethtool/ioctl: split ethtool_get_phy_stats into multiple helpers So that it's easier to follow and make sense of the branching and various conditions. Stats retrieval has been split into two separate functions ethtool_get_phy_stats_phydev & ethtool_get_phy_stats_ethtool. The former attempts to retrieve the stats using phydev & phy_ops, while the latter uses ethtool_ops. Actual n_stats validation & array allocation has been moved into a new ethtool_vzalloc_stats_array helper. This also fixes a potential NULL dereference of ops->get_ethtool_phy_stats where it was getting called in an else branch unconditionally without making sure it was actually present. Found by Linux Verification Center (linuxtesting.org) with the SVACE static analysis tool. Signed-off-by: Daniil Tatianin Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- net/ethtool/ioctl.c | 102 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 33 deletions(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 85f0cffdcec8..646b3e490c71 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2078,23 +2078,8 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) return ret; } -static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) +static int ethtool_vzalloc_stats_array(int n_stats, u64 **data) { - const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; - const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - struct ethtool_stats stats; - u64 *data; - int ret, n_stats; - - if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count)) - return -EOPNOTSUPP; - - if (phydev && !ops->get_ethtool_phy_stats && - phy_ops && phy_ops->get_sset_count) - n_stats = phy_ops->get_sset_count(phydev); - else - n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) @@ -2102,31 +2087,82 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (WARN_ON_ONCE(!n_stats)) return -EOPNOTSUPP; + *data = vzalloc(array_size(n_stats, sizeof(u64))); + if (!*data) + return -ENOMEM; + + return 0; +} + +static int ethtool_get_phy_stats_phydev(struct phy_device *phydev, + struct ethtool_stats *stats, + u64 **data) + { + const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; + int n_stats, ret; + + if (!phy_ops || !phy_ops->get_sset_count || !phy_ops->get_stats) + return -EOPNOTSUPP; + + n_stats = phy_ops->get_sset_count(phydev); + + ret = ethtool_vzalloc_stats_array(n_stats, data); + if (ret) + return ret; + + stats->n_stats = n_stats; + return phy_ops->get_stats(phydev, stats, *data); +} + +static int ethtool_get_phy_stats_ethtool(struct net_device *dev, + struct ethtool_stats *stats, + u64 **data) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + int n_stats, ret; + + if (!ops || !ops->get_sset_count || ops->get_ethtool_phy_stats) + return -EOPNOTSUPP; + + n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); + + ret = ethtool_vzalloc_stats_array(n_stats, data); + if (ret) + return ret; + + stats->n_stats = n_stats; + ops->get_ethtool_phy_stats(dev, stats, *data); + + return 0; +} + +static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) +{ + struct phy_device *phydev = dev->phydev; + struct ethtool_stats stats; + u64 *data = NULL; + int ret = -EOPNOTSUPP; + if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; - stats.n_stats = n_stats; + if (phydev) + ret = ethtool_get_phy_stats_phydev(phydev, &stats, &data); - data = vzalloc(array_size(n_stats, sizeof(u64))); - if (!data) - return -ENOMEM; + if (ret == -EOPNOTSUPP) + ret = ethtool_get_phy_stats_ethtool(dev, &stats, &data); - if (phydev && !ops->get_ethtool_phy_stats && - phy_ops && phy_ops->get_stats) { - ret = phy_ops->get_stats(phydev, &stats, data); - if (ret < 0) - goto out; - } else { - ops->get_ethtool_phy_stats(dev, &stats, data); - } + if (ret) + goto out; - ret = -EFAULT; - if (copy_to_user(useraddr, &stats, sizeof(stats))) + if (copy_to_user(useraddr, &stats, sizeof(stats))) { + ret = -EFAULT; goto out; + } + useraddr += sizeof(stats); - if (copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) - goto out; - ret = 0; + if (copy_to_user(useraddr, data, array_size(stats.n_stats, sizeof(u64)))) + ret = -EFAULT; 
out: vfree(data); -- cgit From ad425666a1f05d9b215a84cf010c3789b2ea8206 Mon Sep 17 00:00:00 2001 From: Chunhao Lin Date: Mon, 26 Dec 2022 20:31:52 +0800 Subject: r8169: move rtl_wol_enable_rx() and rtl_prepare_power_down() There is no functional change. Moving these two functions in preparation for the following patch "r8169: fix dmar pte write access is not set error". Signed-off-by: Chunhao Lin Reviewed-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 44 +++++++++++++++---------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index a9dcc98b6af1..acc2500342ca 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2210,28 +2210,6 @@ static int rtl_set_mac_address(struct net_device *dev, void *p) return 0; } -static void rtl_wol_enable_rx(struct rtl8169_private *tp) -{ - if (tp->mac_version >= RTL_GIGA_MAC_VER_25) - RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) | - AcceptBroadcast | AcceptMulticast | AcceptMyPhys); -} - -static void rtl_prepare_power_down(struct rtl8169_private *tp) -{ - if (tp->dash_type != RTL_DASH_NONE) - return; - - if (tp->mac_version == RTL_GIGA_MAC_VER_32 || - tp->mac_version == RTL_GIGA_MAC_VER_33) - rtl_ephy_write(tp, 0x19, 0xff64); - - if (device_may_wakeup(tp_to_dev(tp))) { - phy_speed_down(tp->phydev, false); - rtl_wol_enable_rx(tp); - } -} - static void rtl_init_rxcfg(struct rtl8169_private *tp) { switch (tp->mac_version) { @@ -2455,6 +2433,28 @@ static void rtl_enable_rxdvgate(struct rtl8169_private *tp) rtl_wait_txrx_fifo_empty(tp); } +static void rtl_wol_enable_rx(struct rtl8169_private *tp) +{ + if (tp->mac_version >= RTL_GIGA_MAC_VER_25) + RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) | + AcceptBroadcast | AcceptMulticast | AcceptMyPhys); +} + +static void rtl_prepare_power_down(struct rtl8169_private *tp) +{ + if (tp->dash_type != RTL_DASH_NONE) + return; + + if (tp->mac_version == RTL_GIGA_MAC_VER_32 || + tp->mac_version == RTL_GIGA_MAC_VER_33) + rtl_ephy_write(tp, 0x19, 0xff64); + + if (device_may_wakeup(tp_to_dev(tp))) { + phy_speed_down(tp->phydev, false); + rtl_wol_enable_rx(tp); + } +} + static void rtl_set_tx_config_registers(struct rtl8169_private *tp) { u32 val = TX_DMA_BURST << TxDMAShift | -- cgit From bb41c13c05c23d9bc46b4e37d8914078c6a40e3a Mon Sep 17 00:00:00 2001 From: Chunhao Lin Date: Mon, 26 Dec 2022 20:31:53 +0800 Subject: r8169: fix dmar pte write access is not set error When the device is closed, if wol is enabled, rx is left enabled. When the device is opened again, rx packets will be DMAed to the wrong memory address after pci_set_master(), and the system log will show the messages below. DMAR: DRHD: handling fault status reg 3 DMAR: [DMA Write] Request device [02:00.0] PASID ffffffff fault addr ffdd4000 [fault reason 05] PTE Write access is not set In this patch, the driver disables tx/rx when the device is closed. If wol is enabled, it only enables the rx filter and disables rxdv_gate (if supported) so that the hardware only receives packets into its FIFO but does not DMA them. Signed-off-by: Chunhao Lin Reviewed-by: Heiner Kallweit Signed-off-by: David S.
Miller --- drivers/net/ethernet/realtek/r8169_main.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index acc2500342ca..24592d972523 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2438,6 +2438,9 @@ static void rtl_wol_enable_rx(struct rtl8169_private *tp) if (tp->mac_version >= RTL_GIGA_MAC_VER_25) RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) | AcceptBroadcast | AcceptMulticast | AcceptMyPhys); + + if (tp->mac_version >= RTL_GIGA_MAC_VER_40) + rtl_disable_rxdvgate(tp); } static void rtl_prepare_power_down(struct rtl8169_private *tp) @@ -3872,7 +3875,7 @@ static void rtl8169_tx_clear(struct rtl8169_private *tp) netdev_reset_queue(tp->dev); } -static void rtl8169_cleanup(struct rtl8169_private *tp, bool going_down) +static void rtl8169_cleanup(struct rtl8169_private *tp) { napi_disable(&tp->napi); @@ -3884,9 +3887,6 @@ static void rtl8169_cleanup(struct rtl8169_private *tp, bool going_down) rtl_rx_close(tp); - if (going_down && tp->dev->wol_enabled) - goto no_reset; - switch (tp->mac_version) { case RTL_GIGA_MAC_VER_28: case RTL_GIGA_MAC_VER_31: @@ -3907,7 +3907,7 @@ static void rtl8169_cleanup(struct rtl8169_private *tp, bool going_down) } rtl_hw_reset(tp); -no_reset: + rtl8169_tx_clear(tp); rtl8169_init_ring_indexes(tp); } @@ -3918,7 +3918,7 @@ static void rtl_reset_work(struct rtl8169_private *tp) netif_stop_queue(tp->dev); - rtl8169_cleanup(tp, false); + rtl8169_cleanup(tp); for (i = 0; i < NUM_RX_DESC; i++) rtl8169_mark_to_asic(tp->RxDescArray + i); @@ -4605,7 +4605,7 @@ static void rtl8169_down(struct rtl8169_private *tp) pci_clear_master(tp->pci_dev); rtl_pci_commit(tp); - rtl8169_cleanup(tp, true); + rtl8169_cleanup(tp); rtl_disable_exit_l1(tp); rtl_prepare_power_down(tp); } -- cgit From c2052189f19bd98c80b5d46dc6e42330d2b3b35d Mon Sep 17 00:00:00 2001 From: Xuezhi Zhang Date: Tue, 27 Dec 2022 19:03:52 +0800 Subject: s390/qeth: convert sysfs snprintf to sysfs_emit Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: Xuezhi Zhang Signed-off-by: David S. 
Miller --- drivers/s390/net/qeth_core_sys.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/s390/net/qeth_core_sys.c b/drivers/s390/net/qeth_core_sys.c index 406be169173c..d1adc4b83193 100644 --- a/drivers/s390/net/qeth_core_sys.c +++ b/drivers/s390/net/qeth_core_sys.c @@ -410,13 +410,13 @@ static ssize_t qeth_dev_isolation_show(struct device *dev, switch (card->options.isolation) { case ISOLATION_MODE_NONE: - return snprintf(buf, 6, "%s\n", ATTR_QETH_ISOLATION_NONE); + return sysfs_emit(buf, "%s\n", ATTR_QETH_ISOLATION_NONE); case ISOLATION_MODE_FWD: - return snprintf(buf, 9, "%s\n", ATTR_QETH_ISOLATION_FWD); + return sysfs_emit(buf, "%s\n", ATTR_QETH_ISOLATION_FWD); case ISOLATION_MODE_DROP: - return snprintf(buf, 6, "%s\n", ATTR_QETH_ISOLATION_DROP); + return sysfs_emit(buf, "%s\n", ATTR_QETH_ISOLATION_DROP); default: - return snprintf(buf, 5, "%s\n", "N/A"); + return sysfs_emit(buf, "%s\n", "N/A"); } } @@ -500,9 +500,9 @@ static ssize_t qeth_hw_trap_show(struct device *dev, struct qeth_card *card = dev_get_drvdata(dev); if (card->info.hwtrap) - return snprintf(buf, 5, "arm\n"); + return sysfs_emit(buf, "arm\n"); else - return snprintf(buf, 8, "disarm\n"); + return sysfs_emit(buf, "disarm\n"); } static ssize_t qeth_hw_trap_store(struct device *dev, -- cgit From 40cab44b9089a41f71bbd0eff753eb91d5dafd68 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 27 Dec 2022 11:04:59 -0300 Subject: net/sched: fix retpoline wrapper compilation on configs without tc filters Rudi reports a compilation failure on x86_64 when CONFIG_NET_CLS or CONFIG_NET_CLS_ACT is not set but CONFIG_RETPOLINE is set. A misplaced '#endif' was causing the issue. Fixes: 7f0e810220e2 ("net/sched: add retpoline wrapper for tc") Tested-by: Rudi Heitbaum Signed-off-by: Pedro Tammela Signed-off-by: David S. Miller --- include/net/tc_wrapper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h index ceed2fc089ff..d323fffb839a 100644 --- a/include/net/tc_wrapper.h +++ b/include/net/tc_wrapper.h @@ -216,6 +216,8 @@ skip: return tp->classify(skb, tp, res); } +#endif /* CONFIG_NET_CLS */ + static inline void tc_wrapper_init(void) { #ifdef CONFIG_X86 @@ -224,8 +226,6 @@ static inline void tc_wrapper_init(void) #endif } -#endif /* CONFIG_NET_CLS */ - #else #define TC_INDIRECT_SCOPE static -- cgit From 090ddad4c7a9fefd647c762093a555870a19c8b2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 28 Dec 2022 13:57:14 +0100 Subject: ALSA: hda/hdmi: Static PCM mapping again with AMD HDMI codecs The recent code refactoring for HD-audio HDMI codec driver caused a regression on AMD/ATI HDMI codecs; namely, PulseAudio and pipewire don't recognize HDMI outputs any longer while the direct output via ALSA raw access still works. The problem turned out to be that, after the code refactoring, the driver assumes only the dynamic PCM assignment, and when a PCM stream that still isn't assigned to any pin gets opened, the driver tries to assign any free converter to the PCM stream. This behavior is OK for Intel and other codecs, as they have arbitrary connections between pins and converters. OTOH, on AMD chips that have a 1:1 mapping between pins and converters, this may end up blocking the open of the next PCM stream for the pin that is tied with the formerly taken converter. Also, with the code refactoring, more PCM streams are exposed than necessary as we assume all converters can be used, while this isn't true for the AMD case.
This may change the PCM stream assignment and confuse users as well. This patch fixes those problems by: - Introducing a flag spec->static_pcm_mapping, and if it's set, the driver applies the static mapping between pins and converters at the probe time - Limiting the number of PCM streams per pins, too; this avoids the superfluous PCM streams Fixes: ef6f5494faf6 ("ALSA: hda/hdmi: Use only dynamic PCM device allocation") Cc: Link: https://bugzilla.kernel.org/show_bug.cgi?id=216836 Co-developed-by: Jaroslav Kysela Signed-off-by: Jaroslav Kysela Link: https://lore.kernel.org/r/20221228125714.16329-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_hdmi.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 8015e4471267..386dd9d9143f 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -167,6 +167,7 @@ struct hdmi_spec { struct hdmi_ops ops; bool dyn_pin_out; + bool static_pcm_mapping; /* hdmi interrupt trigger control flag for Nvidia codec */ bool hdmi_intr_trig_ctrl; bool nv_dp_workaround; /* workaround DP audio infoframe for Nvidia */ @@ -1525,13 +1526,16 @@ static void update_eld(struct hda_codec *codec, */ pcm_jack = pin_idx_to_pcm_jack(codec, per_pin); - if (eld->eld_valid) { - hdmi_attach_hda_pcm(spec, per_pin); - hdmi_pcm_setup_pin(spec, per_pin); - } else { - hdmi_pcm_reset_pin(spec, per_pin); - hdmi_detach_hda_pcm(spec, per_pin); + if (!spec->static_pcm_mapping) { + if (eld->eld_valid) { + hdmi_attach_hda_pcm(spec, per_pin); + hdmi_pcm_setup_pin(spec, per_pin); + } else { + hdmi_pcm_reset_pin(spec, per_pin); + hdmi_detach_hda_pcm(spec, per_pin); + } } + /* if pcm_idx == -1, it means this is in monitor connection event * we can get the correct pcm_idx now. */ @@ -2281,8 +2285,8 @@ static int generic_hdmi_build_pcms(struct hda_codec *codec) struct hdmi_spec *spec = codec->spec; int idx, pcm_num; - /* limit the PCM devices to the codec converters */ - pcm_num = spec->num_cvts; + /* limit the PCM devices to the codec converters or available PINs */ + pcm_num = min(spec->num_cvts, spec->num_pins); codec_dbg(codec, "hdmi: pcm_num set to %d\n", pcm_num); for (idx = 0; idx < pcm_num; idx++) { @@ -2379,6 +2383,11 @@ static int generic_hdmi_build_controls(struct hda_codec *codec) struct hdmi_spec_per_pin *per_pin = get_pin(spec, pin_idx); struct hdmi_eld *pin_eld = &per_pin->sink_eld; + if (spec->static_pcm_mapping) { + hdmi_attach_hda_pcm(spec, per_pin); + hdmi_pcm_setup_pin(spec, per_pin); + } + pin_eld->eld_valid = false; hdmi_present_sense(per_pin, 0); } @@ -4419,6 +4428,8 @@ static int patch_atihdmi(struct hda_codec *codec) spec = codec->spec; + spec->static_pcm_mapping = true; + spec->ops.pin_get_eld = atihdmi_pin_get_eld; spec->ops.pin_setup_infoframe = atihdmi_pin_setup_infoframe; spec->ops.pin_hbr_setup = atihdmi_pin_hbr_setup; -- cgit From 8ca4fc323d2e4ab9dabbdd57633af40b0c7e6af9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2022 11:09:55 +0100 Subject: docs, nvme: add a feature and quirk policy document This adds a document about what specification features are supported by the Linux NVMe driver, and what qualifies for a quirk if an implementation has problems following the specification. 
Signed-off-by: Jens Axboe Signed-off-by: Keith Busch Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Reviewed-by: Randy Dunlap Acked-by: Jonathan Corbet --- .../maintainer/maintainer-entry-profile.rst | 1 + Documentation/nvme/feature-and-quirk-policy.rst | 77 ++++++++++++++++++++++ MAINTAINERS | 1 + 3 files changed, 79 insertions(+) create mode 100644 Documentation/nvme/feature-and-quirk-policy.rst diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst index 93b2ae6c34a9..cfd37f31077f 100644 --- a/Documentation/maintainer/maintainer-entry-profile.rst +++ b/Documentation/maintainer/maintainer-entry-profile.rst @@ -104,3 +104,4 @@ to do something different in the near future. ../riscv/patch-acceptance ../driver-api/media/maintainer-entry-profile ../driver-api/vfio-pci-device-specific-driver-acceptance + ../nvme/feature-and-quirk-policy diff --git a/Documentation/nvme/feature-and-quirk-policy.rst b/Documentation/nvme/feature-and-quirk-policy.rst new file mode 100644 index 000000000000..c01d836d8e41 --- /dev/null +++ b/Documentation/nvme/feature-and-quirk-policy.rst @@ -0,0 +1,77 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================= +Linux NVMe feature and and quirk policy +======================================= + +This file explains the policy used to decide what is supported by the +Linux NVMe driver and what is not. + + +Introduction +============ + +NVM Express is an open collection of standards and information. + +The Linux NVMe host driver in drivers/nvme/host/ supports devices +implementing the NVM Express (NVMe) family of specifications, which +currently consists of a number of documents: + + - the NVMe Base specification + - various Command Set specifications (e.g. NVM Command Set) + - various Transport specifications (e.g. PCIe, Fibre Channel, RDMA, TCP) + - the NVMe Management Interface specification + +See https://nvmexpress.org/developers/ for the NVMe specifications. + + +Supported features +================== + +NVMe is a large suite of specifications, and contains features that are only +useful or suitable for specific use-cases. It is important to note that Linux +does not aim to implement every feature in the specification. Every additional +feature implemented introduces more code, more maintenance and potentially more +bugs. Hence there is an inherent tradeoff between functionality and +maintainability of the NVMe host driver. + +Any feature implemented in the Linux NVMe host driver must support the +following requirements: + + 1. The feature is specified in a release version of an official NVMe + specification, or in a ratified Technical Proposal (TP) that is + available on NVMe website. Or if it is not directly related to the + on-wire protocol, does not contradict any of the NVMe specifications. + 2. Does not conflict with the Linux architecture, nor the design of the + NVMe host driver. + 3. Has a clear, indisputable value-proposition and a wide consensus across + the community. + +Vendor specific extensions are generally not supported in the NVMe host +driver. + +It is strongly recommended to work with the Linux NVMe and block layer +maintainers and get feedback on specification changes that are intended +to be used by the Linux NVMe host driver in order to avoid conflict at a +later stage. + + +Quirks +====== + +Sometimes implementations of open standards fail to correctly implement parts +of the standards. 
Linux uses identifier-based quirks to work around such +implementation bugs. The intent of quirks is to deal with widely available +hardware, usually consumer, which Linux users can't use without these quirks. +Typically these implementations are not or only superficially tested with Linux +by the hardware manufacturer. + +The Linux NVMe maintainers decide ad hoc whether to quirk implementations +based on the impact of the problem to Linux users and how it impacts +maintainability of the driver. In general quirks are a last resort, if no +firmware updates or other workarounds are available from the vendor. + +Quirks will not be added to the Linux kernel for hardware that isn't available +on the mass market. Hardware that fails qualification for enterprise Linux +distributions, ChromeOS, Android or other consumers of the Linux kernel +should be fixed before it is shipped instead of relying on Linux quirks. diff --git a/MAINTAINERS b/MAINTAINERS index bb77a3ed9d54..d53b3a6cdc67 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14827,6 +14827,7 @@ L: linux-nvme@lists.infradead.org S: Supported W: http://git.infradead.org/nvme.git T: git://git.infradead.org/nvme.git +F: Documentation/nvme/ F: drivers/nvme/host/ F: drivers/nvme/common/ F: include/linux/nvme* -- cgit From 685e6311637e46f3212439ce2789f8a300e5050f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Dec 2022 10:30:45 +0100 Subject: nvme: fix the NVME_CMD_EFFECTS_CSE_MASK definition 3 << 16 does not generate the correct mask for bits 16, 17 and 18. Use the GENMASK macro to generate the correct mask instead. Fixes: 84fef62d135b ("nvme: check admin passthru command effects") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi --- include/linux/nvme.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d6be2a686100..d1cd53f2b6ab 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -7,6 +7,7 @@ #ifndef _LINUX_NVME_H #define _LINUX_NVME_H +#include #include #include @@ -639,7 +640,7 @@ enum { NVME_CMD_EFFECTS_NCC = 1 << 2, NVME_CMD_EFFECTS_NIC = 1 << 3, NVME_CMD_EFFECTS_CCC = 1 << 4, - NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, + NVME_CMD_EFFECTS_CSE_MASK = GENMASK(18, 16), NVME_CMD_EFFECTS_UUID_SEL = 1 << 19, }; -- cgit From 61f37154c599cf9f2f84dcbd9be842f8645a7099 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2022 15:20:04 +0100 Subject: nvmet: use NVME_CMD_EFFECTS_CSUPP instead of open coding it Use NVME_CMD_EFFECTS_CSUPP instead of open coding it and assign a single value to multiple array entries instead of repeated assignments. 
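As an aside on the NVME_CMD_EFFECTS_CSE_MASK change a little further above: the mask arithmetic can be sanity-checked with a throwaway user-space program. This is a minimal sketch, not part of any patch, and it uses a simplified 64-bit-only stand-in for the kernel's GENMASK():

#include <stdio.h>

/* Simplified, 64-bit-only stand-in for the kernel's GENMASK() macro,
 * good enough to illustrate the point. */
#define GENMASK(h, l) ((~0UL << (l)) & (~0UL >> (63 - (h))))

int main(void)
{
	printf("3 << 16        = 0x%lx\n", 3UL << 16);       /* 0x30000 - bits 16 and 17 only */
	printf("GENMASK(18,16) = 0x%lx\n", GENMASK(18, 16)); /* 0x70000 - bits 16, 17 and 18 */
	return 0;
}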
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/target/admin-cmd.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 53a004ea320c..111a5cb6403f 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -164,26 +164,29 @@ out: static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log) { - log->acs[nvme_admin_get_log_page] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_identify] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_abort_cmd] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_set_features] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_get_features] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_async_event] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_keep_alive] = cpu_to_le32(1 << 0); - - log->iocs[nvme_cmd_read] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_write] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_flush] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_dsm] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_get_log_page] = + log->acs[nvme_admin_identify] = + log->acs[nvme_admin_abort_cmd] = + log->acs[nvme_admin_set_features] = + log->acs[nvme_admin_get_features] = + log->acs[nvme_admin_async_event] = + log->acs[nvme_admin_keep_alive] = + cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); + + log->iocs[nvme_cmd_read] = + log->iocs[nvme_cmd_write] = + log->iocs[nvme_cmd_flush] = + log->iocs[nvme_cmd_dsm] = + log->iocs[nvme_cmd_write_zeroes] = + cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); } static void nvmet_get_cmd_effects_zns(struct nvme_effects_log *log) { - log->iocs[nvme_cmd_zone_append] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_zone_mgmt_send] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_zone_mgmt_recv] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_zone_append] = + log->iocs[nvme_cmd_zone_mgmt_send] = + log->iocs[nvme_cmd_zone_mgmt_recv] = + cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); } static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) -- cgit From f2d1421391bba0b15684d2379a47a089f0e561d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2022 15:20:56 +0100 Subject: nvmet: set the LBCC bit for commands that modify data Write, Write Zeroes, Zone append and a Zone Reset through Zone Management Send modify the logical block content of a namespace, so make sure the LBCC bit is reported for them. 
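For readers unfamiliar with the effects-log layout, the sketch below shows how a consumer of the log might test the LBCC bit for an I/O opcode. It is a hedged, user-space illustration only: the constants are simplified stand-ins patterned after the kernel's NVME_CMD_EFFECTS_* bits, not the real definitions.

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for the effects-log bits discussed in these patches. */
#define CMD_EFFECTS_CSUPP (1u << 0)  /* Command Supported */
#define CMD_EFFECTS_LBCC  (1u << 1)  /* Logical Block Content Change */

static int changes_lba_content(uint32_t effects)
{
	return (effects & CMD_EFFECTS_LBCC) != 0;
}

int main(void)
{
	uint32_t write_effects = CMD_EFFECTS_CSUPP | CMD_EFFECTS_LBCC; /* e.g. Write, Write Zeroes */
	uint32_t read_effects  = CMD_EFFECTS_CSUPP;                    /* e.g. Read */

	printf("write-like opcode changes LBA content: %d\n", changes_lba_content(write_effects));
	printf("read-like  opcode changes LBA content: %d\n", changes_lba_content(read_effects));
	return 0;
}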
Fixes: b5d0b38c0475 ("nvmet: add Command Set Identifier support") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/target/admin-cmd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 111a5cb6403f..6a54ed6fb121 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -174,17 +174,19 @@ static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log) cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); log->iocs[nvme_cmd_read] = - log->iocs[nvme_cmd_write] = log->iocs[nvme_cmd_flush] = log->iocs[nvme_cmd_dsm] = - log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); + log->iocs[nvme_cmd_write] = + log->iocs[nvme_cmd_write_zeroes] = + cpu_to_le32(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC); } static void nvmet_get_cmd_effects_zns(struct nvme_effects_log *log) { log->iocs[nvme_cmd_zone_append] = log->iocs[nvme_cmd_zone_mgmt_send] = + cpu_to_le32(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC); log->iocs[nvme_cmd_zone_mgmt_recv] = cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); } -- cgit From 2a459f6933e1c459bffb7cc73fd6c900edc714bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Dec 2022 09:51:19 +0100 Subject: nvmet: don't defer passthrough commands with trivial effects to the workqueue Mask out the "Command Supported" and "Logical Block Content Change" bits and only defer execution of commands that have non-trivial effects to the workqueue for synchronous execution. This allows to execute admin commands asynchronously on controllers that provide a Command Supported and Effects log page, and will keep allowing to execute Write commands asynchronously once command effects on I/O commands are taken into account. Fixes: c1fef73f793b ("nvmet: add passthru code to process commands") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi --- drivers/nvme/target/passthru.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 79af5140af8b..adc0958755d6 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -334,14 +334,13 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) } /* - * If there are effects for the command we are about to execute, or - * an end_req function we need to use nvme_execute_passthru_rq() - * synchronously in a work item seeing the end_req function and - * nvme_passthru_end() can't be called in the request done callback - * which is typically in interrupt context. + * If a command needs post-execution fixups, or there are any + * non-trivial effects, make sure to execute the command synchronously + * in a workqueue so that nvme_passthru_end gets called. 
*/ effects = nvme_command_effects(ctrl, ns, req->cmd->common.opcode); - if (req->p.use_workqueue || effects) { + if (req->p.use_workqueue || + (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))) { INIT_WORK(&req->p.work, nvmet_passthru_execute_cmd_work); req->p.rq = rq; queue_work(nvmet_wq, &req->p.work); -- cgit From 831ed60c2aca2d7c517b2da22897a90224a97d27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Dec 2022 10:12:17 +0100 Subject: nvme: also return I/O command effects from nvme_command_effects To be able to use the Commands Supported and Effects Log for allowing unprivileged passthrough, it needs to be correctly reported for I/O commands as well. Return the I/O command effects from nvme_command_effects, and also add a default list of effects for the NVM command set. For other command sets, the Commands Supported and Effects log is required to be present already. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Kanchan Joshi --- drivers/nvme/host/core.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cda1361e6d4f..d307ae4d8a57 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1074,6 +1074,18 @@ static u32 nvme_known_admin_effects(u8 opcode) return 0; } +static u32 nvme_known_nvm_effects(u8 opcode) +{ + switch (opcode) { + case nvme_cmd_write: + case nvme_cmd_write_zeroes: + case nvme_cmd_write_uncor: + return NVME_CMD_EFFECTS_LBCC; + default: + return 0; + } +} + u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) { u32 effects = 0; @@ -1081,16 +1093,24 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) if (ns) { if (ns->head->effects) effects = le32_to_cpu(ns->head->effects->iocs[opcode]); + if (ns->head->ids.csi == NVME_CAP_CSS_NVM) + effects |= nvme_known_nvm_effects(opcode); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) dev_warn_once(ctrl->device, - "IO command:%02x has unhandled effects:%08x\n", + "IO command:%02x has unusual effects:%08x\n", opcode, effects); - return 0; - } - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->acs[opcode]); - effects |= nvme_known_admin_effects(opcode); + /* + * NVME_CMD_EFFECTS_CSE_MASK causes a freeze all I/O queues, + * which would deadlock when done on an I/O command. Note that + * We already warn about an unusual effect above. + */ + effects &= ~NVME_CMD_EFFECTS_CSE_MASK; + } else { + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->acs[opcode]); + effects |= nvme_known_admin_effects(opcode); + } return effects; } -- cgit From 6f99ac04c469b5d0a180a4ccea99d25d5dc9d21c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Dec 2022 16:13:38 +0100 Subject: nvme: consult the CSE log page for unprivileged passthrough Commands like Write Zeroes can change the contents of a namespace without actually transferring data. To protect against this, check that the Commands Supported and Effects log is supported by the controller for any unprivileged command passthrough and refuse unprivileged passthrough if the command has any effects that can change data or metadata. Note: While the Commands Supported and Effects log page has only been mandatory since NVMe 2.0, it is widely supported because Windows requires it for any command passthrough from userspace.
Fixes: e4fbcf32c860 ("nvme: identify-namespace without CAP_SYS_ADMIN") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi --- drivers/nvme/host/ioctl.c | 28 ++++++++++++++++++++++++---- include/linux/nvme.h | 1 + 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 9ddda571f046..a8639919237e 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -11,6 +11,8 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, fmode_t mode) { + u32 effects; + if (capable(CAP_SYS_ADMIN)) return true; @@ -43,11 +45,29 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, } /* - * Only allow I/O commands that transfer data to the controller if the - * special file is open for writing, but always allow I/O commands that - * transfer data from the controller. + * Check if the controller provides a Commands Supported and Effects log + * and marks this command as supported. If not reject unprivileged + * passthrough. + */ + effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode); + if (!(effects & NVME_CMD_EFFECTS_CSUPP)) + return false; + + /* + * Don't allow passthrough for command that have intrusive (or unknown) + * effects. + */ + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_UUID_SEL | + NVME_CMD_EFFECTS_SCOPE_MASK)) + return false; + + /* + * Only allow I/O commands that transfer data to the controller or that + * change the logical block contents if the file descriptor is open for + * writing. */ - if (nvme_is_write(c)) + if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) return mode & FMODE_WRITE; return true; } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d1cd53f2b6ab..4fad4aa245fb 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -642,6 +642,7 @@ enum { NVME_CMD_EFFECTS_CCC = 1 << 4, NVME_CMD_EFFECTS_CSE_MASK = GENMASK(18, 16), NVME_CMD_EFFECTS_UUID_SEL = 1 << 19, + NVME_CMD_EFFECTS_SCOPE_MASK = GENMASK(31, 20), }; struct nvme_effects_log { -- cgit From 76807fcd73b818eb9f245ef1035aed34ecdd9813 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 25 Dec 2022 13:28:51 +0200 Subject: nvme-auth: fix smatch warning complaints When initializing auth context, there may be no secrets passed by the user. Make return code explicit when returning successfully. smatch warnings: drivers/nvme/host/auth.c:950 nvme_auth_init_ctrl() warn: missing error code? 'ret' Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index bb0abbe4491c..4424f53a8a0a 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -953,7 +953,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) goto err_free_dhchap_secret; if (!ctrl->opts->dhchap_secret && !ctrl->opts->dhchap_ctrl_secret) - return ret; + return 0; ctrl->dhchap_ctxs = kvcalloc(ctrl_max_dhchaps(ctrl), sizeof(*chap), GFP_KERNEL); -- cgit From 1f0ae22ab470946143485a02cc1cd7e05c0f9120 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 12 Dec 2022 10:42:15 +0200 Subject: net/mlx5: E-Switch, properly handle ingress tagged packets on VST Fix SRIOV VST mode behavior to insert cvlan when a guest tag is already present in the frame. 
Previous VST mode behavior was to drop packets or override existing tag, depending on the device version. In this patch we fix this behavior by correctly building the HW steering rule with a push vlan action, or for older devices we ask the FW to stack the vlan when a vlan is already present. Fixes: 07bab9502641 ("net/mlx5: E-Switch, Refactor eswitch ingress acl codes") Fixes: dfcb1ed3c331 ("net/mlx5: E-Switch, Vport ingress/egress ACLs rules for VST mode") Signed-off-by: Moshe Shemesh Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/esw/acl/egress_lgcy.c | 7 ++++- .../mellanox/mlx5/core/esw/acl/ingress_lgcy.c | 33 ++++++++++++++++++---- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 30 ++++++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 6 ++++ include/linux/mlx5/device.h | 5 ++++ include/linux/mlx5/mlx5_ifc.h | 3 +- 6 files changed, 68 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c index 60a73990017c..6b4c9ffad95b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c @@ -67,6 +67,7 @@ static void esw_acl_egress_lgcy_groups_destroy(struct mlx5_vport *vport) int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { + bool vst_mode_steering = esw_vst_mode_is_steering(esw); struct mlx5_flow_destination drop_ctr_dst = {}; struct mlx5_flow_destination *dst = NULL; struct mlx5_fc *drop_counter = NULL; @@ -77,6 +78,7 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, */ int table_size = 2; int dest_num = 0; + int actions_flag; int err = 0; if (vport->egress.legacy.drop_counter) { @@ -119,8 +121,11 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, vport->vport, vport->info.vlan, vport->info.qos); /* Allowed vlan rule */ + actions_flag = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + if (vst_mode_steering) + actions_flag |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; err = esw_egress_acl_vlan_create(esw, vport, NULL, vport->info.vlan, - MLX5_FLOW_CONTEXT_ACTION_ALLOW); + actions_flag); if (err) goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c index b1a5199260f6..093ed86a0acd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -139,11 +139,14 @@ static void esw_acl_ingress_lgcy_groups_destroy(struct mlx5_vport *vport) int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { + bool vst_mode_steering = esw_vst_mode_is_steering(esw); struct mlx5_flow_destination drop_ctr_dst = {}; struct mlx5_flow_destination *dst = NULL; struct mlx5_flow_act flow_act = {}; struct mlx5_flow_spec *spec = NULL; struct mlx5_fc *counter = NULL; + bool vst_check_cvlan = false; + bool vst_push_cvlan = false; /* The ingress acl table contains 4 groups * (2 active rules at the same time - * 1 allow rule from one of the first 3 groups. 
@@ -203,7 +206,26 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, goto out; } - if (vport->info.vlan || vport->info.qos) + if ((vport->info.vlan || vport->info.qos)) { + if (vst_mode_steering) + vst_push_cvlan = true; + else if (!MLX5_CAP_ESW(esw->dev, vport_cvlan_insert_always)) + vst_check_cvlan = true; + } + + if (vst_check_cvlan || vport->info.spoofchk) + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + /* Create ingress allow rule */ + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + if (vst_push_cvlan) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH; + flow_act.vlan[0].prio = vport->info.qos; + flow_act.vlan[0].vid = vport->info.vlan; + flow_act.vlan[0].ethtype = ETH_P_8021Q; + } + + if (vst_check_cvlan) MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); @@ -218,9 +240,6 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, ether_addr_copy(smac_v, vport->info.mac); } - /* Create ingress allow rule */ - spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; vport->ingress.allow_rule = mlx5_add_flow_rules(vport->ingress.acl, spec, &flow_act, NULL, 0); if (IS_ERR(vport->ingress.allow_rule)) { @@ -232,6 +251,9 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, goto out; } + if (!vst_check_cvlan && !vport->info.spoofchk) + goto out; + memset(&flow_act, 0, sizeof(flow_act)); flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; /* Attach drop flow counter */ @@ -257,7 +279,8 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, return 0; out: - esw_acl_ingress_lgcy_cleanup(esw, vport); + if (err) + esw_acl_ingress_lgcy_cleanup(esw, vport); kvfree(spec); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 527e4bffda8d..0dfd5742c6fe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -161,10 +161,17 @@ static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u16 vport, esw_vport_context.vport_cvlan_strip, 1); if (set_flags & SET_VLAN_INSERT) { - /* insert only if no vlan in packet */ - MLX5_SET(modify_esw_vport_context_in, in, - esw_vport_context.vport_cvlan_insert, 1); - + if (MLX5_CAP_ESW(dev, vport_cvlan_insert_always)) { + /* insert either if vlan exist in packet or not */ + MLX5_SET(modify_esw_vport_context_in, in, + esw_vport_context.vport_cvlan_insert, + MLX5_VPORT_CVLAN_INSERT_ALWAYS); + } else { + /* insert only if no vlan in packet */ + MLX5_SET(modify_esw_vport_context_in, in, + esw_vport_context.vport_cvlan_insert, + MLX5_VPORT_CVLAN_INSERT_WHEN_NO_CVLAN); + } MLX5_SET(modify_esw_vport_context_in, in, esw_vport_context.cvlan_pcp, qos); MLX5_SET(modify_esw_vport_context_in, in, @@ -809,6 +816,7 @@ out_free: static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { + bool vst_mode_steering = esw_vst_mode_is_steering(esw); u16 vport_num = vport->vport; int flags; int err; @@ -839,8 +847,9 @@ static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) flags = (vport->info.vlan || vport->info.qos) ? 
SET_VLAN_STRIP | SET_VLAN_INSERT : 0; - modify_esw_vport_cvlan(esw->dev, vport_num, vport->info.vlan, - vport->info.qos, flags); + if (esw->mode == MLX5_ESWITCH_OFFLOADS || !vst_mode_steering) + modify_esw_vport_cvlan(esw->dev, vport_num, vport->info.vlan, + vport->info.qos, flags); return 0; @@ -1848,6 +1857,7 @@ int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, u16 vport, u16 vlan, u8 qos, u8 set_flags) { struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + bool vst_mode_steering = esw_vst_mode_is_steering(esw); int err = 0; if (IS_ERR(evport)) @@ -1855,9 +1865,11 @@ int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, if (vlan > 4095 || qos > 7) return -EINVAL; - err = modify_esw_vport_cvlan(esw->dev, vport, vlan, qos, set_flags); - if (err) - return err; + if (esw->mode == MLX5_ESWITCH_OFFLOADS || !vst_mode_steering) { + err = modify_esw_vport_cvlan(esw->dev, vport, vlan, qos, set_flags); + if (err) + return err; + } evport->info.vlan = vlan; evport->info.qos = qos; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 5a85a5d32be7..92644fbb5081 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -527,6 +527,12 @@ int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw, int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, u16 vport, u16 vlan, u8 qos, u8 set_flags); +static inline bool esw_vst_mode_is_steering(struct mlx5_eswitch *esw) +{ + return (MLX5_CAP_ESW_EGRESS_ACL(esw->dev, pop_vlan) && + MLX5_CAP_ESW_INGRESS_ACL(esw->dev, push_vlan)); +} + static inline bool mlx5_eswitch_vlan_actions_supported(struct mlx5_core_dev *dev, u8 vlan_depth) { diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 5fe5d198b57a..29d4b201c7b2 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1090,6 +1090,11 @@ enum { MLX5_VPORT_ADMIN_STATE_AUTO = 0x2, }; +enum { + MLX5_VPORT_CVLAN_INSERT_WHEN_NO_CVLAN = 0x1, + MLX5_VPORT_CVLAN_INSERT_ALWAYS = 0x3, +}; + enum { MLX5_L3_PROT_TYPE_IPV4 = 0, MLX5_L3_PROT_TYPE_IPV6 = 1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f3d1c62c98dd..a9ee7bc59c90 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -913,7 +913,8 @@ struct mlx5_ifc_e_switch_cap_bits { u8 vport_svlan_insert[0x1]; u8 vport_cvlan_insert_if_not_exist[0x1]; u8 vport_cvlan_insert_overwrite[0x1]; - u8 reserved_at_5[0x2]; + u8 reserved_at_5[0x1]; + u8 vport_cvlan_insert_always[0x1]; u8 esw_shared_ingress_acl[0x1]; u8 esw_uplink_ingress_acl[0x1]; u8 root_ft_on_other_esw[0x1]; -- cgit From 2a35b2c2e6a252eda2134aae6a756861d9299531 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 18 Oct 2022 12:51:52 +0200 Subject: net/mlx5: Add forgotten cleanup calls into mlx5_init_once() error path There are two cleanup calls missing in mlx5_init_once() error path. Add them making the error path flow to be the same as mlx5_cleanup_once(). 
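The bug class here is the classic init/cleanup mismatch: an init function that unwinds with goto labels must release resources in exactly the reverse order of acquisition, mirroring its cleanup counterpart. The following stand-alone sketch (illustrative only, with made-up names rather than the actual mlx5 functions) shows the pattern the fix restores:

#include <stdio.h>

struct example_dev { int dummy; };

static int  example_init_clock(struct example_dev *d)    { (void)d; return 0; }
static void example_cleanup_clock(struct example_dev *d) { (void)d; }
static int  example_init_gids(struct example_dev *d)     { (void)d; return 0; }
static void example_cleanup_gids(struct example_dev *d)  { (void)d; }
static int  example_init_tables(struct example_dev *d)   { (void)d; return -1; } /* force the error path */

static int example_init_once(struct example_dev *dev)
{
	int err;

	err = example_init_clock(dev);
	if (err)
		return err;

	err = example_init_gids(dev);
	if (err)
		goto err_clock;

	err = example_init_tables(dev);
	if (err)
		goto err_gids;

	return 0;

err_gids:
	example_cleanup_gids(dev);	/* the kind of call the fix adds back */
err_clock:
	example_cleanup_clock(dev);
	return err;
}

int main(void)
{
	struct example_dev dev;

	printf("example_init_once() returned %d\n", example_init_once(&dev));
	return 0;
}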
Fixes: 52ec462eca9b ("net/mlx5: Add reserved-gids support") Fixes: 7c39afb394c7 ("net/mlx5: PTP code migration to driver core section") Signed-off-by: Jiri Pirko Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 7f5db13e3550..ec5652f31dda 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1050,6 +1050,8 @@ err_rl_cleanup: err_tables_cleanup: mlx5_geneve_destroy(dev->geneve); mlx5_vxlan_destroy(dev->vxlan); + mlx5_cleanup_clock(dev); + mlx5_cleanup_reserved_gids(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_fw_reset_cleanup(dev); err_events_cleanup: -- cgit From 44aee8ea15ac205490a41b00cbafcccbf9f7f82b Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 18 Dec 2022 12:42:14 +0200 Subject: net/mlx5: Fix io_eq_size and event_eq_size params validation io_eq_size and event_eq_size params are of param type DEVLINK_PARAM_TYPE_U32. But, the validation callback is addressing them as DEVLINK_PARAM_TYPE_U16. This causes a validation mismatch on big-endian systems, in which values in range were rejected while 268500991 was accepted. Fix it by checking the U32 value in the validation callback. Fixes: 0844fa5f7b89 ("net/mlx5: Let user configure io_eq_size param") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index ddb197970c22..be59bb35d795 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -563,7 +563,7 @@ static int mlx5_devlink_eq_depth_validate(struct devlink *devlink, u32 id, union devlink_param_value val, struct netlink_ext_ack *extack) { - return (val.vu16 >= 64 && val.vu16 <= 4096) ? 0 : -EINVAL; + return (val.vu32 >= 64 && val.vu32 <= 4096) ? 0 : -EINVAL; } static const struct devlink_param mlx5_devlink_params[] = { -- cgit From 9078e843efec530f279a155f262793c58b0746bd Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 24 Nov 2022 13:34:12 +0200 Subject: net/mlx5: Avoid recovery in probe flows Currently, recovery is done without considering whether the device is still in the probe flow. This may lead to recovery running before the device has finished probing successfully. e.g.: while mlx5_init_one() is running. The recovery flow uses functionality that is loaded only by mlx5_init_one(), and there is no point in running recovery unless mlx5_init_one() has finished successfully. Fix it by waiting for the probe flow to finish and checking whether the device is probed before trying to perform recovery.
Fixes: 51d138c2610a ("net/mlx5: Fix health error state handling") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 86ed87d704f7..96417c5feed7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -674,6 +674,12 @@ static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work) dev = container_of(priv, struct mlx5_core_dev, priv); devlink = priv_to_devlink(dev); + mutex_lock(&dev->intf_state_mutex); + if (test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) { + mlx5_core_err(dev, "health works are not permitted at this stage\n"); + return; + } + mutex_unlock(&dev->intf_state_mutex); enter_error_state(dev, false); if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) { devl_lock(devlink); -- cgit From c4ad5f2bdad56265b23d3635494ecdb205431807 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 9 Nov 2022 14:42:59 +0200 Subject: net/mlx5: Fix RoCE setting at HCA level mlx5 PF can disable RoCE for its VFs and SFs. In such case RoCE is marked as unsupported on those VFs/SFs. The cited patch added an option for disable (and enable) RoCE at HCA level. However, that commit didn't check whether RoCE is supported on the HCA and enabled user to try and set RoCE to on. Fix it by checking whether the HCA supports RoCE. Fixes: fbfa97b4d79f ("net/mlx5: Disable roce at HCA level") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index be59bb35d795..5bd83c0275f8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -468,7 +468,7 @@ static int mlx5_devlink_enable_roce_validate(struct devlink *devlink, u32 id, bool new_state = val.vbool; if (new_state && !MLX5_CAP_GEN(dev, roce) && - !MLX5_CAP_GEN(dev, roce_rw_supported)) { + !(MLX5_CAP_GEN(dev, roce_rw_supported) && MLX5_CAP_GEN_MAX(dev, roce))) { NL_SET_ERR_MSG_MOD(extack, "Device doesn't support RoCE"); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ec5652f31dda..df134f6d32dc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -613,7 +613,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) MLX5_SET(cmd_hca_cap, set_hca_cap, num_total_dynamic_vf_msix, MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix)); - if (MLX5_CAP_GEN(dev, roce_rw_supported)) + if (MLX5_CAP_GEN(dev, roce_rw_supported) && MLX5_CAP_GEN_MAX(dev, roce)) MLX5_SET(cmd_hca_cap, set_hca_cap, roce, mlx5_is_roce_on(dev)); -- cgit From b12d581e83e3ae1080c32ab83f123005bd89a840 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 28 Nov 2022 15:24:21 +0200 Subject: net/mlx5e: IPoIB, Don't allow CQE compression to be turned on by default mlx5e_build_nic_params will turn CQE compression on if the hardware capability is enabled and the slow_pci_heuristic condition is detected. 
As IPoIB doesn't support CQE compression, make sure to disable the feature in the IPoIB profile init. Please note that the feature is not exposed to the user for IPoIB interfaces, so it can't be subsequently turned on. Fixes: b797a684b0dd ("net/mlx5e: Enable CQE compression when PCI is slower than link") Signed-off-by: Dragos Tatulea Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 7c5c500fd215..2c73c8445e63 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -71,6 +71,10 @@ static void mlx5i_build_nic_params(struct mlx5_core_dev *mdev, params->packet_merge.type = MLX5E_PACKET_MERGE_NONE; params->hard_mtu = MLX5_IB_GRH_BYTES + MLX5_IPOIB_HARD_LEN; params->tunneled_offload_en = false; + + /* CQE compression is not supported for IPoIB */ + params->rx_cqe_compress_def = false; + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS, params->rx_cqe_compress_def); } /* Called directly after IPoIB netdevice was created to initialize SW structs */ -- cgit From f8c18a5749cf917096f75dd59885b7a0fe9298ba Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 27 Nov 2022 09:21:28 +0200 Subject: net/mlx5e: Fix RX reporter for XSK RQs RX reporter mistakenly reads from the regular (inactive) RQ when XSK RQ is active. Fix it here. Fixes: 3db4c85cde7a ("net/mlx5e: xsk: Use queue indices starting from 0 for XSK queues") Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index 5f6f95ad6888..1ae15b8536a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -459,7 +459,11 @@ static int mlx5e_rx_reporter_diagnose(struct devlink_health_reporter *reporter, goto unlock; for (i = 0; i < priv->channels.num; i++) { - struct mlx5e_rq *rq = &priv->channels.c[i]->rq; + struct mlx5e_channel *c = priv->channels.c[i]; + struct mlx5e_rq *rq; + + rq = test_bit(MLX5E_CHANNEL_STATE_XSK, c->state) ? + &c->xskrq : &c->rq; err = mlx5e_rx_reporter_build_diagnose_output(rq, fmsg); if (err) -- cgit From 849190e3e4ccf452fbe2240eace30a9ca83fb8d2 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Mon, 28 Nov 2022 13:54:29 +0800 Subject: net/mlx5e: CT: Fix ct debugfs folder name Need to use sprintf to build a string instead of sscanf. Otherwise dirname is null and both "ct_nic" and "ct_fdb" won't be created. But its redundant anyway as driver could be in switchdev mode but still add nic rules. So use "ct" as folder name. 
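The root cause is easy to reproduce in plain user-space C: sscanf() parses from its first argument, while sprintf() writes into it, so building a directory name with sscanf() on an empty buffer can never work. A minimal illustration (not kernel code):

#include <stdio.h>

int main(void)
{
	char dirname[16] = {};
	int is_fdb = 1;

	/* sprintf() writes INTO dirname - this is what the original code intended */
	sprintf(dirname, "ct_%s", is_fdb ? "fdb" : "nic");
	printf("built: '%s'\n", dirname);   /* built: 'ct_fdb' */

	/* sscanf(dirname, "ct_%s", ...) would instead try to parse FROM dirname,
	 * which starts out empty, so it can never produce a name - hence the
	 * missing debugfs directories described above. */
	return 0;
}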
Fixes: 77422a8f6f61 ("net/mlx5e: CT: Add ct driver counters") Signed-off-by: Chris Mi Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index a69849e0deed..313df8232db7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -2103,14 +2103,9 @@ out_err: static void mlx5_ct_tc_create_dbgfs(struct mlx5_tc_ct_priv *ct_priv) { - bool is_fdb = ct_priv->ns_type == MLX5_FLOW_NAMESPACE_FDB; struct mlx5_tc_ct_debugfs *ct_dbgfs = &ct_priv->debugfs; - char dirname[16] = {}; - if (sscanf(dirname, "ct_%s", is_fdb ? "fdb" : "nic") < 0) - return; - - ct_dbgfs->root = debugfs_create_dir(dirname, mlx5_debugfs_get_dev_root(ct_priv->dev)); + ct_dbgfs->root = debugfs_create_dir("ct", mlx5_debugfs_get_dev_root(ct_priv->dev)); debugfs_create_atomic_t("offloaded", 0400, ct_dbgfs->root, &ct_dbgfs->stats.offloaded); debugfs_create_atomic_t("rx_dropped", 0400, ct_dbgfs->root, -- cgit From 2951b2e142ecf6e0115df785ba91e91b6da74602 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Mon, 5 Dec 2022 09:22:50 +0800 Subject: net/mlx5e: Always clear dest encap in neigh-update-del The cited commit introduced a bug for multiple encapsulations flow. If one dest encap becomes invalid, the flow is set slow path flag. But when other dests encap become invalid, they are not cleared due to slow path flag of the flow. When neigh-update-add is running, it will use invalid encap. Fix it by checking slow path flag after clearing dest encap. Fixes: 9a5f9cc794e1 ("net/mlx5e: Fix possible use-after-free deleting fdb rule") Signed-off-by: Chris Mi Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index ff73d25bc6eb..2aaf8ab857b8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -222,7 +222,7 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, int err; list_for_each_entry(flow, flow_list, tmp_list) { - if (!mlx5e_is_offloaded_flow(flow) || flow_flag_test(flow, SLOW)) + if (!mlx5e_is_offloaded_flow(flow)) continue; attr = mlx5e_tc_get_encap_attr(flow); @@ -231,6 +231,13 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, esw_attr->dests[flow->tmp_entry_index].flags &= ~MLX5_ESW_DEST_ENCAP_VALID; esw_attr->dests[flow->tmp_entry_index].pkt_reformat = NULL; + /* Clear pkt_reformat before checking slow path flag. Because + * in next iteration, the same flow is already set slow path + * flag, but still need to clear the pkt_reformat. 
+ */ + if (flow_flag_test(flow, SLOW)) + continue; + /* update from encap rule to slow path rule */ spec = &flow->attr->parse_attr->spec; rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec); -- cgit From 1e267ab88dc44c48f556218f7b7f14c76f7aa066 Mon Sep 17 00:00:00 2001 From: Adham Faris Date: Wed, 14 Dec 2022 16:02:57 +0200 Subject: net/mlx5e: Fix hw mtu initializing at XDP SQ allocation Current xdp xmit functions logic (mlx5e_xmit_xdp_frame_mpwqe or mlx5e_xmit_xdp_frame), validates xdp packet length by comparing it to hw mtu (configured at xdp sq allocation) before xmiting it. This check does not account for ethernet fcs length (calculated and filled by the nic). Hence, when we try sending packets with length > (hw-mtu - ethernet-fcs-size), the device port drops it and tx_errors_phy is incremented. Desired behavior is to catch these packets and drop them by the driver. Fix this behavior in XDP SQ allocation function (mlx5e_alloc_xdpsq) by subtracting ethernet FCS header size (4 Bytes) from current hw mtu value, since ethernet FCS is calculated and written to ethernet frames by the nic. Fixes: d8bec2b29a82 ("net/mlx5e: Support bpf_xdp_adjust_head()") Signed-off-by: Adham Faris Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 8d36e2de53a9..cff5f2e29e1e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1305,7 +1305,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->channel = c; sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; - sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu) - ETH_FCS_LEN; sq->xsk_pool = xsk_pool; sq->stats = sq->xsk_pool ? -- cgit From e54638a8380bd9c146a883035fffd0a821813682 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Sun, 1 Aug 2021 14:45:17 +0300 Subject: net/mlx5e: Set geneve_tlv_option_0_exist when matching on geneve option The cited patch added support of matching on geneve option by setting geneve_tlv_option_0_data mask and key but didn't set geneve_tlv_option_0_exist bit which is required on some HWs when matching geneve_tlv_option_0_data parameter, this may cause in some cases for packets to wrongly match on rules with different geneve option. Example of such case is packet with geneve_tlv_object class=789 and data=456 will wrongly match on rule with match geneve_tlv_object class=123 and data=456. Fix it by setting geneve_tlv_option_0_exist bit when supported by the HW when matching on geneve_tlv_option_0_data parameter. 
Fixes: 9272e3df3023 ("net/mlx5e: Geneve, Add support for encap/decap flows offload") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c index f5b26f5a7de4..054d80c4e65c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c @@ -273,6 +273,11 @@ static int mlx5e_tc_tun_parse_geneve_options(struct mlx5e_priv *priv, geneve_tlv_option_0_data, be32_to_cpu(opt_data_key)); MLX5_SET(fte_match_set_misc3, misc_3_c, geneve_tlv_option_0_data, be32_to_cpu(opt_data_mask)); + if (MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.geneve_tlv_option_0_exist)) { + MLX5_SET_TO_ONES(fte_match_set_misc, misc_c, geneve_tlv_option_0_exist); + MLX5_SET_TO_ONES(fte_match_set_misc, misc_v, geneve_tlv_option_0_exist); + } spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_3; -- cgit From 4d1c1379d71777ddeda3e54f8fc26e9ecbfd1009 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 15 Dec 2022 14:28:34 +0200 Subject: net/mlx5: Lag, fix failure to cancel delayed bond work Commit 0d4e8ed139d8 ("net/mlx5: Lag, avoid lockdep warnings") accidentally removed a call to cancel delayed bond work thus it may cause queued delay to expire and fall on an already destroyed work queue. Fix by restoring the call cancel_delayed_work_sync() before destroying the workqueue. This prevents call trace such as this: [ 329.230417] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 329.231444] #PF: supervisor write access in kernel mode [ 329.232233] #PF: error_code(0x0002) - not-present page [ 329.233007] PGD 0 P4D 0 [ 329.233476] Oops: 0002 [#1] SMP [ 329.234012] CPU: 5 PID: 145 Comm: kworker/u20:4 Tainted: G OE 6.0.0-rc5_mlnx #1 [ 329.235282] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 329.236868] Workqueue: mlx5_cmd_0000:08:00.1 cmd_work_handler [mlx5_core] [ 329.237886] RIP: 0010:_raw_spin_lock+0xc/0x20 [ 329.238585] Code: f0 0f b1 17 75 02 f3 c3 89 c6 e9 6f 3c 5f ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00 31 c0 ba 01 00 00 00 0f b1 17 75 02 f3 c3 89 c6 e9 45 3c 5f ff 0f 1f 44 00 00 0f 1f [ 329.241156] RSP: 0018:ffffc900001b0e98 EFLAGS: 00010046 [ 329.241940] RAX: 0000000000000000 RBX: ffffffff82374ae0 RCX: 0000000000000000 [ 329.242954] RDX: 0000000000000001 RSI: 0000000000000014 RDI: 0000000000000000 [ 329.243974] RBP: ffff888106ccf000 R08: ffff8881004000c8 R09: ffff888100400000 [ 329.244990] R10: 0000000000000000 R11: ffffffff826669f8 R12: 0000000000002000 [ 329.246009] R13: 0000000000000005 R14: ffff888100aa7ce0 R15: ffff88852ca80000 [ 329.247030] FS: 0000000000000000(0000) GS:ffff88852ca80000(0000) knlGS:0000000000000000 [ 329.248260] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 329.249111] CR2: 0000000000000000 CR3: 000000016d675001 CR4: 0000000000770ee0 [ 329.250133] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 329.251152] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 329.252176] PKRU: 55555554 Fixes: 0d4e8ed139d8 ("net/mlx5: Lag, avoid lockdep warnings") Signed-off-by: Eli Cohen Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 1 + 1 file changed, 1 
insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 32c3e0a649a7..ad32b80e8501 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -228,6 +228,7 @@ static void mlx5_ldev_free(struct kref *ref) if (ldev->nb.notifier_call) unregister_netdevice_notifier_net(&init_net, &ldev->nb); mlx5_lag_mp_cleanup(ldev); + cancel_delayed_work_sync(&ldev->bond_work); destroy_workqueue(ldev->wq); mlx5_lag_mpesw_cleanup(ldev); mutex_destroy(&ldev->lock); -- cgit From 9ed1d9aeef5842ecacb660fce933613b58af1e00 Mon Sep 17 00:00:00 2001 From: Chuang Wang Date: Sat, 24 Dec 2022 21:31:46 +0800 Subject: bpf: Fix panic due to wrong pageattr of im->image In the scenario where livepatch and kretfunc coexist, the pageattr of im->image is rox after arch_prepare_bpf_trampoline in bpf_trampoline_update, and then modify_fentry or register_fentry returns -EAGAIN from bpf_tramp_ftrace_ops_func, the BPF_TRAMP_F_ORIG_STACK flag will be configured, and arch_prepare_bpf_trampoline will be re-executed. At this time, because the pageattr of im->image is rox, arch_prepare_bpf_trampoline will read and write im->image, which causes a fault. as follows: insmod livepatch-sample.ko # samples/livepatch/livepatch-sample.c bpftrace -e 'kretfunc:cmdline_proc_show {}' BUG: unable to handle page fault for address: ffffffffa0206000 PGD 322d067 P4D 322d067 PUD 322e063 PMD 1297e067 PTE d428061 Oops: 0003 [#1] PREEMPT SMP PTI CPU: 2 PID: 270 Comm: bpftrace Tainted: G E K 6.1.0 #5 RIP: 0010:arch_prepare_bpf_trampoline+0xed/0x8c0 RSP: 0018:ffffc90001083ad8 EFLAGS: 00010202 RAX: ffffffffa0206000 RBX: 0000000000000020 RCX: 0000000000000000 RDX: ffffffffa0206001 RSI: ffffffffa0206000 RDI: 0000000000000030 RBP: ffffc90001083b70 R08: 0000000000000066 R09: ffff88800f51b400 R10: 000000002e72c6e5 R11: 00000000d0a15080 R12: ffff8880110a68c8 R13: 0000000000000000 R14: ffff88800f51b400 R15: ffffffff814fec10 FS: 00007f87bc0dc780(0000) GS:ffff88803e600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffa0206000 CR3: 0000000010b70000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: bpf_trampoline_update+0x25a/0x6b0 __bpf_trampoline_link_prog+0x101/0x240 bpf_trampoline_link_prog+0x2d/0x50 bpf_tracing_prog_attach+0x24c/0x530 bpf_raw_tp_link_attach+0x73/0x1d0 __sys_bpf+0x100e/0x2570 __x64_sys_bpf+0x1c/0x30 do_syscall_64+0x5b/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd With this patch, when modify_fentry or register_fentry returns -EAGAIN from bpf_tramp_ftrace_ops_func, the pageattr of im->image will be reset to nx+rw. Cc: stable@vger.kernel.org Fixes: 00963a2e75a8 ("bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. 
livepatch)") Signed-off-by: Chuang Wang Acked-by: Jiri Olsa Acked-by: Song Liu Link: https://lore.kernel.org/r/20221224133146.780578-1-nashuiliang@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 11f5ec0b8016..d0ed7d6f5eec 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -488,6 +488,10 @@ again: /* reset fops->func and fops->trampoline for re-register */ tr->fops->func = NULL; tr->fops->trampoline = 0; + + /* reset im->image memory attr for arch_prepare_bpf_trampoline */ + set_memory_nx((long)im->image, 1); + set_memory_rw((long)im->image, 1); goto again; } #endif -- cgit From 8f161ca1105a6af6614333f13aa7be4aab8b633a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 28 Dec 2022 13:55:57 -0800 Subject: selftests/bpf: Temporarily disable part of btf_dump:var_data test. Commit 7443b296e699 ("x86/percpu: Move cpu_number next to current_task") moved global per_cpu variable 'cpu_number' into pcpu_hot structure. Therefore this part of var_data test is no longer valid. Disable it until better solution is found. Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 0ba2e8b9c6ac..e9ea38aa8248 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -801,7 +801,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, char *str) { -#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) +#if 0 TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, "int cpu_number = (int)100", 100); #endif -- cgit From 7ff94f276f8ea05df82eb115225e9b26f47a3347 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Fri, 16 Dec 2022 14:18:54 -0800 Subject: bpf: keep a reference to the mm, in case the task is dead. Fix the system crash that happens when a task iterator travel through vma of tasks. In task iterators, we used to access mm by following the pointer on the task_struct; however, the death of a task will clear the pointer, even though we still hold the task_struct. That can cause an unexpected crash for a null pointer when an iterator is visiting a task that dies during the visit. Keeping a reference of mm on the iterator ensures we always have a valid pointer to mm. 
Co-developed-by: Song Liu Signed-off-by: Song Liu Signed-off-by: Kui-Feng Lee Reported-by: Nathan Slingerland Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221216221855.4122288-2-kuifeng@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/task_iter.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index c2a2182ce570..c4ab9d6cdbe9 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -438,6 +438,7 @@ struct bpf_iter_seq_task_vma_info { */ struct bpf_iter_seq_task_common common; struct task_struct *task; + struct mm_struct *mm; struct vm_area_struct *vma; u32 tid; unsigned long prev_vm_start; @@ -456,16 +457,19 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) enum bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; + struct mm_struct *curr_mm; u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to - * the task_struct, and holds read lock on vma->mm->mmap_lock. + * the task_struct, holds a refcount on mm->mm_users, and holds + * read lock on vma->mm->mmap_lock. * If this function returns NULL, it does not hold any reference or * lock. */ if (info->task) { curr_task = info->task; curr_vma = info->vma; + curr_mm = info->mm; /* In case of lock contention, drop mmap_lock to unblock * the writer. * @@ -504,13 +508,15 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) * 4.2) VMA2 and VMA2' covers different ranges, process * VMA2'. */ - if (mmap_lock_is_contended(curr_task->mm)) { + if (mmap_lock_is_contended(curr_mm)) { info->prev_vm_start = curr_vma->vm_start; info->prev_vm_end = curr_vma->vm_end; op = task_vma_iter_find_vma; - mmap_read_unlock(curr_task->mm); - if (mmap_read_lock_killable(curr_task->mm)) + mmap_read_unlock(curr_mm); + if (mmap_read_lock_killable(curr_mm)) { + mmput(curr_mm); goto finish; + } } else { op = task_vma_iter_next_vma; } @@ -535,42 +541,47 @@ again: op = task_vma_iter_find_vma; } - if (!curr_task->mm) + curr_mm = get_task_mm(curr_task); + if (!curr_mm) goto next_task; - if (mmap_read_lock_killable(curr_task->mm)) + if (mmap_read_lock_killable(curr_mm)) { + mmput(curr_mm); goto finish; + } } switch (op) { case task_vma_iter_first_vma: - curr_vma = find_vma(curr_task->mm, 0); + curr_vma = find_vma(curr_mm, 0); break; case task_vma_iter_next_vma: - curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); + curr_vma = find_vma(curr_mm, curr_vma->vm_end); break; case task_vma_iter_find_vma: /* We dropped mmap_lock so it is necessary to use find_vma * to find the next vma. This is similar to the mechanism * in show_smaps_rollup(). 
*/ - curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1); + curr_vma = find_vma(curr_mm, info->prev_vm_end - 1); /* case 1) and 4.2) above just use curr_vma */ /* check for case 2) or case 4.1) above */ if (curr_vma && curr_vma->vm_start == info->prev_vm_start && curr_vma->vm_end == info->prev_vm_end) - curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); + curr_vma = find_vma(curr_mm, curr_vma->vm_end); break; } if (!curr_vma) { /* case 3) above, or case 2) 4.1) with vma->next == NULL */ - mmap_read_unlock(curr_task->mm); + mmap_read_unlock(curr_mm); + mmput(curr_mm); goto next_task; } info->task = curr_task; info->vma = curr_vma; + info->mm = curr_mm; return curr_vma; next_task: @@ -579,6 +590,7 @@ next_task: put_task_struct(curr_task); info->task = NULL; + info->mm = NULL; info->tid++; goto again; @@ -587,6 +599,7 @@ finish: put_task_struct(curr_task); info->task = NULL; info->vma = NULL; + info->mm = NULL; return NULL; } @@ -658,7 +671,9 @@ static void task_vma_seq_stop(struct seq_file *seq, void *v) */ info->prev_vm_start = ~0UL; info->prev_vm_end = info->vma->vm_end; - mmap_read_unlock(info->task->mm); + mmap_read_unlock(info->mm); + mmput(info->mm); + info->mm = NULL; put_task_struct(info->task); info->task = NULL; } -- cgit From b7793c8db7d9beb903bb42f52872b5b46abdcb88 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Fri, 16 Dec 2022 14:18:55 -0800 Subject: selftests/bpf: add a test for iter/task_vma for short-lived processes When a task iterator traverses vma(s), it is possible task->mm might become invalid in the middle of traversal and this may cause kernel misbehave (e.g., crash) This test case creates iterators repeatedly and forks short-lived processes in the background to detect this bug. The test will last for 3 seconds to get the chance to trigger the issue. Signed-off-by: Kui-Feng Lee Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221216221855.4122288-3-kuifeng@meta.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 73 +++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 6f8ed61fc4b4..3af6450763e9 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -1465,6 +1465,77 @@ out: bpf_iter_task_vma__destroy(skel); } +static void test_task_vma_dead_task(void) +{ + struct bpf_iter_task_vma *skel; + int wstatus, child_pid = -1; + time_t start_tm, cur_tm; + int err, iter_fd = -1; + int wait_sec = 3; + + skel = bpf_iter_task_vma__open(); + if (!ASSERT_OK_PTR(skel, "bpf_iter_task_vma__open")) + return; + + skel->bss->pid = getpid(); + + err = bpf_iter_task_vma__load(skel); + if (!ASSERT_OK(err, "bpf_iter_task_vma__load")) + goto out; + + skel->links.proc_maps = bpf_program__attach_iter( + skel->progs.proc_maps, NULL); + + if (!ASSERT_OK_PTR(skel->links.proc_maps, "bpf_program__attach_iter")) { + skel->links.proc_maps = NULL; + goto out; + } + + start_tm = time(NULL); + cur_tm = start_tm; + + child_pid = fork(); + if (child_pid == 0) { + /* Fork short-lived processes in the background. 
*/ + while (cur_tm < start_tm + wait_sec) { + system("echo > /dev/null"); + cur_tm = time(NULL); + } + exit(0); + } + + if (!ASSERT_GE(child_pid, 0, "fork_child")) + goto out; + + while (cur_tm < start_tm + wait_sec) { + iter_fd = bpf_iter_create(bpf_link__fd(skel->links.proc_maps)); + if (!ASSERT_GE(iter_fd, 0, "create_iter")) + goto out; + + /* Drain all data from iter_fd. */ + while (cur_tm < start_tm + wait_sec) { + err = read_fd_into_buffer(iter_fd, task_vma_output, CMP_BUFFER_SIZE); + if (!ASSERT_GE(err, 0, "read_iter_fd")) + goto out; + + cur_tm = time(NULL); + + if (err == 0) + break; + } + + close(iter_fd); + iter_fd = -1; + } + + check_bpf_link_info(skel->progs.proc_maps); + +out: + waitpid(child_pid, &wstatus, 0); + close(iter_fd); + bpf_iter_task_vma__destroy(skel); +} + void test_bpf_sockmap_map_iter_fd(void) { struct bpf_iter_sockmap *skel; @@ -1586,6 +1657,8 @@ void test_bpf_iter(void) test_task_file(); if (test__start_subtest("task_vma")) test_task_vma(); + if (test__start_subtest("task_vma_dead_task")) + test_task_vma_dead_task(); if (test__start_subtest("task_btf")) test_task_btf(); if (test__start_subtest("tcp4")) -- cgit From 45435d8da71f9f3e6860e6e6ea9667b6ec17ec64 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Dec 2022 10:28:44 -0800 Subject: bpf: Always use maximal size for copy_array() Instead of counting on prior allocations to have sized allocations to the next kmalloc bucket size, always perform a krealloc that is at least ksize(dst) in size (which is a no-op), so the size can be correctly tracked by all the various allocation size trackers (KASAN, __alloc_size, etc). Reported-by: Hyunwoo Kim Link: https://lore.kernel.org/bpf/20221223094551.GA1439509@ubuntu Fixes: ceb35b666d42 ("bpf/verifier: Use kmalloc_size_roundup() to match ksize() usage") Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: John Fastabend Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: KP Singh Cc: Stanislav Fomichev Cc: Hao Luo Cc: Jiri Olsa Cc: bpf@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221223182836.never.866-kees@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 243d06ce6842..85f96c1e9f62 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1054,6 +1054,8 @@ static void print_insn_state(struct bpf_verifier_env *env, */ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags) { + size_t alloc_bytes; + void *orig = dst; size_t bytes; if (ZERO_OR_NULL_PTR(src)) @@ -1062,11 +1064,11 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - if (ksize(dst) < ksize(src)) { - kfree(dst); - dst = kmalloc_track_caller(kmalloc_size_roundup(bytes), flags); - if (!dst) - return NULL; + alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes)); + dst = krealloc(orig, alloc_bytes, flags); + if (!dst) { + kfree(orig); + return NULL; } memcpy(dst, src, bytes); -- cgit From da8daff9405e55baa1f797b77a7c629a89f4d764 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Sat, 17 Dec 2022 11:21:48 +0530 Subject: kconfig: Add static text for search information in help menu Add few static text to explain how one can bring up the search dialog box by pressing the forward slash key anywhere on this interface. 
Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Masahiro Yamada --- scripts/kconfig/mconf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/kconfig/mconf.c b/scripts/kconfig/mconf.c index 9c549683c627..e67e0db50b2e 100644 --- a/scripts/kconfig/mconf.c +++ b/scripts/kconfig/mconf.c @@ -161,6 +161,12 @@ static const char mconf_readme[] = "(especially with a larger number of unrolled categories) than the\n" "default mode.\n" "\n" + +"Search\n" +"-------\n" +"Pressing the forward-slash (/) anywhere brings up a search dialog box.\n" +"\n" + "Different color themes available\n" "--------------------------------\n" "It is possible to select different color themes using the variable\n" -- cgit From 56c5dab20a6391604df9521f812c01d1e3fe1bd0 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 12 Dec 2022 13:04:11 +0100 Subject: RDMA/srp: Move large values to a new enum for gcc13 Since gcc13, each member of an enum has the same type as the enum [1]. And that is inherited from its members. Provided these two: SRP_TAG_NO_REQ = ~0U, SRP_TAG_TSK_MGMT = 1U << 31 all other members are unsigned ints. Esp. with SRP_MAX_SGE and SRP_TSK_MGMT_SQ_SIZE and their use in min(), this results in the following warnings: include/linux/minmax.h:20:35: error: comparison of distinct pointer types lacks a cast drivers/infiniband/ulp/srp/ib_srp.c:563:42: note: in expansion of macro 'min' include/linux/minmax.h:20:35: error: comparison of distinct pointer types lacks a cast drivers/infiniband/ulp/srp/ib_srp.c:2369:27: note: in expansion of macro 'min' So move the large values away to a separate enum, so that they don't affect other members. [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36113 Link: https://lore.kernel.org/r/20221212120411.13750-1-jirislaby@kernel.org Signed-off-by: Jiri Slaby (SUSE) Reviewed-by: Bart Van Assche Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/srp/ib_srp.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 00b0068fda20..5d94db453df3 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -62,9 +62,6 @@ enum { SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE, - SRP_TAG_NO_REQ = ~0U, - SRP_TAG_TSK_MGMT = 1U << 31, - SRP_MAX_PAGES_PER_MR = 512, SRP_MAX_ADD_CDB_LEN = 16, @@ -79,6 +76,11 @@ enum { sizeof(struct srp_imm_buf), }; +enum { + SRP_TAG_NO_REQ = ~0U, + SRP_TAG_TSK_MGMT = BIT(31), +}; + enum srp_target_state { SRP_TARGET_SCANNING, SRP_TARGET_LIVE, -- cgit From 936a192f974018b4f6040f6f77b1cc1e75bd8666 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 26 Dec 2022 22:27:52 +0900 Subject: tcp: Add TIME_WAIT sockets in bhash2. Jiri Slaby reported regression of bind() with a simple repro. [0] The repro creates a TIME_WAIT socket and tries to bind() a new socket with the same local address and port. Before commit 28044fc1d495 ("net: Add a bhash2 table hashed by port and address"), the bind() failed with -EADDRINUSE, but now it succeeds. The cited commit should have put TIME_WAIT sockets into bhash2; otherwise, inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind() requests if the address is not a wildcard one. The straight option is to move sk_bind2_node from struct sock to struct sock_common to add twsk to bhash2 as implemented as RFC. 
[1] However, the binary layout change in the struct sock could affect performances moving hot fields on different cachelines. To avoid that, we add another TIME_WAIT list in inet_bind2_bucket and check it while validating bind(). [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/ [1]: https://lore.kernel.org/netdev/20221221151258.25748-2-kuniyu@amazon.com/ Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Reported-by: Jiri Slaby Suggested-by: Paolo Abeni Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 4 ++++ include/net/inet_timewait_sock.h | 5 +++++ net/ipv4/inet_connection_sock.c | 26 ++++++++++++++++++++++---- net/ipv4/inet_hashtables.c | 8 +++++--- net/ipv4/inet_timewait_sock.c | 31 +++++++++++++++++++++++++++++-- 5 files changed, 65 insertions(+), 9 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 69174093078f..99bd823e97f6 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -108,6 +108,10 @@ struct inet_bind2_bucket { struct hlist_node node; /* List of sockets hashed to this bucket */ struct hlist_head owners; + /* bhash has twsk in owners, but bhash2 has twsk in + * deathrow not to add a member in struct sock_common. + */ + struct hlist_head deathrow; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 5b47545f22d3..4a8e578405cb 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -73,9 +73,14 @@ struct inet_timewait_sock { u32 tw_priority; struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; + struct inet_bind2_bucket *tw_tb2; + struct hlist_node tw_bind2_node; }; #define tw_tclass tw_tos +#define twsk_for_each_bound_bhash2(__tw, list) \ + hlist_for_each_entry(__tw, list, tw_bind2_node) + static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) { return (struct inet_timewait_sock *)sk; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b366ab9148f2..848ffc3e0239 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -173,22 +173,40 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, return false; } +static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, + kuid_t sk_uid, bool relax, + bool reuseport_cb_ok, bool reuseport_ok) +{ + if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) + return false; + + return inet_bind_conflict(sk, sk2, sk_uid, relax, + reuseport_cb_ok, reuseport_ok); +} + static bool inet_bhash2_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { + struct inet_timewait_sock *tw2; struct sock *sk2; sk_for_each_bound_bhash2(sk2, &tb2->owners) { - if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) - continue; + if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, + reuseport_cb_ok, reuseport_ok)) + return true; + } - if (inet_bind_conflict(sk, sk2, sk_uid, relax, - reuseport_cb_ok, reuseport_ok)) + twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) { + sk2 = (struct sock *)tw2; + + if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, + reuseport_cb_ok, reuseport_ok)) return true; } + return false; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d039b4e732a3..24a38b56fab9 100644 --- 
a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -116,6 +116,7 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, #endif tb->rcv_saddr = sk->sk_rcv_saddr; INIT_HLIST_HEAD(&tb->owners); + INIT_HLIST_HEAD(&tb->deathrow); hlist_add_head(&tb->node, &head->chain); } @@ -137,7 +138,7 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { - if (hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) { __hlist_del(&tb->node); kmem_cache_free(cachep, tb); } @@ -1103,15 +1104,16 @@ ok: /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); - spin_unlock(&head2->lock); - if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); + + spin_unlock(&head2->lock); spin_unlock(&head->lock); + if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 66fc940f9521..1d77d992e6e7 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -29,6 +29,7 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { + struct inet_bind2_bucket *tb2 = tw->tw_tb2; struct inet_bind_bucket *tb = tw->tw_tb; if (!tb) @@ -37,6 +38,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + + __hlist_del(&tw->tw_bind2_node); + tw->tw_tb2 = NULL; + inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); + __sock_put((struct sock *)tw); } @@ -45,7 +51,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); - struct inet_bind_hashbucket *bhead; + struct inet_bind_hashbucket *bhead, *bhead2; spin_lock(lock); sk_nulls_del_node_init_rcu((struct sock *)tw); @@ -54,9 +60,13 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) /* Disassociate with bind bucket. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, hashinfo->bhash_size)]; + bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw, + twsk_net(tw), tw->tw_num); spin_lock(&bhead->lock); + spin_lock(&bhead2->lock); inet_twsk_bind_unhash(tw, hashinfo); + spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); refcount_dec(&tw->tw_dr->tw_refcount); @@ -93,6 +103,12 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, hlist_add_head(&tw->tw_bind_node, list); } +static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw, + struct hlist_head *list) +{ + hlist_add_head(&tw->tw_bind2_node, list); +} + /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it @@ -105,17 +121,28 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - struct inet_bind_hashbucket *bhead; + struct inet_bind_hashbucket *bhead, *bhead2; + /* Step 1: Put TW into bind hash. Original socket stays there too. 
Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; + bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num); + spin_lock(&bhead->lock); + spin_lock(&bhead2->lock); + tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); + + tw->tw_tb2 = icsk->icsk_bind2_hash; + WARN_ON(!icsk->icsk_bind2_hash); + inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow); + + spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); spin_lock(lock); -- cgit From 2c042e8e54efb2b8e25ed0cb28224e79948dc8ce Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 26 Dec 2022 22:27:53 +0900 Subject: tcp: Add selftest for bind() and TIME_WAIT. bhash2 split the bind() validation logic into wildcard and non-wildcard cases. Let's add a test to catch future regression. Before the previous patch: # ./bind_timewait TAP version 13 1..2 # Starting 2 tests from 3 test cases. # RUN bind_timewait.localhost.1 ... # bind_timewait.c:87:1:Expected ret (0) == -1 (-1) # 1: Test terminated by assertion # FAIL bind_timewait.localhost.1 not ok 1 bind_timewait.localhost.1 # RUN bind_timewait.addrany.1 ... # OK bind_timewait.addrany.1 ok 2 bind_timewait.addrany.1 # FAILED: 1 / 2 tests passed. # Totals: pass:1 fail:1 xfail:0 xpass:0 skip:0 error:0 After: # ./bind_timewait TAP version 13 1..2 # Starting 2 tests from 3 test cases. # RUN bind_timewait.localhost.1 ... # OK bind_timewait.localhost.1 ok 1 bind_timewait.localhost.1 # RUN bind_timewait.addrany.1 ... # OK bind_timewait.addrany.1 ok 2 bind_timewait.addrany.1 # PASSED: 2 / 2 tests passed. # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Signed-off-by: David S. Miller --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/bind_timewait.c | 92 +++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 tools/testing/selftests/net/bind_timewait.c diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 9cc84114741d..a6911cae368c 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only bind_bhash +bind_timewait csum cmsg_sender diag_uid diff --git a/tools/testing/selftests/net/bind_timewait.c b/tools/testing/selftests/net/bind_timewait.c new file mode 100644 index 000000000000..cb9fdf51ea59 --- /dev/null +++ b/tools/testing/selftests/net/bind_timewait.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Amazon.com Inc. or its affiliates. 
*/ + +#include +#include + +#include "../kselftest_harness.h" + +FIXTURE(bind_timewait) +{ + struct sockaddr_in addr; + socklen_t addrlen; +}; + +FIXTURE_VARIANT(bind_timewait) +{ + __u32 addr_const; +}; + +FIXTURE_VARIANT_ADD(bind_timewait, localhost) +{ + .addr_const = INADDR_LOOPBACK +}; + +FIXTURE_VARIANT_ADD(bind_timewait, addrany) +{ + .addr_const = INADDR_ANY +}; + +FIXTURE_SETUP(bind_timewait) +{ + self->addr.sin_family = AF_INET; + self->addr.sin_port = 0; + self->addr.sin_addr.s_addr = htonl(variant->addr_const); + self->addrlen = sizeof(self->addr); +} + +FIXTURE_TEARDOWN(bind_timewait) +{ +} + +void create_timewait_socket(struct __test_metadata *_metadata, + FIXTURE_DATA(bind_timewait) *self) +{ + int server_fd, client_fd, child_fd, ret; + struct sockaddr_in addr; + socklen_t addrlen; + + server_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GT(server_fd, 0); + + ret = bind(server_fd, (struct sockaddr *)&self->addr, self->addrlen); + ASSERT_EQ(ret, 0); + + ret = listen(server_fd, 1); + ASSERT_EQ(ret, 0); + + ret = getsockname(server_fd, (struct sockaddr *)&self->addr, &self->addrlen); + ASSERT_EQ(ret, 0); + + client_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GT(client_fd, 0); + + ret = connect(client_fd, (struct sockaddr *)&self->addr, self->addrlen); + ASSERT_EQ(ret, 0); + + addrlen = sizeof(addr); + child_fd = accept(server_fd, (struct sockaddr *)&addr, &addrlen); + ASSERT_GT(child_fd, 0); + + close(child_fd); + close(client_fd); + close(server_fd); +} + +TEST_F(bind_timewait, 1) +{ + int fd, ret; + + create_timewait_socket(_metadata, self); + + fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GT(fd, 0); + + ret = bind(fd, (struct sockaddr *)&self->addr, self->addrlen); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EADDRINUSE); + + close(fd); +} + +TEST_HARNESS_MAIN -- cgit From 6b57bffa5f675a01c7981ed271e8521e87441abd Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Tue, 27 Dec 2022 22:45:07 +0100 Subject: net: ethernet: broadcom: bcm63xx_enet: Drop empty platform remove function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A remove callback just returning 0 is equivalent to no remove callback at all. So drop the useless function. Signed-off-by: Uwe Kleine-König Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index d91fdb0c2649..2cf96892e565 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -2784,17 +2784,11 @@ static int bcm_enet_shared_probe(struct platform_device *pdev) return 0; } -static int bcm_enet_shared_remove(struct platform_device *pdev) -{ - return 0; -} - /* this "shared" driver is needed because both macs share a single * address space */ struct platform_driver bcm63xx_enet_shared_driver = { .probe = bcm_enet_shared_probe, - .remove = bcm_enet_shared_remove, .driver = { .name = "bcm63xx_enet_shared", .owner = THIS_MODULE, -- cgit From af691c94d022440476b76560d310d6fea790cc60 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Tue, 27 Dec 2022 22:45:08 +0100 Subject: net: ethernet: freescale: enetc: Drop empty platform remove function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A remove callback just returning 0 is equivalent to no remove callback at all. So drop the useless function. 
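For illustration, a minimal sketch of a platform driver that simply omits .remove; when the callback is absent the driver core skips it, which is equivalent to a remove function that returns 0. The names here are invented and this is not the enetc or bcm63xx code:

  #include <linux/module.h>
  #include <linux/platform_device.h>

  static int demo_probe(struct platform_device *pdev)
  {
          return 0;        /* nothing to set up in this toy example */
  }

  static struct platform_driver demo_driver = {
          .probe  = demo_probe,
          /* no .remove: nothing needs to be undone at unbind time */
          .driver = {
                  .name = "demo-nop",
          },
  };
  module_platform_driver(demo_driver);

  MODULE_LICENSE("GPL");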
Signed-off-by: Uwe Kleine-König Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc_ierb.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ierb.c b/drivers/net/ethernet/freescale/enetc/enetc_ierb.c index 91f02c505028..b307bef4dc29 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ierb.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ierb.c @@ -127,11 +127,6 @@ static int enetc_ierb_probe(struct platform_device *pdev) return 0; } -static int enetc_ierb_remove(struct platform_device *pdev) -{ - return 0; -} - static const struct of_device_id enetc_ierb_match[] = { { .compatible = "fsl,ls1028a-enetc-ierb", }, {}, @@ -144,7 +139,6 @@ static struct platform_driver enetc_ierb_driver = { .of_match_table = enetc_ierb_match, }, .probe = enetc_ierb_probe, - .remove = enetc_ierb_remove, }; module_platform_driver(enetc_ierb_driver); -- cgit From fec7352117fa301bfbc31bacc14bb9a579376b36 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Wed, 28 Dec 2022 14:27:49 +0800 Subject: net: hns3: refine the handling for VF heartbeat Currently, the PF checks whether a VF is alive via the KEEP_ALIVE mailbox sent by the VF. The VF sends this mailbox every 2 seconds. Once the PF has not seen the mailbox for more than 8 seconds, it regards the VF as abnormal and stops notifying state changes (link state, VF MAC, reset) to the VF, even if it receives the KEEP_ALIVE mailbox again. This is unreasonable. This patch fixes it: the PF now records the state changes that need to be sent to the VF while the VF's KEEP_ALIVE mailbox is lost, and notifies the VF once the mailbox is received again. Introduce a new flag, HCLGE_VPORT_STATE_INITED, to distinguish whether the VF driver is loaded or not. Since the VF queries these states when initializing, there is no need to notify it in that case. Fixes: aa5c4f175be6 ("net: hns3: add reset handling for VF when doing PF reset") Signed-off-by: Jian Shen Signed-off-by: Hao Lan Reported-by: kernel test robot Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 57 ++++++++++++----- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 7 +++ .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 71 +++++++++++++++++++--- 3 files changed, 112 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 6c2742f59c77..07ad5f35219e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3910,9 +3910,17 @@ static int hclge_set_all_vf_rst(struct hclge_dev *hdev, bool reset) return ret; } - if (!reset || !test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) + if (!reset || + !test_bit(HCLGE_VPORT_STATE_INITED, &vport->state)) continue; + if (!test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state) && + hdev->reset_type == HNAE3_FUNC_RESET) { + set_bit(HCLGE_VPORT_NEED_NOTIFY_RESET, + &vport->need_notify); + continue; + } + /* Inform VF to process the reset. * hclge_inform_reset_assert_to_vf may fail if VF * driver is not loaded.
@@ -4609,18 +4617,25 @@ static void hclge_reset_service_task(struct hclge_dev *hdev) static void hclge_update_vport_alive(struct hclge_dev *hdev) { +#define HCLGE_ALIVE_SECONDS_NORMAL 8 + + unsigned long alive_time = HCLGE_ALIVE_SECONDS_NORMAL * HZ; int i; /* start from vport 1 for PF is always alive */ for (i = 1; i < hdev->num_alloc_vport; i++) { struct hclge_vport *vport = &hdev->vport[i]; - if (time_after(jiffies, vport->last_active_jiffies + 8 * HZ)) + if (!test_bit(HCLGE_VPORT_STATE_INITED, &vport->state) || + !test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) + continue; + if (time_after(jiffies, vport->last_active_jiffies + + alive_time)) { clear_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); - - /* If vf is not alive, set to default value */ - if (!test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) - vport->mps = HCLGE_MAC_DEFAULT_FRAME; + dev_warn(&hdev->pdev->dev, + "VF %u heartbeat timeout\n", + i - HCLGE_VF_VPORT_START_NUM); + } } } @@ -8064,9 +8079,11 @@ int hclge_vport_start(struct hclge_vport *vport) { struct hclge_dev *hdev = vport->back; + set_bit(HCLGE_VPORT_STATE_INITED, &vport->state); set_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); set_bit(HCLGE_VPORT_STATE_PROMISC_CHANGE, &vport->state); vport->last_active_jiffies = jiffies; + vport->need_notify = 0; if (test_bit(vport->vport_id, hdev->vport_config_block)) { if (vport->vport_id) { @@ -8084,7 +8101,9 @@ int hclge_vport_start(struct hclge_vport *vport) void hclge_vport_stop(struct hclge_vport *vport) { + clear_bit(HCLGE_VPORT_STATE_INITED, &vport->state); clear_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); + vport->need_notify = 0; } static int hclge_client_start(struct hnae3_handle *handle) @@ -9208,7 +9227,8 @@ static int hclge_set_vf_mac(struct hnae3_handle *handle, int vf, return 0; } - dev_info(&hdev->pdev->dev, "MAC of VF %d has been set to %s\n", + dev_info(&hdev->pdev->dev, + "MAC of VF %d has been set to %s, will be active after VF reset\n", vf, format_mac_addr); return 0; } @@ -10465,12 +10485,16 @@ static int hclge_set_vf_vlan_filter(struct hnae3_handle *handle, int vfid, * for DEVICE_VERSION_V3, vf doesn't need to know about the port based * VLAN state. 
*/ - if (ae_dev->dev_version < HNAE3_DEVICE_VERSION_V3 && - test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) - (void)hclge_push_vf_port_base_vlan_info(&hdev->vport[0], - vport->vport_id, - state, &vlan_info); - + if (ae_dev->dev_version < HNAE3_DEVICE_VERSION_V3) { + if (test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) + (void)hclge_push_vf_port_base_vlan_info(&hdev->vport[0], + vport->vport_id, + state, + &vlan_info); + else + set_bit(HCLGE_VPORT_NEED_NOTIFY_VF_VLAN, + &vport->need_notify); + } return 0; } @@ -11941,7 +11965,7 @@ static void hclge_reset_vport_state(struct hclge_dev *hdev) int i; for (i = 0; i < hdev->num_alloc_vport; i++) { - hclge_vport_stop(vport); + clear_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); vport++; } } @@ -12955,6 +12979,11 @@ static void hclge_clear_vport_vf_info(struct hclge_vport *vport, int vfid) struct hclge_vlan_info vlan_info; int ret; + clear_bit(HCLGE_VPORT_STATE_INITED, &vport->state); + clear_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); + vport->need_notify = 0; + vport->mps = 0; + /* after disable sriov, clean VF rate configured by PF */ ret = hclge_tm_qs_shaper_cfg(vport, 0); if (ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 495b639b0dc2..13f23d606e77 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -995,9 +995,15 @@ enum HCLGE_VPORT_STATE { HCLGE_VPORT_STATE_MAC_TBL_CHANGE, HCLGE_VPORT_STATE_PROMISC_CHANGE, HCLGE_VPORT_STATE_VLAN_FLTR_CHANGE, + HCLGE_VPORT_STATE_INITED, HCLGE_VPORT_STATE_MAX }; +enum HCLGE_VPORT_NEED_NOTIFY { + HCLGE_VPORT_NEED_NOTIFY_RESET, + HCLGE_VPORT_NEED_NOTIFY_VF_VLAN, +}; + struct hclge_vlan_info { u16 vlan_proto; /* so far support 802.1Q only */ u16 qos; @@ -1044,6 +1050,7 @@ struct hclge_vport { struct hnae3_handle roce; unsigned long state; + unsigned long need_notify; unsigned long last_active_jiffies; u32 mps; /* Max packet size */ struct hclge_vf_info vf_info; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index a7b06c63143c..04ff9bf12185 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -124,17 +124,26 @@ static int hclge_send_mbx_msg(struct hclge_vport *vport, u8 *msg, u16 msg_len, return status; } +static int hclge_inform_vf_reset(struct hclge_vport *vport, u16 reset_type) +{ + __le16 msg_data; + u8 dest_vfid; + + dest_vfid = (u8)vport->vport_id; + msg_data = cpu_to_le16(reset_type); + + /* send this requested info to VF */ + return hclge_send_mbx_msg(vport, (u8 *)&msg_data, sizeof(msg_data), + HCLGE_MBX_ASSERTING_RESET, dest_vfid); +} + int hclge_inform_reset_assert_to_vf(struct hclge_vport *vport) { struct hclge_dev *hdev = vport->back; - __le16 msg_data; u16 reset_type; - u8 dest_vfid; BUILD_BUG_ON(HNAE3_MAX_RESET > U16_MAX); - dest_vfid = (u8)vport->vport_id; - if (hdev->reset_type == HNAE3_FUNC_RESET) reset_type = HNAE3_VF_PF_FUNC_RESET; else if (hdev->reset_type == HNAE3_FLR_RESET) @@ -142,11 +151,7 @@ int hclge_inform_reset_assert_to_vf(struct hclge_vport *vport) else reset_type = HNAE3_VF_FUNC_RESET; - msg_data = cpu_to_le16(reset_type); - - /* send this requested info to VF */ - return hclge_send_mbx_msg(vport, (u8 *)&msg_data, sizeof(msg_data), - HCLGE_MBX_ASSERTING_RESET, dest_vfid); + return hclge_inform_vf_reset(vport, reset_type); } static void 
hclge_free_vector_ring_chain(struct hnae3_ring_chain_node *head) @@ -652,9 +657,56 @@ static int hclge_reset_vf(struct hclge_vport *vport) return hclge_func_reset_cmd(hdev, vport->vport_id); } +static void hclge_notify_vf_config(struct hclge_vport *vport) +{ + struct hclge_dev *hdev = vport->back; + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); + struct hclge_port_base_vlan_config *vlan_cfg; + int ret; + + hclge_push_vf_link_status(vport); + if (test_bit(HCLGE_VPORT_NEED_NOTIFY_RESET, &vport->need_notify)) { + ret = hclge_inform_vf_reset(vport, HNAE3_VF_PF_FUNC_RESET); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to inform VF %u reset!", + vport->vport_id - HCLGE_VF_VPORT_START_NUM); + return; + } + vport->need_notify = 0; + return; + } + + if (ae_dev->dev_version < HNAE3_DEVICE_VERSION_V3 && + test_bit(HCLGE_VPORT_NEED_NOTIFY_VF_VLAN, &vport->need_notify)) { + vlan_cfg = &vport->port_base_vlan_cfg; + ret = hclge_push_vf_port_base_vlan_info(&hdev->vport[0], + vport->vport_id, + vlan_cfg->state, + &vlan_cfg->vlan_info); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to inform VF %u port base vlan!", + vport->vport_id - HCLGE_VF_VPORT_START_NUM); + return; + } + clear_bit(HCLGE_VPORT_NEED_NOTIFY_VF_VLAN, &vport->need_notify); + } +} + static void hclge_vf_keep_alive(struct hclge_vport *vport) { + struct hclge_dev *hdev = vport->back; + vport->last_active_jiffies = jiffies; + + if (test_bit(HCLGE_VPORT_STATE_INITED, &vport->state) && + !test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) { + set_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state); + dev_info(&hdev->pdev->dev, "VF %u is alive!", + vport->vport_id - HCLGE_VF_VPORT_START_NUM); + hclge_notify_vf_config(vport); + } } static int hclge_set_vf_mtu(struct hclge_vport *vport, @@ -954,6 +1006,7 @@ static int hclge_mbx_vf_uninit_handler(struct hclge_mbx_ops_param *param) hclge_rm_vport_all_mac_table(param->vport, true, HCLGE_MAC_ADDR_MC); hclge_rm_vport_all_vlan_table(param->vport, true); + param->vport->mps = 0; return 0; } -- cgit From d530ece70f16f912e1d1bfeea694246ab78b0a4b Mon Sep 17 00:00:00 2001 From: Jiguang Xiao Date: Wed, 28 Dec 2022 16:14:47 +0800 Subject: net: amd-xgbe: add missed tasklet_kill The driver does not call tasklet_kill in several places. Add the calls to fix it. Fixes: 85b85c853401 ("amd-xgbe: Re-issue interrupt if interrupt status not cleared") Signed-off-by: Jiguang Xiao Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 3 +++ drivers/net/ethernet/amd/xgbe/xgbe-i2c.c | 4 +++- drivers/net/ethernet/amd/xgbe/xgbe-mdio.c | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 7b666106feee..614c0278419b 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1064,6 +1064,9 @@ static void xgbe_free_irqs(struct xgbe_prv_data *pdata) devm_free_irq(pdata->dev, pdata->dev_irq, pdata); + tasklet_kill(&pdata->tasklet_dev); + tasklet_kill(&pdata->tasklet_ecc); + if (pdata->vdata->ecc_support && (pdata->dev_irq != pdata->ecc_irq)) devm_free_irq(pdata->dev, pdata->ecc_irq, pdata); diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c b/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c index 22d4fc547a0a..a9ccc4258ee5 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c @@ -447,8 +447,10 @@ static void xgbe_i2c_stop(struct xgbe_prv_data *pdata) xgbe_i2c_disable(pdata); xgbe_i2c_clear_all_interrupts(pdata); - if (pdata->dev_irq != pdata->i2c_irq) + if (pdata->dev_irq != pdata->i2c_irq) { devm_free_irq(pdata->dev, pdata->i2c_irq, pdata); + tasklet_kill(&pdata->tasklet_i2c); + } } static int xgbe_i2c_start(struct xgbe_prv_data *pdata) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c index 4e97b4869522..0c5c1b155683 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c @@ -1390,8 +1390,10 @@ static void xgbe_phy_stop(struct xgbe_prv_data *pdata) /* Disable auto-negotiation */ xgbe_an_disable_all(pdata); - if (pdata->dev_irq != pdata->an_irq) + if (pdata->dev_irq != pdata->an_irq) { devm_free_irq(pdata->dev, pdata->an_irq, pdata); + tasklet_kill(&pdata->tasklet_an); + } pdata->phy_if.phy_impl.stop(pdata); -- cgit From 1573c6882018f69991aead951d09423ce978adac Mon Sep 17 00:00:00 2001 From: Po-Hsu Lin Date: Thu, 29 Dec 2022 13:41:06 +0800 Subject: selftests: net: fix cmsg_so_mark.sh test hang This cmsg_so_mark.sh test will hang on non-amd64 systems because of the infinity loop for argument parsing in cmsg_sender. Variable "o" in cs_parse_args() for taking getopt() should be an int, otherwise it will be 255 when getopt() returns -1 on non-amd64 system and thus causing infinity loop. Link: https://lore.kernel.org/lkml/CA+G9fYsM2k7mrF7W4V_TrZ-qDauWM394=8yEJ=-t1oUg8_40YA@mail.gmail.com/t/ Signed-off-by: Po-Hsu Lin Signed-off-by: David S. Miller --- tools/testing/selftests/net/cmsg_sender.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index 75dd83e39207..24b21b15ed3f 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -110,7 +110,7 @@ static void __attribute__((noreturn)) cs_usage(const char *bin) static void cs_parse_args(int argc, char *argv[]) { - char o; + int o; while ((o = getopt(argc, argv, "46sS:p:m:M:d:tf:F:c:C:l:L:H:")) != -1) { switch (o) { -- cgit From 332b49ff637d6c1a75b971022a8b992cf3c57db1 Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Thu, 29 Dec 2022 07:30:05 +0000 Subject: net: ena: Fix toeplitz initial hash value On driver initialization, RSS hash initial value is set to zero, instead of the default value. 
This happens because we pass NULL as the RSS key parameter, which caused us to never initialize the RSS hash value. This patch fixes it by making sure the initial value is set, no matter what the value of the RSS key is. Fixes: 91a65b7d3ed8 ("net: ena: fix potential crash when rxfh key is NULL") Signed-off-by: Nati Koler Signed-off-by: David Arinzon Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_com.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 8c8b4c88c7de..451c3a1b6255 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -2400,29 +2400,18 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, return -EOPNOTSUPP; } - switch (func) { - case ENA_ADMIN_TOEPLITZ: - if (key) { - if (key_len != sizeof(hash_key->key)) { - netdev_err(ena_dev->net_device, - "key len (%u) doesn't equal the supported size (%zu)\n", - key_len, sizeof(hash_key->key)); - return -EINVAL; - } - memcpy(hash_key->key, key, key_len); - rss->hash_init_val = init_val; - hash_key->key_parts = key_len / sizeof(hash_key->key[0]); + if ((func == ENA_ADMIN_TOEPLITZ) && key) { + if (key_len != sizeof(hash_key->key)) { + netdev_err(ena_dev->net_device, + "key len (%u) doesn't equal the supported size (%zu)\n", + key_len, sizeof(hash_key->key)); + return -EINVAL; } - break; - case ENA_ADMIN_CRC32: - rss->hash_init_val = init_val; - break; - default: - netdev_err(ena_dev->net_device, "Invalid hash function (%d)\n", - func); - return -EINVAL; + memcpy(hash_key->key, key, key_len); + hash_key->key_parts = key_len / sizeof(hash_key->key[0]); } + rss->hash_init_val = init_val; old_func = rss->hash_func; rss->hash_func = func; rc = ena_com_set_hash_function(ena_dev); -- cgit From 9c9e539956fa67efb8a65e32b72a853740b33445 Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Thu, 29 Dec 2022 07:30:06 +0000 Subject: net: ena: Don't register memory info on XDP exchange Since the queues aren't destroyed when we only exchange XDP programs, there's no need to re-register them again. Fixes: 548c4940b9f1 ("net: ena: Implement XDP_TX action") Signed-off-by: Shay Agroskin Signed-off-by: David Arinzon Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index a95529a69cbb..6ba9b06719a0 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -512,16 +512,18 @@ static void ena_xdp_exchange_program_rx_in_range(struct ena_adapter *adapter, struct bpf_prog *prog, int first, int count) { + struct bpf_prog *old_bpf_prog; struct ena_ring *rx_ring; int i = 0; for (i = first; i < count; i++) { rx_ring = &adapter->rx_ring[i]; - xchg(&rx_ring->xdp_bpf_prog, prog); - if (prog) { + old_bpf_prog = xchg(&rx_ring->xdp_bpf_prog, prog); + + if (!old_bpf_prog && prog) { ena_xdp_register_rxq_info(rx_ring); rx_ring->rx_headroom = XDP_PACKET_HEADROOM; - } else { + } else if (old_bpf_prog && !prog) { ena_xdp_unregister_rxq_info(rx_ring); rx_ring->rx_headroom = NET_SKB_PAD; } -- cgit From c7f5e34d906320fdc996afa616676161c029cc02 Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Thu, 29 Dec 2022 07:30:07 +0000 Subject: net: ena: Account for the number of processed bytes in XDP The size of packets that were forwarded or dropped by XDP wasn't added to the total processed bytes statistic. Fixes: 548c4940b9f1 ("net: ena: Implement XDP_TX action") Signed-off-by: Shay Agroskin Signed-off-by: David Arinzon Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 6ba9b06719a0..9ae86bd3d457 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1719,6 +1719,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, } if (xdp_verdict != XDP_PASS) { xdp_flags |= xdp_verdict; + total_len += ena_rx_ctx.ena_bufs[0].len; res_budget--; continue; } -- cgit From 59811faa2c54dbcf44d575b5a8f6e7077da88dc2 Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Thu, 29 Dec 2022 07:30:08 +0000 Subject: net: ena: Use bitmask to indicate packet redirection Redirecting packets with XDP Redirect is done in two phases: 1. A packet is passed by the driver to the kernel using xdp_do_redirect(). 2. After finishing polling for new packets the driver lets the kernel know that it can now process the redirected packet using xdp_do_flush_map(). The packets' redirection is handled in the napi context of the queue that called xdp_do_redirect() To avoid calling xdp_do_flush_map() each time the driver first checks whether any packets were redirected, using xdp_flags |= xdp_verdict; and if (xdp_flags & XDP_REDIRECT) xdp_do_flush_map() essentially treating XDP instructions as a bitmask, which isn't the case: enum xdp_action { XDP_ABORTED = 0, XDP_DROP, XDP_PASS, XDP_TX, XDP_REDIRECT, }; Given the current possible values of xdp_action, the current design doesn't have a bug (since XDP_REDIRECT = 100b), but it is still flawed. This patch makes the driver use a bitmask instead, to avoid future issues. Fixes: a318c70ad152 ("net: ena: introduce XDP redirect implementation") Signed-off-by: Shay Agroskin Signed-off-by: David Arinzon Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 26 ++++++++++++++++---------- drivers/net/ethernet/amazon/ena/ena_netdev.h | 9 +++++++++ 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 9ae86bd3d457..a67f55e5f755 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -374,9 +374,9 @@ static int ena_xdp_xmit(struct net_device *dev, int n, static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) { + u32 verdict = ENA_XDP_PASS; struct bpf_prog *xdp_prog; struct ena_ring *xdp_ring; - u32 verdict = XDP_PASS; struct xdp_frame *xdpf; u64 *xdp_stat; @@ -393,7 +393,7 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) if (unlikely(!xdpf)) { trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); xdp_stat = &rx_ring->rx_stats.xdp_aborted; - verdict = XDP_ABORTED; + verdict = ENA_XDP_DROP; break; } @@ -409,29 +409,35 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) spin_unlock(&xdp_ring->xdp_tx_lock); xdp_stat = &rx_ring->rx_stats.xdp_tx; + verdict = ENA_XDP_TX; break; case XDP_REDIRECT: if (likely(!xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog))) { xdp_stat = &rx_ring->rx_stats.xdp_redirect; + verdict = ENA_XDP_REDIRECT; break; } trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); xdp_stat = &rx_ring->rx_stats.xdp_aborted; - verdict = XDP_ABORTED; + verdict = ENA_XDP_DROP; break; case XDP_ABORTED: trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict); xdp_stat = &rx_ring->rx_stats.xdp_aborted; + verdict = ENA_XDP_DROP; break; case XDP_DROP: xdp_stat = &rx_ring->rx_stats.xdp_drop; + verdict = ENA_XDP_DROP; break; case XDP_PASS: xdp_stat = &rx_ring->rx_stats.xdp_pass; + verdict = ENA_XDP_PASS; break; default: bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, verdict); xdp_stat = &rx_ring->rx_stats.xdp_invalid; + verdict = ENA_XDP_DROP; } ena_increase_stat(xdp_stat, 1, &rx_ring->syncp); @@ -1621,12 +1627,12 @@ static int ena_xdp_handle_buff(struct ena_ring *rx_ring, struct xdp_buff *xdp) * we expect, then we simply drop it */ if (unlikely(rx_ring->ena_bufs[0].len > ENA_XDP_MAX_MTU)) - return XDP_DROP; + return ENA_XDP_DROP; ret = ena_xdp_execute(rx_ring, xdp); /* The xdp program might expand the headers */ - if (ret == XDP_PASS) { + if (ret == ENA_XDP_PASS) { rx_info->page_offset = xdp->data - xdp->data_hard_start; rx_ring->ena_bufs[0].len = xdp->data_end - xdp->data; } @@ -1665,7 +1671,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, xdp_init_buff(&xdp, ENA_PAGE_SIZE, &rx_ring->xdp_rxq); do { - xdp_verdict = XDP_PASS; + xdp_verdict = ENA_XDP_PASS; skb = NULL; ena_rx_ctx.ena_bufs = rx_ring->ena_bufs; ena_rx_ctx.max_bufs = rx_ring->sgl_size; @@ -1693,7 +1699,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, xdp_verdict = ena_xdp_handle_buff(rx_ring, &xdp); /* allocate skb and fill it */ - if (xdp_verdict == XDP_PASS) + if (xdp_verdict == ENA_XDP_PASS) skb = ena_rx_skb(rx_ring, rx_ring->ena_bufs, ena_rx_ctx.descs, @@ -1711,13 +1717,13 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, /* Packets was passed for transmission, unmap it * from RX side. 
*/ - if (xdp_verdict == XDP_TX || xdp_verdict == XDP_REDIRECT) { + if (xdp_verdict & ENA_XDP_FORWARDED) { ena_unmap_rx_buff(rx_ring, &rx_ring->rx_buffer_info[req_id]); rx_ring->rx_buffer_info[req_id].page = NULL; } } - if (xdp_verdict != XDP_PASS) { + if (xdp_verdict != ENA_XDP_PASS) { xdp_flags |= xdp_verdict; total_len += ena_rx_ctx.ena_bufs[0].len; res_budget--; @@ -1763,7 +1769,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, ena_refill_rx_bufs(rx_ring, refill_required); } - if (xdp_flags & XDP_REDIRECT) + if (xdp_flags & ENA_XDP_REDIRECT) xdp_do_flush_map(); return work_done; diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h index 1bdce99bf688..290ae9bf47ee 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h @@ -409,6 +409,15 @@ enum ena_xdp_errors_t { ENA_XDP_NO_ENOUGH_QUEUES, }; +enum ENA_XDP_ACTIONS { + ENA_XDP_PASS = 0, + ENA_XDP_TX = BIT(0), + ENA_XDP_REDIRECT = BIT(1), + ENA_XDP_DROP = BIT(2) +}; + +#define ENA_XDP_FORWARDED (ENA_XDP_TX | ENA_XDP_REDIRECT) + static inline bool ena_xdp_present(struct ena_adapter *adapter) { return !!adapter->xdp_bpf_prog; -- cgit From c7062aaee099f2f43d6f07a71744b44b94b94b34 Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Thu, 29 Dec 2022 07:30:09 +0000 Subject: net: ena: Fix rx_copybreak value update Make the upper bound on rx_copybreak tighter, by making sure it is smaller than the minimum of mtu and ENA_PAGE_SIZE. With the current upper bound of mtu, rx_copybreak can be larger than a page. Such large rx_copybreak will not bring any performance benefit to the user and therefore makes no sense. In addition, the value update was only reflected in the adapter structure, but not applied for each ring, causing it to not take effect. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Osama Abboud Signed-off-by: Arthur Kiyanovski Signed-off-by: David Arinzon Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_ethtool.c | 6 +----- drivers/net/ethernet/amazon/ena/ena_netdev.c | 18 ++++++++++++++++++ drivers/net/ethernet/amazon/ena/ena_netdev.h | 2 ++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c index 48ae6d810f8f..8da79eedc057 100644 --- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c +++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c @@ -887,11 +887,7 @@ static int ena_set_tunable(struct net_device *netdev, switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: len = *(u32 *)data; - if (len > adapter->netdev->mtu) { - ret = -EINVAL; - break; - } - adapter->rx_copybreak = len; + ret = ena_set_rx_copybreak(adapter, len); break; default: ret = -EINVAL; diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index a67f55e5f755..80a726932e81 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -2814,6 +2814,24 @@ int ena_update_queue_sizes(struct ena_adapter *adapter, return dev_was_up ? 
ena_up(adapter) : 0; } +int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak) +{ + struct ena_ring *rx_ring; + int i; + + if (rx_copybreak > min_t(u16, adapter->netdev->mtu, ENA_PAGE_SIZE)) + return -EINVAL; + + adapter->rx_copybreak = rx_copybreak; + + for (i = 0; i < adapter->num_io_queues; i++) { + rx_ring = &adapter->rx_ring[i]; + rx_ring->rx_copybreak = rx_copybreak; + } + + return 0; +} + int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count) { struct ena_com_dev *ena_dev = adapter->ena_dev; diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h index 290ae9bf47ee..f9d862b630fa 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h @@ -392,6 +392,8 @@ int ena_update_queue_sizes(struct ena_adapter *adapter, int ena_update_queue_count(struct ena_adapter *adapter, u32 new_channel_count); +int ena_set_rx_copybreak(struct ena_adapter *adapter, u32 rx_copybreak); + int ena_get_sset_count(struct net_device *netdev, int sset); static inline void ena_reset_device(struct ena_adapter *adapter, -- cgit From e712f3e4920b3a1a5e6b536827d118e14862896c Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Thu, 29 Dec 2022 07:30:10 +0000 Subject: net: ena: Set default value for RX interrupt moderation RX ring can be NULL in XDP use cases where only TX queues are configured. In this scenario, the RX interrupt moderation value sent to the device remains in its default value of 0. In this change, setting the default value of the RX interrupt moderation to be the same as of the TX. Fixes: 548c4940b9f1 ("net: ena: Implement XDP_TX action") Signed-off-by: David Arinzon Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 80a726932e81..99f80c2d560a 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1823,8 +1823,9 @@ static void ena_adjust_adaptive_rx_intr_moderation(struct ena_napi *ena_napi) static void ena_unmask_interrupt(struct ena_ring *tx_ring, struct ena_ring *rx_ring) { + u32 rx_interval = tx_ring->smoothed_interval; struct ena_eth_io_intr_reg intr_reg; - u32 rx_interval = 0; + /* Rx ring can be NULL when for XDP tx queues which don't have an * accompanying rx_ring pair. */ -- cgit From a8ee104f986e720cea52133885cc822d459398c7 Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Thu, 29 Dec 2022 07:30:11 +0000 Subject: net: ena: Update NUMA TPH hint register upon NUMA node update The device supports a PCIe optimization hint, which indicates on which NUMA the queue is currently processed. This hint is utilized by PCIe in order to reduce its access time by accessing the correct NUMA resources and maintaining cache coherence. The driver calls the register update for the hint (called TPH - TLP Processing Hint) during the NAPI loop. Though the update is expected upon a NUMA change (when a queue is moved from one NUMA to the other), the current logic performs a register update when the queue is moved to a different CPU, but the CPU is not necessarily in a different NUMA. The changes include: 1. Performing the TPH update only when the queue has switched a NUMA node. 2. Moving the TPH update call to be triggered only when NAPI was scheduled from interrupt context, as opposed to a busy-polling loop. 
This is due to the fact that during busy-polling, the frequency of CPU switches for a particular queue is significantly higher, thus, the likelihood to switch NUMA is much higher. Therefore, providing the frequent updates to the device upon a NUMA update are unlikely to be beneficial. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: David Arinzon Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 27 ++++++++++++++++++--------- drivers/net/ethernet/amazon/ena/ena_netdev.h | 6 ++++-- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 99f80c2d560a..e8ad5ea31aff 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -680,6 +680,7 @@ static void ena_init_io_rings_common(struct ena_adapter *adapter, ring->ena_dev = adapter->ena_dev; ring->per_napi_packets = 0; ring->cpu = 0; + ring->numa_node = 0; ring->no_interrupt_event_cnt = 0; u64_stats_init(&ring->syncp); } @@ -783,6 +784,7 @@ static int ena_setup_tx_resources(struct ena_adapter *adapter, int qid) tx_ring->next_to_use = 0; tx_ring->next_to_clean = 0; tx_ring->cpu = ena_irq->cpu; + tx_ring->numa_node = node; return 0; err_push_buf_intermediate_buf: @@ -915,6 +917,7 @@ static int ena_setup_rx_resources(struct ena_adapter *adapter, rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; rx_ring->cpu = ena_irq->cpu; + rx_ring->numa_node = node; return 0; } @@ -1863,20 +1866,27 @@ static void ena_update_ring_numa_node(struct ena_ring *tx_ring, if (likely(tx_ring->cpu == cpu)) goto out; + tx_ring->cpu = cpu; + if (rx_ring) + rx_ring->cpu = cpu; + numa_node = cpu_to_node(cpu); + + if (likely(tx_ring->numa_node == numa_node)) + goto out; + put_cpu(); if (numa_node != NUMA_NO_NODE) { ena_com_update_numa_node(tx_ring->ena_com_io_cq, numa_node); - if (rx_ring) + tx_ring->numa_node = numa_node; + if (rx_ring) { + rx_ring->numa_node = numa_node; ena_com_update_numa_node(rx_ring->ena_com_io_cq, numa_node); + } } - tx_ring->cpu = cpu; - if (rx_ring) - rx_ring->cpu = cpu; - return; out: put_cpu(); @@ -1997,11 +2007,10 @@ static int ena_io_poll(struct napi_struct *napi, int budget) if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev)) ena_adjust_adaptive_rx_intr_moderation(ena_napi); + ena_update_ring_numa_node(tx_ring, rx_ring); ena_unmask_interrupt(tx_ring, rx_ring); } - ena_update_ring_numa_node(tx_ring, rx_ring); - ret = rx_work_done; } else { ret = budget; @@ -2386,7 +2395,7 @@ static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid) ctx.mem_queue_type = ena_dev->tx_mem_queue_type; ctx.msix_vector = msix_vector; ctx.queue_size = tx_ring->ring_size; - ctx.numa_node = cpu_to_node(tx_ring->cpu); + ctx.numa_node = tx_ring->numa_node; rc = ena_com_create_io_queue(ena_dev, &ctx); if (rc) { @@ -2454,7 +2463,7 @@ static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid) ctx.mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST; ctx.msix_vector = msix_vector; ctx.queue_size = rx_ring->ring_size; - ctx.numa_node = cpu_to_node(rx_ring->cpu); + ctx.numa_node = rx_ring->numa_node; rc = ena_com_create_io_queue(ena_dev, &ctx); if (rc) { diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h index f9d862b630fa..2cb141079474 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h @@ 
-262,9 +262,11 @@ struct ena_ring { bool disable_meta_caching; u16 no_interrupt_event_cnt; - /* cpu for TPH */ + /* cpu and NUMA for TPH */ int cpu; - /* number of tx/rx_buffer_info's entries */ + int numa_node; + + /* number of tx/rx_buffer_info's entries */ int ring_size; enum ena_admin_placement_policy_type tx_mem_queue_type; -- cgit From d039535850ee47079d59527e96be18d8e0daa84b Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 29 Dec 2022 10:29:25 +0400 Subject: net: phy: xgmiitorgmii: Fix refcount leak in xgmiitorgmii_probe of_phy_find_device() return device node with refcount incremented. Call put_device() to relese it when not needed anymore. Fixes: ab4e6ee578e8 ("net: phy: xgmiitorgmii: Check phy_driver ready before accessing") Signed-off-by: Miaoqian Lin Signed-off-by: David S. Miller --- drivers/net/phy/xilinx_gmii2rgmii.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/xilinx_gmii2rgmii.c b/drivers/net/phy/xilinx_gmii2rgmii.c index 8dcb49ed1f3d..7fd9fe6a602b 100644 --- a/drivers/net/phy/xilinx_gmii2rgmii.c +++ b/drivers/net/phy/xilinx_gmii2rgmii.c @@ -105,6 +105,7 @@ static int xgmiitorgmii_probe(struct mdio_device *mdiodev) if (!priv->phy_dev->drv) { dev_info(dev, "Attached phy not ready\n"); + put_device(&priv->phy_dev->mdio.dev); return -EPROBE_DEFER; } -- cgit From c5bc073668206c73c20798eb6d978b5e9db5b16f Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Wed, 14 Dec 2022 08:54:39 +0100 Subject: drm/i915: fix TLB invalidation for Gen12.50 video and compute engines In case of Gen12.50 video and compute engines, TLB_INV registers are masked - to modify one bit, corresponding bit in upper half of the register must be enabled, otherwise nothing happens. Fixes: 77fa9efc16a9 ("drm/i915/xehp: Create separate reg definitions for new MCR registers") Signed-off-by: Andrzej Hajda Reviewed-by: Tvrtko Ursulin Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20221214075439.402485-1-andrzej.hajda@intel.com (cherry picked from commit 4d5cf7b1680a1e6db327e3c935ef58325cbedb2c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_gt.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index 767e329e1cc5..9c18b5f2e789 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -1109,9 +1109,15 @@ static void mmio_invalidate_full(struct intel_gt *gt) continue; if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { + u32 val = BIT(engine->instance); + + if (engine->class == VIDEO_DECODE_CLASS || + engine->class == VIDEO_ENHANCEMENT_CLASS || + engine->class == COMPUTE_CLASS) + val = _MASKED_BIT_ENABLE(val); intel_gt_mcr_multicast_write_fw(gt, xehp_regs[engine->class], - BIT(engine->instance)); + val); } else { rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); if (!i915_mmio_reg_offset(rb.reg)) -- cgit From fff758698842fb6722be37498d8773e0fb47f000 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 14 Dec 2022 11:49:44 -0800 Subject: drm/i915: Remove __maybe_unused from mtl_info The attribute __maybe_unused should remain only until the respective info is not in the pciidlist. The info can't be added together with its definition because that would cause the driver to automatically probe for the device, while it's still not ready for that. However once pciidlist contains it, the attribute can be removed. 
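To make the lifecycle above concrete, here is a minimal, hypothetical C sketch of the pattern (generic names and IDs, not the real i915 tables): a per-device info structure can be compiled in ahead of time, and the __maybe_unused annotation is only needed for as long as no ID table entry references it.

#include <linux/pci.h>

/* Hypothetical per-device info, defined before the hardware is enabled. */
struct chip_info {
	unsigned int graphics_ver;
};

/*
 * __maybe_unused keeps -Wunused-const-variable quiet while nothing
 * references this definition yet. Once the ID table below gains an
 * entry pointing at it, the attribute is redundant and should go,
 * because adding the entry is what makes the device probeable.
 */
static const struct chip_info newchip_info __maybe_unused = {
	.graphics_ver = 14,
};

static const struct pci_device_id ids[] = {
	{ PCI_DEVICE(0x8086, 0x1234), .driver_data = (kernel_ulong_t)&newchip_info },
	{ /* sentinel */ }
};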
Fixes: 7835303982d1 ("drm/i915/mtl: Add MeteorLake PCI IDs") Signed-off-by: Lucas De Marchi Reviewed-by: Radhakrishna Sripada Link: https://patchwork.freedesktop.org/patch/msgid/20221214194944.3670344-1-lucas.demarchi@intel.com (cherry picked from commit 50490ce05b7a50b0bd4108fa7d6db3ca2972fa83) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_pci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 6da9784fe4a2..ccd1f864aa19 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -1129,7 +1129,6 @@ static const struct intel_gt_definition xelpmp_extra_gt[] = { {} }; -__maybe_unused static const struct intel_device_info mtl_info = { XE_HP_FEATURES, XE_LPDP_FEATURES, -- cgit From 3f882f2d4f689627c1566c2c92087bc3ff734953 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 16 Dec 2022 11:34:56 +0000 Subject: drm/i915: improve the catch-all evict to handle lock contention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The catch-all evict can fail due to object lock contention, since it only goes as far as trylocking the object, due to us already holding the vm->mutex. Doing a full object lock here can deadlock, since the vm->mutex is always our inner lock. Add another execbuf pass which drops the vm->mutex and then tries to grab the object will the full lock, before then retrying the eviction. This should be good enough for now to fix the immediate regression with userspace seeing -ENOSPC from execbuf due to contended object locks during GTT eviction. v2 (Mani) - Also revamp the docs for the different passes. Testcase: igt@gem_ppgtt@shrink-vs-evict-* Fixes: 7e00897be8bf ("drm/i915: Add object locking to i915_gem_evict_for_node and i915_gem_evict_something, v2.") References: https://gitlab.freedesktop.org/drm/intel/-/issues/7627 References: https://gitlab.freedesktop.org/drm/intel/-/issues/7570 References: https://bugzilla.mozilla.org/show_bug.cgi?id=1779558 Signed-off-by: Matthew Auld Cc: Maarten Lankhorst Cc: Thomas Hellström Cc: Tvrtko Ursulin Cc: Andrzej Hajda Cc: Mani Milani Cc: # v5.18+ Reviewed-by: Mani Milani Tested-by: Mani Milani Link: https://patchwork.freedesktop.org/patch/msgid/20221216113456.414183-1-matthew.auld@intel.com (cherry picked from commit 801fa7a81f6da533cc5442fc40e32c72b76cd42a) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 59 ++++++++++++++++++++----- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 2 +- drivers/gpu/drm/i915/i915_gem_evict.c | 37 +++++++++++----- drivers/gpu/drm/i915/i915_gem_evict.h | 4 +- drivers/gpu/drm/i915/i915_vma.c | 2 +- drivers/gpu/drm/i915/selftests/i915_gem_evict.c | 4 +- 6 files changed, 82 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index da09767fda07..f266b68cf012 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -730,32 +730,69 @@ static int eb_reserve(struct i915_execbuffer *eb) bool unpinned; /* - * Attempt to pin all of the buffers into the GTT. - * This is done in 2 phases: + * We have one more buffers that we couldn't bind, which could be due to + * various reasons. To resolve this we have 4 passes, with every next + * level turning the screws tighter: * - * 1. Unbind all objects that do not match the GTT constraints for - * the execbuffer (fenceable, mappable, alignment etc). - * 2. 
Bind new objects. + * 0. Unbind all objects that do not match the GTT constraints for the + * execbuffer (fenceable, mappable, alignment etc). Bind all new + * objects. This avoids unnecessary unbinding of later objects in order + * to make room for the earlier objects *unless* we need to defragment. * - * This avoid unnecessary unbinding of later objects in order to make - * room for the earlier objects *unless* we need to defragment. + * 1. Reorder the buffers, where objects with the most restrictive + * placement requirements go first (ignoring fixed location buffers for + * now). For example, objects needing the mappable aperture (the first + * 256M of GTT), should go first vs objects that can be placed just + * about anywhere. Repeat the previous pass. * - * Defragmenting is skipped if all objects are pinned at a fixed location. + * 2. Consider buffers that are pinned at a fixed location. Also try to + * evict the entire VM this time, leaving only objects that we were + * unable to lock. Try again to bind the buffers. (still using the new + * buffer order). + * + * 3. We likely have object lock contention for one or more stubborn + * objects in the VM, for which we need to evict to make forward + * progress (perhaps we are fighting the shrinker?). When evicting the + * VM this time around, anything that we can't lock we now track using + * the busy_bo, using the full lock (after dropping the vm->mutex to + * prevent deadlocks), instead of trylock. We then continue to evict the + * VM, this time with the stubborn object locked, which we can now + * hopefully unbind (if still bound in the VM). Repeat until the VM is + * evicted. Finally we should be able bind everything. */ - for (pass = 0; pass <= 2; pass++) { + for (pass = 0; pass <= 3; pass++) { int pin_flags = PIN_USER | PIN_VALIDATE; if (pass == 0) pin_flags |= PIN_NONBLOCK; if (pass >= 1) - unpinned = eb_unbind(eb, pass == 2); + unpinned = eb_unbind(eb, pass >= 2); if (pass == 2) { err = mutex_lock_interruptible(&eb->context->vm->mutex); if (!err) { - err = i915_gem_evict_vm(eb->context->vm, &eb->ww); + err = i915_gem_evict_vm(eb->context->vm, &eb->ww, NULL); + mutex_unlock(&eb->context->vm->mutex); + } + if (err) + return err; + } + + if (pass == 3) { +retry: + err = mutex_lock_interruptible(&eb->context->vm->mutex); + if (!err) { + struct drm_i915_gem_object *busy_bo = NULL; + + err = i915_gem_evict_vm(eb->context->vm, &eb->ww, &busy_bo); mutex_unlock(&eb->context->vm->mutex); + if (err && busy_bo) { + err = i915_gem_object_lock(busy_bo, &eb->ww); + i915_gem_object_put(busy_bo); + if (!err) + goto retry; + } } if (err) return err; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index c29efdef8313..0ad44f3868de 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -369,7 +369,7 @@ retry: if (vma == ERR_PTR(-ENOSPC)) { ret = mutex_lock_interruptible(&ggtt->vm.mutex); if (!ret) { - ret = i915_gem_evict_vm(&ggtt->vm, &ww); + ret = i915_gem_evict_vm(&ggtt->vm, &ww, NULL); mutex_unlock(&ggtt->vm.mutex); } if (ret) diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c index f025ee4fa526..a4b4d9b7d26c 100644 --- a/drivers/gpu/drm/i915/i915_gem_evict.c +++ b/drivers/gpu/drm/i915/i915_gem_evict.c @@ -416,6 +416,11 @@ int i915_gem_evict_for_node(struct i915_address_space *vm, * @vm: Address space to cleanse * @ww: An optional struct i915_gem_ww_ctx. 
If not NULL, i915_gem_evict_vm * will be able to evict vma's locked by the ww as well. + * @busy_bo: Optional pointer to struct drm_i915_gem_object. If not NULL, then + * in the event i915_gem_evict_vm() is unable to trylock an object for eviction, + * then @busy_bo will point to it. -EBUSY is also returned. The caller must drop + * the vm->mutex, before trying again to acquire the contended lock. The caller + * also owns a reference to the object. * * This function evicts all vmas from a vm. * @@ -425,7 +430,8 @@ int i915_gem_evict_for_node(struct i915_address_space *vm, * To clarify: This is for freeing up virtual address space, not for freeing * memory in e.g. the shrinker. */ -int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww) +int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww, + struct drm_i915_gem_object **busy_bo) { int ret = 0; @@ -457,15 +463,22 @@ int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww) * the resv is shared among multiple objects, we still * need the object ref. */ - if (dying_vma(vma) || + if (!i915_gem_object_get_rcu(vma->obj) || (ww && (dma_resv_locking_ctx(vma->obj->base.resv) == &ww->ctx))) { __i915_vma_pin(vma); list_add(&vma->evict_link, &locked_eviction_list); continue; } - if (!i915_gem_object_trylock(vma->obj, ww)) + if (!i915_gem_object_trylock(vma->obj, ww)) { + if (busy_bo) { + *busy_bo = vma->obj; /* holds ref */ + ret = -EBUSY; + break; + } + i915_gem_object_put(vma->obj); continue; + } __i915_vma_pin(vma); list_add(&vma->evict_link, &eviction_list); @@ -473,25 +486,29 @@ int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww) if (list_empty(&eviction_list) && list_empty(&locked_eviction_list)) break; - ret = 0; /* Unbind locked objects first, before unlocking the eviction_list */ list_for_each_entry_safe(vma, vn, &locked_eviction_list, evict_link) { __i915_vma_unpin(vma); - if (ret == 0) + if (ret == 0) { ret = __i915_vma_unbind(vma); - if (ret != -EINTR) /* "Get me out of here!" */ - ret = 0; + if (ret != -EINTR) /* "Get me out of here!" */ + ret = 0; + } + if (!dying_vma(vma)) + i915_gem_object_put(vma->obj); } list_for_each_entry_safe(vma, vn, &eviction_list, evict_link) { __i915_vma_unpin(vma); - if (ret == 0) + if (ret == 0) { ret = __i915_vma_unbind(vma); - if (ret != -EINTR) /* "Get me out of here!" */ - ret = 0; + if (ret != -EINTR) /* "Get me out of here!" 
*/ + ret = 0; + } i915_gem_object_unlock(vma->obj); + i915_gem_object_put(vma->obj); } } while (ret == 0); diff --git a/drivers/gpu/drm/i915/i915_gem_evict.h b/drivers/gpu/drm/i915/i915_gem_evict.h index e593c530f9bd..bf0ee0e4fe60 100644 --- a/drivers/gpu/drm/i915/i915_gem_evict.h +++ b/drivers/gpu/drm/i915/i915_gem_evict.h @@ -11,6 +11,7 @@ struct drm_mm_node; struct i915_address_space; struct i915_gem_ww_ctx; +struct drm_i915_gem_object; int __must_check i915_gem_evict_something(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww, @@ -23,6 +24,7 @@ int __must_check i915_gem_evict_for_node(struct i915_address_space *vm, struct drm_mm_node *node, unsigned int flags); int i915_gem_evict_vm(struct i915_address_space *vm, - struct i915_gem_ww_ctx *ww); + struct i915_gem_ww_ctx *ww, + struct drm_i915_gem_object **busy_bo); #endif /* __I915_GEM_EVICT_H__ */ diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 703fee6b5f75..3a33be5401ed 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -1566,7 +1566,7 @@ static int __i915_ggtt_pin(struct i915_vma *vma, struct i915_gem_ww_ctx *ww, * locked objects when called from execbuf when pinning * is removed. This would probably regress badly. */ - i915_gem_evict_vm(vm, NULL); + i915_gem_evict_vm(vm, NULL, NULL); mutex_unlock(&vm->mutex); } } while (1); diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c index 8c6517d29b8e..37068542aafe 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c @@ -344,7 +344,7 @@ static int igt_evict_vm(void *arg) /* Everything is pinned, nothing should happen */ mutex_lock(&ggtt->vm.mutex); - err = i915_gem_evict_vm(&ggtt->vm, NULL); + err = i915_gem_evict_vm(&ggtt->vm, NULL, NULL); mutex_unlock(&ggtt->vm.mutex); if (err) { pr_err("i915_gem_evict_vm on a full GGTT returned err=%d]\n", @@ -356,7 +356,7 @@ static int igt_evict_vm(void *arg) for_i915_gem_ww(&ww, err, false) { mutex_lock(&ggtt->vm.mutex); - err = i915_gem_evict_vm(&ggtt->vm, &ww); + err = i915_gem_evict_vm(&ggtt->vm, &ww, NULL); mutex_unlock(&ggtt->vm.mutex); } -- cgit From 11ce8fd8fd8718247f17475802639cd7e2d3765c Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 21 Dec 2022 11:30:31 -0800 Subject: drm/i915/uc: Fix two issues with over-size firmware files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the case where a firmware file is too large (e.g. someone downloaded a web page ASCII dump from github...), the firmware object is released but the pointer is not zerod. If no other firmware file was found then release would be called again leading to a double kfree. Also, the size check was only being applied to the initial firmware load not any of the subsequent attempts. So move the check into a wrapper that is used for all loads. 
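Distilled to its essentials, the fix described above is the release-and-clear idiom: when a lookup helper rejects and frees a resource it was handed a pointer for, it must also NULL the caller's pointer so no later cleanup path can free it twice, and the size policy lives in one wrapper so every retry goes through it. A hedged sketch of that shape (hypothetical wrapper name, not the exact i915 function):

#include <linux/firmware.h>
#include <linux/device.h>
#include <linux/errno.h>

/* Hypothetical wrapper: every firmware fetch attempt goes through here,
 * so the size cap applies to fallback blobs as well as the first one. */
static int try_blob_load(const struct firmware **fw, struct device *dev,
			 const char *path, size_t max_size)
{
	int err;

	err = firmware_request_nowarn(fw, path, dev);
	if (err)
		return err;

	if ((*fw)->size > max_size) {
		release_firmware(*fw);
		*fw = NULL;	/* prevents a double release on later cleanup */
		return -ENOENT;	/* caller may fall back to another blob */
	}

	return 0;
}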
Fixes: 016241168dc5 ("drm/i915/uc: use different ggtt pin offsets for uc loads") Signed-off-by: John Harrison Reviewed-by: Daniele Ceraolo Spurio Cc: Alan Previn Cc: Rodrigo Vivi Cc: Matt Roper Cc: Jani Nikula Cc: Matthew Auld Cc: "Thomas Hellström" Link: https://patchwork.freedesktop.org/patch/msgid/20221221193031.687266-4-John.C.Harrison@Intel.com (cherry picked from commit 4071d98b296a5bc5fd4b15ec651bd05800ec9510) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 42 +++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c index 0c80ba51a4bd..2bcdd192f814 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c @@ -545,6 +545,32 @@ static int check_ccs_header(struct intel_gt *gt, return 0; } +static int try_firmware_load(struct intel_uc_fw *uc_fw, const struct firmware **fw) +{ + struct intel_gt *gt = __uc_fw_to_gt(uc_fw); + struct device *dev = gt->i915->drm.dev; + int err; + + err = firmware_request_nowarn(fw, uc_fw->file_selected.path, dev); + + if (err) + return err; + + if ((*fw)->size > INTEL_UC_RSVD_GGTT_PER_FW) { + drm_err(>->i915->drm, + "%s firmware %s: size (%zuKB) exceeds max supported size (%uKB)\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->file_selected.path, + (*fw)->size / SZ_1K, INTEL_UC_RSVD_GGTT_PER_FW / SZ_1K); + + /* try to find another blob to load */ + release_firmware(*fw); + *fw = NULL; + return -ENOENT; + } + + return 0; +} + /** * intel_uc_fw_fetch - fetch uC firmware * @uc_fw: uC firmware @@ -558,7 +584,6 @@ int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw) struct intel_gt *gt = __uc_fw_to_gt(uc_fw); struct drm_i915_private *i915 = gt->i915; struct intel_uc_fw_file file_ideal; - struct device *dev = i915->drm.dev; struct drm_i915_gem_object *obj; const struct firmware *fw = NULL; bool old_ver = false; @@ -574,20 +599,9 @@ int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw) __force_fw_fetch_failures(uc_fw, -EINVAL); __force_fw_fetch_failures(uc_fw, -ESTALE); - err = firmware_request_nowarn(&fw, uc_fw->file_selected.path, dev); + err = try_firmware_load(uc_fw, &fw); memcpy(&file_ideal, &uc_fw->file_wanted, sizeof(file_ideal)); - if (!err && fw->size > INTEL_UC_RSVD_GGTT_PER_FW) { - drm_err(&i915->drm, - "%s firmware %s: size (%zuKB) exceeds max supported size (%uKB)\n", - intel_uc_fw_type_repr(uc_fw->type), uc_fw->file_selected.path, - fw->size / SZ_1K, INTEL_UC_RSVD_GGTT_PER_FW / SZ_1K); - - /* try to find another blob to load */ - release_firmware(fw); - err = -ENOENT; - } - /* Any error is terminal if overriding. Don't bother searching for older versions */ if (err && intel_uc_fw_is_overridden(uc_fw)) goto fail; @@ -608,7 +622,7 @@ int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw) break; } - err = firmware_request_nowarn(&fw, uc_fw->file_selected.path, dev); + err = try_firmware_load(uc_fw, &fw); } if (err) -- cgit From 99cb0d917ffa1ab628bb67364ca9b162c07699b1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 27 Dec 2022 03:45:37 +0900 Subject: arch: fix broken BuildID for arm64 and riscv Dennis Gilmore reports that the BuildID is missing in the arm64 vmlinux since commit 994b7ac1697b ("arm64: remove special treatment for the link order of head.o"). The issue is that the type of .notes section, which contains the BuildID, changed from NOTES to PROGBITS. Ard Biesheuvel figured out that whichever object gets linked first gets to decide the type of a section. 
The PROGBITS type is the result of the compiler emitting .note.GNU-stack as PROGBITS rather than NOTE. While Ard provided a fix for arm64, I want to fix this globally because the same issue is happening on riscv since commit 2348e6bf4421 ("riscv: remove special treatment for the link order of head.o"). This problem will happen in general for other architectures if they start to drop unneeded entries from scripts/head-object-list.txt. Discard .note.GNU-stack in include/asm-generic/vmlinux.lds.h. Link: https://lore.kernel.org/lkml/CAABkxwuQoz1CTbyb57n0ZX65eSYiTonFCU8-LCQc=74D=xE=rA@mail.gmail.com/ Fixes: 994b7ac1697b ("arm64: remove special treatment for the link order of head.o") Fixes: 2348e6bf4421 ("riscv: remove special treatment for the link order of head.o") Reported-by: Dennis Gilmore Suggested-by: Ard Biesheuvel Signed-off-by: Masahiro Yamada Acked-by: Palmer Dabbelt --- include/asm-generic/vmlinux.lds.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index a94219e9916f..659bf3b31c91 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -891,7 +891,12 @@ #define PRINTK_INDEX #endif +/* + * Discard .note.GNU-stack, which is emitted as PROGBITS by the compiler. + * Otherwise, the type of .notes section would become PROGBITS instead of NOTES. + */ #define NOTES \ + /DISCARD/ : { *(.note.GNU-stack) } \ .notes : AT(ADDR(.notes) - LOAD_OFFSET) { \ BOUNDED_SECTION_BY(.note.*, _notes) \ } NOTES_HEADERS \ -- cgit From 924d28b39e3b62ad5e97751585aed7c89f8c43ee Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 27 Dec 2022 03:54:44 +0900 Subject: .gitignore: ignore *.rpm Previously, *.rpm files were created under $HOME/rpmbuild/, but since commit 8818039f959b ("kbuild: add ability to make source rpm buildable using koji"), srcrpm-pkg creates the source rpm in the kernel tree because it sets '_srcrpmdir'. Signed-off-by: Masahiro Yamada --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3ec73ead6757..20dce5c3b9e0 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,7 @@ *.o.* *.patch *.rmeta +*.rpm *.rsi *.s *.so -- cgit From 9c9b55a59416a87fc73c479d78cb3218076dbc30 Mon Sep 17 00:00:00 2001 From: Jun ASAKA Date: Tue, 27 Dec 2022 17:21:57 +0800 Subject: kbuild: add a missing line for help message The help message line for building the source RPM package was missing. Added it. Signed-off-by: Jun ASAKA Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- scripts/Makefile.package | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/Makefile.package b/scripts/Makefile.package index 539e9f765d64..525a2820976f 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -158,6 +158,7 @@ $(perf-tar-pkgs): PHONY += help help: @echo ' rpm-pkg - Build both source and binary RPM kernel packages' + @echo ' srcrpm-pkg - Build only the source kernel RPM package' @echo ' binrpm-pkg - Build only the binary kernel RPM package' @echo ' deb-pkg - Build both source and binary deb kernel packages' @echo ' bindeb-pkg - Build only the binary kernel deb package' -- cgit From 63ffe00d8c939eda1a8fa87484ca4537e13a20b7 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Tue, 27 Dec 2022 15:48:21 -0600 Subject: kbuild: Fix running modpost with musl libc commit 3d57e1b7b1d4 ("kbuild: refactor the prerequisites of the modpost rule") moved 'vmlinux.o' inside modpost-args, possibly before some of the other options. 
However, getopt() in musl libc follows POSIX and stops looking for options upon reaching the first non-option argument. As a result, the '-T' option is misinterpreted as a positional argument, and the build fails: make -f ./scripts/Makefile.modpost scripts/mod/modpost -E -o Module.symvers vmlinux.o -T modules.order -T: No such file or directory make[1]: *** [scripts/Makefile.modpost:137: Module.symvers] Error 1 make: *** [Makefile:1960: modpost] Error 2 The fix is to move all options before 'vmlinux.o' in modpost-args. Fixes: 3d57e1b7b1d4 ("kbuild: refactor the prerequisites of the modpost rule") Signed-off-by: Samuel Holland Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 5eb5e8280379..0ee296cf520c 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -55,6 +55,17 @@ ifneq ($(findstring i,$(filter-out --%,$(MAKEFLAGS))),) modpost-args += -n endif +ifneq ($(KBUILD_MODPOST_WARN)$(missing-input),) +modpost-args += -w +endif + +# Read out modules.order to pass in modpost. +# Otherwise, allmodconfig would fail with "Argument list too long". +ifdef KBUILD_MODULES +modpost-args += -T $(MODORDER) +modpost-deps += $(MODORDER) +endif + ifeq ($(KBUILD_EXTMOD),) # Generate the list of in-tree objects in vmlinux @@ -113,17 +124,6 @@ modpost-args += -e $(addprefix -i , $(KBUILD_EXTRA_SYMBOLS)) endif # ($(KBUILD_EXTMOD),) -ifneq ($(KBUILD_MODPOST_WARN)$(missing-input),) -modpost-args += -w -endif - -ifdef KBUILD_MODULES -modpost-args += -T $(MODORDER) -modpost-deps += $(MODORDER) -endif - -# Read out modules.order to pass in modpost. -# Otherwise, allmodconfig would fail with "Argument list too long". quiet_cmd_modpost = MODPOST $@ cmd_modpost = \ $(if $(missing-input), \ -- cgit From 02a893bc99757d75b7abb43b74f210dfa3df8c4b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 29 Dec 2022 04:10:14 +0900 Subject: kbuild: rpm-pkg: add libelf-devel as alternative for BuildRequires Guoqing Jiang reports that openSUSE cannot compile the kernel rpm due to "BuildRequires: elfutils-libelf-devel" added by commit 8818039f959b ("kbuild: add ability to make source rpm buildable using koji"). The relevant package name in openSUSE is libelf-devel. Add it as an alternative package. BTW, if it is impossible to solve the build requirement, the final resort would be: $ make RPMOPTS=--nodeps rpm-pkg This passes --nodeps to the rpmbuild command so it will not verify build dependencies. This is useful to test rpm builds on non-rpm system. On Debian/Ubuntu, for example, you can install rpmbuild by 'apt-get install rpm'. NOTE1: Likewise, it is possible to bypass the build dependency check for debian package builds: $ make DPKG_FLAGS=-d deb-pkg NOTE2: The 'or' operator is supported since RPM 4.13. So, old distros such as CentOS 7 will break. I suggest installing newer rpmbuild in such cases. 
Link: https://lore.kernel.org/linux-kbuild/ee227d24-9c94-bfa3-166a-4ee6b5dfea09@linux.dev/T/#u Fixes: 8818039f959b ("kbuild: add ability to make source rpm buildable using koji") Reported-by: Guoqing Jiang Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor Tested-by: Guoqing Jiang Acked-by: Jonathan Toppins --- scripts/package/mkspec | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index dda00a948a01..adab28fa7f89 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -51,7 +51,8 @@ sed -e '/^DEL/d' -e 's/^\t*//' < Date: Thu, 29 Dec 2022 21:16:42 +0900 Subject: kbuild: sort single-targets alphabetically again This was previously alphabetically sorted. Sort it again. Signed-off-by: Masahiro Yamada Reviewed-by: Miguel Ojeda Reviewed-by: Nathan Chancellor --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d4b6af8c09e9..a5133e422f69 100644 --- a/Makefile +++ b/Makefile @@ -297,7 +297,7 @@ no-compiler-targets := $(no-dot-config-targets) install dtbs_install \ headers_install modules_install kernelrelease image_name no-sync-config-targets := $(no-dot-config-targets) %install kernelrelease \ image_name -single-targets := %.a %.i %.rsi %.ko %.lds %.ll %.lst %.mod %.o %.s %.symtypes %/ +single-targets := %.a %.i %.ko %.lds %.ll %.lst %.mod %.o %.rsi %.s %.symtypes %/ config-build := mixed-build := -- cgit From 6a5e25fc3e0b94301734e8abb1d311a1e02d360d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 30 Dec 2022 17:16:42 +0900 Subject: fixdep: remove unneeded inclusion This is unneeded since commit 69304379ff03 ("fixdep: use fflush() and ferror() to ensure successful write to files"). Signed-off-by: Masahiro Yamada --- scripts/basic/fixdep.c | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/basic/fixdep.c b/scripts/basic/fixdep.c index 2328f9a641da..f932aeaba71a 100644 --- a/scripts/basic/fixdep.c +++ b/scripts/basic/fixdep.c @@ -94,7 +94,6 @@ #include #include #include -#include #include #include #include -- cgit From 963bbdb32b47cfa67a449e715e1dcc525fbd01fc Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Mon, 19 Dec 2022 12:59:55 +0200 Subject: drm/i915/dsi: add support for ICL+ native MIPI GPIO sequence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting from ICL, the default for MIPI GPIO sequences seems to be using native GPIOs i.e. GPIOs available in the GPU. These native GPIOs reuse many pins that quite frankly seem scary to poke based on the VBT sequences. We pretty much have to trust that the board is configured such that the relevant HPD, PP_CONTROL and GPIO bits aren't used for anything else. MIPI sequence v4 also adds a flag to fall back to non-native sequences. 
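Reduced to the control flow it adds, the sequence handling described above amounts to: default to the native (on-GPU) pins on ICL and later, unless a v4+ VBT sequence sets its fallback flag. A simplified sketch with the structure fields flattened into plain parameters (the BIT(1) flag is the one referred to above):

#include <linux/bits.h>
#include <linux/types.h>

/* Sketch only: decide native vs. legacy GPIO handling for one
 * VBT MIPI GPIO sequence element. */
static bool use_native_gpio(int display_ver, int seq_version, u8 gpio_flags)
{
	bool native = display_ver >= 11;	/* ICL+ defaults to native pins */

	/* MIPI sequence v4 can explicitly request the non-native path */
	if (seq_version >= 4 && (gpio_flags & BIT(1)))
		native = false;

	return native;
}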
v5: - Wrap SHOTPLUG_CTL_DDI modification in spin_lock() in icp_irq_handler() too (Ville) - References instead of Closes issue 6131 because this does not fix everything v4: - Wrap SHOTPLUG_CTL_DDI modification in spin_lock_irq() (Ville) v3: - Fix -Wbitwise-conditional-parentheses (kernel test robot ) v2: - Fix HPD pin output set (impacts GPIOs 0 and 5) - Fix GPIO data output direction set (impacts GPIOs 4 and 9) - Reduce register accesses to single intel_de_rwm() References: https://gitlab.freedesktop.org/drm/intel/-/issues/6131 Cc: Ville Syrjälä Signed-off-by: Jani Nikula Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20221219105955.4014451-1-jani.nikula@intel.com (cherry picked from commit f087cfe6fcff58044f7aa3b284965af47f472fb0) Cc: stable@vger.kernel.org # 6.1 Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dsi_vbt.c | 94 +++++++++++++++++++++++++++- drivers/gpu/drm/i915/i915_irq.c | 3 + drivers/gpu/drm/i915/i915_reg.h | 1 + 3 files changed, 95 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c index fce69fa446d5..41f025f089d9 100644 --- a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c +++ b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c @@ -41,9 +41,11 @@ #include "i915_drv.h" #include "i915_reg.h" +#include "intel_de.h" #include "intel_display_types.h" #include "intel_dsi.h" #include "intel_dsi_vbt.h" +#include "intel_gmbus_regs.h" #include "vlv_dsi.h" #include "vlv_dsi_regs.h" #include "vlv_sideband.h" @@ -377,6 +379,85 @@ static void icl_exec_gpio(struct intel_connector *connector, drm_dbg_kms(&dev_priv->drm, "Skipping ICL GPIO element execution\n"); } +enum { + MIPI_RESET_1 = 0, + MIPI_AVDD_EN_1, + MIPI_BKLT_EN_1, + MIPI_AVEE_EN_1, + MIPI_VIO_EN_1, + MIPI_RESET_2, + MIPI_AVDD_EN_2, + MIPI_BKLT_EN_2, + MIPI_AVEE_EN_2, + MIPI_VIO_EN_2, +}; + +static void icl_native_gpio_set_value(struct drm_i915_private *dev_priv, + int gpio, bool value) +{ + int index; + + if (drm_WARN_ON(&dev_priv->drm, DISPLAY_VER(dev_priv) == 11 && gpio >= MIPI_RESET_2)) + return; + + switch (gpio) { + case MIPI_RESET_1: + case MIPI_RESET_2: + index = gpio == MIPI_RESET_1 ? HPD_PORT_A : HPD_PORT_B; + + /* + * Disable HPD to set the pin to output, and set output + * value. The HPD pin should not be enabled for DSI anyway, + * assuming the board design and VBT are sane, and the pin isn't + * used by a non-DSI encoder. + * + * The locking protects against concurrent SHOTPLUG_CTL_DDI + * modifications in irq setup and handling. + */ + spin_lock_irq(&dev_priv->irq_lock); + intel_de_rmw(dev_priv, SHOTPLUG_CTL_DDI, + SHOTPLUG_CTL_DDI_HPD_ENABLE(index) | + SHOTPLUG_CTL_DDI_HPD_OUTPUT_DATA(index), + value ? SHOTPLUG_CTL_DDI_HPD_OUTPUT_DATA(index) : 0); + spin_unlock_irq(&dev_priv->irq_lock); + break; + case MIPI_AVDD_EN_1: + case MIPI_AVDD_EN_2: + index = gpio == MIPI_AVDD_EN_1 ? 0 : 1; + + intel_de_rmw(dev_priv, PP_CONTROL(index), PANEL_POWER_ON, + value ? PANEL_POWER_ON : 0); + break; + case MIPI_BKLT_EN_1: + case MIPI_BKLT_EN_2: + index = gpio == MIPI_AVDD_EN_1 ? 0 : 1; + + intel_de_rmw(dev_priv, PP_CONTROL(index), EDP_BLC_ENABLE, + value ? EDP_BLC_ENABLE : 0); + break; + case MIPI_AVEE_EN_1: + case MIPI_AVEE_EN_2: + index = gpio == MIPI_AVEE_EN_1 ? 1 : 2; + + intel_de_rmw(dev_priv, GPIO(dev_priv, index), + GPIO_CLOCK_VAL_OUT, + GPIO_CLOCK_DIR_MASK | GPIO_CLOCK_DIR_OUT | + GPIO_CLOCK_VAL_MASK | (value ? 
GPIO_CLOCK_VAL_OUT : 0)); + break; + case MIPI_VIO_EN_1: + case MIPI_VIO_EN_2: + index = gpio == MIPI_VIO_EN_1 ? 1 : 2; + + intel_de_rmw(dev_priv, GPIO(dev_priv, index), + GPIO_DATA_VAL_OUT, + GPIO_DATA_DIR_MASK | GPIO_DATA_DIR_OUT | + GPIO_DATA_VAL_MASK | (value ? GPIO_DATA_VAL_OUT : 0)); + break; + default: + MISSING_CASE(gpio); + } +} + static const u8 *mipi_exec_gpio(struct intel_dsi *intel_dsi, const u8 *data) { struct drm_device *dev = intel_dsi->base.base.dev; @@ -384,8 +465,7 @@ static const u8 *mipi_exec_gpio(struct intel_dsi *intel_dsi, const u8 *data) struct intel_connector *connector = intel_dsi->attached_connector; u8 gpio_source, gpio_index = 0, gpio_number; bool value; - - drm_dbg_kms(&dev_priv->drm, "\n"); + bool native = DISPLAY_VER(dev_priv) >= 11; if (connector->panel.vbt.dsi.seq_version >= 3) gpio_index = *data++; @@ -398,10 +478,18 @@ static const u8 *mipi_exec_gpio(struct intel_dsi *intel_dsi, const u8 *data) else gpio_source = 0; + if (connector->panel.vbt.dsi.seq_version >= 4 && *data & BIT(1)) + native = false; + /* pull up/down */ value = *data++ & 1; - if (DISPLAY_VER(dev_priv) >= 11) + drm_dbg_kms(&dev_priv->drm, "GPIO index %u, number %u, source %u, native %s, set to %s\n", + gpio_index, gpio_number, gpio_source, str_yes_no(native), str_on_off(value)); + + if (native) + icl_native_gpio_set_value(dev_priv, gpio_number, value); + else if (DISPLAY_VER(dev_priv) >= 11) icl_exec_gpio(connector, gpio_source, gpio_index, value); else if (IS_VALLEYVIEW(dev_priv)) vlv_exec_gpio(connector, gpio_source, gpio_number, value); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index edfe363af838..91c533986041 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1974,7 +1974,10 @@ static void icp_irq_handler(struct drm_i915_private *dev_priv, u32 pch_iir) if (ddi_hotplug_trigger) { u32 dig_hotplug_reg; + /* Locking due to DSI native GPIO sequences */ + spin_lock(&dev_priv->irq_lock); dig_hotplug_reg = intel_uncore_rmw(&dev_priv->uncore, SHOTPLUG_CTL_DDI, 0, 0); + spin_unlock(&dev_priv->irq_lock); intel_get_hpd_pins(dev_priv, &pin_mask, &long_mask, ddi_hotplug_trigger, dig_hotplug_reg, diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 8e1892d14774..916176872544 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -5988,6 +5988,7 @@ #define SHOTPLUG_CTL_DDI _MMIO(0xc4030) #define SHOTPLUG_CTL_DDI_HPD_ENABLE(hpd_pin) (0x8 << (_HPD_PIN_DDI(hpd_pin) * 4)) +#define SHOTPLUG_CTL_DDI_HPD_OUTPUT_DATA(hpd_pin) (0x4 << (_HPD_PIN_DDI(hpd_pin) * 4)) #define SHOTPLUG_CTL_DDI_HPD_STATUS_MASK(hpd_pin) (0x3 << (_HPD_PIN_DDI(hpd_pin) * 4)) #define SHOTPLUG_CTL_DDI_HPD_NO_DETECT(hpd_pin) (0x0 << (_HPD_PIN_DDI(hpd_pin) * 4)) #define SHOTPLUG_CTL_DDI_HPD_SHORT_DETECT(hpd_pin) (0x1 << (_HPD_PIN_DDI(hpd_pin) * 4)) -- cgit From 6217e9f05a74df48c77ee68993d587cdfdb1feb7 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 20 Dec 2022 16:01:05 +0200 Subject: drm/i915/dsi: fix MIPI_BKLT_EN_1 native GPIO index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Due to copy-paste fail, MIPI_BKLT_EN_1 would always use PPS index 1, never 0. Fix the sloppiest commit in recent memory. 
Fixes: 963bbdb32b47 ("drm/i915/dsi: add support for ICL+ native MIPI GPIO sequence") Reported-by: Ville Syrjälä Signed-off-by: Jani Nikula Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20221220140105.313333-1-jani.nikula@intel.com (cherry picked from commit a561933c571798868b5fa42198427a7e6df56c09) Cc: stable@vger.kernel.org # 6.1 Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dsi_vbt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c index 41f025f089d9..2cbc1292ab38 100644 --- a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c +++ b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c @@ -430,7 +430,7 @@ static void icl_native_gpio_set_value(struct drm_i915_private *dev_priv, break; case MIPI_BKLT_EN_1: case MIPI_BKLT_EN_2: - index = gpio == MIPI_AVDD_EN_1 ? 0 : 1; + index = gpio == MIPI_BKLT_EN_1 ? 0 : 1; intel_de_rmw(dev_priv, PP_CONTROL(index), EDP_BLC_ENABLE, value ? EDP_BLC_ENABLE : 0); -- cgit From a23529989a8f56d23680c4f2d14011bc9c9457c9 Mon Sep 17 00:00:00 2001 From: Nikolaus Voss Date: Tue, 20 Dec 2022 09:17:50 +0100 Subject: crypto: caam - fix CAAM io mem access in blob_gen IO memory access has to be done with accessors defined in caam/regs.h as there are little-endian architectures with a big-endian CAAM unit. Fixes: 6a83830f649a ("crypto: caam - warn if blob_gen key is insecure") Signed-off-by: Nikolaus Voss Reviewed-by: Ahmad Fatoum Signed-off-by: Herbert Xu --- drivers/crypto/caam/blob_gen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/caam/blob_gen.c b/drivers/crypto/caam/blob_gen.c index 1f65df489847..f46b161d2cda 100644 --- a/drivers/crypto/caam/blob_gen.c +++ b/drivers/crypto/caam/blob_gen.c @@ -104,7 +104,7 @@ int caam_process_blob(struct caam_blob_priv *priv, } ctrlpriv = dev_get_drvdata(jrdev->parent); - moo = FIELD_GET(CSTA_MOO, ioread32(&ctrlpriv->ctrl->perfmon.status)); + moo = FIELD_GET(CSTA_MOO, rd_reg32(&ctrlpriv->ctrl->perfmon.status)); if (moo != CSTA_MOO_SECURE && moo != CSTA_MOO_TRUSTED) dev_warn(jrdev, "using insecure test key, enable HAB to use unique device key!\n"); -- cgit From 736f88689c6912f05d0116917910603a7ba97de7 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Wed, 21 Dec 2022 15:32:32 +0800 Subject: crypto: arm64/sm4 - fix possible crash with CFI enabled The SM4 CCM/GCM assembly functions for encryption and decryption is called via indirect function calls. Therefore they need to use SYM_TYPED_FUNC_START instead of SYM_FUNC_START to cause its type hash to be emitted when the kernel is built with CONFIG_CFI_CLANG=y. Otherwise, the code crashes with a CFI failure (if the compiler didn't happen to optimize out the indirect call). 
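For background on why the annotation matters: with CONFIG_CFI_CLANG the compiler checks, at every indirect call site, a type hash attached to the callee, so any function reached through a function pointer must have that hash emitted; SYM_TYPED_FUNC_START is what provides it for assembly routines. A rough C-level sketch of the calling pattern (types and names here are illustrative, not the real SM4 prototypes):

#include <linux/types.h>

/* The mode glue stores the cipher routines behind function pointers,
 * which become indirect calls and are therefore CFI-checked. */
typedef void (*sm4_crypt_fn)(const u32 *rkey, u8 *dst, const u8 *src,
			     u8 *iv, unsigned int nbytes);

struct sm4_mode_ops {
	sm4_crypt_fn encrypt;
	sm4_crypt_fn decrypt;
};

static void sm4_mode_crypt(const struct sm4_mode_ops *ops, bool enc,
			   const u32 *rkey, u8 *dst, const u8 *src,
			   u8 *iv, unsigned int nbytes)
{
	/* Under kCFI this call verifies the callee's type hash; an asm
	 * implementation declared with plain SYM_FUNC_START carries no
	 * hash, so the check fails at runtime. */
	if (enc)
		ops->encrypt(rkey, dst, src, iv, nbytes);
	else
		ops->decrypt(rkey, dst, src, iv, nbytes);
}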
Fixes: 67fa3a7fdf80 ("crypto: arm64/sm4 - add CE implementation for CCM mode") Fixes: ae1b83c7d572 ("crypto: arm64/sm4 - add CE implementation for GCM mode") Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu --- arch/arm64/crypto/sm4-ce-ccm-core.S | 5 +++-- arch/arm64/crypto/sm4-ce-gcm-core.S | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/crypto/sm4-ce-ccm-core.S b/arch/arm64/crypto/sm4-ce-ccm-core.S index 028207c4afd0..fa85856f33ce 100644 --- a/arch/arm64/crypto/sm4-ce-ccm-core.S +++ b/arch/arm64/crypto/sm4-ce-ccm-core.S @@ -8,6 +8,7 @@ */ #include +#include #include #include "sm4-ce-asm.h" @@ -104,7 +105,7 @@ SYM_FUNC_START(sm4_ce_ccm_final) SYM_FUNC_END(sm4_ce_ccm_final) .align 3 -SYM_FUNC_START(sm4_ce_ccm_enc) +SYM_TYPED_FUNC_START(sm4_ce_ccm_enc) /* input: * x0: round key array, CTX * x1: dst @@ -216,7 +217,7 @@ SYM_FUNC_START(sm4_ce_ccm_enc) SYM_FUNC_END(sm4_ce_ccm_enc) .align 3 -SYM_FUNC_START(sm4_ce_ccm_dec) +SYM_TYPED_FUNC_START(sm4_ce_ccm_dec) /* input: * x0: round key array, CTX * x1: dst diff --git a/arch/arm64/crypto/sm4-ce-gcm-core.S b/arch/arm64/crypto/sm4-ce-gcm-core.S index 7aa3ec18a289..347f25d75727 100644 --- a/arch/arm64/crypto/sm4-ce-gcm-core.S +++ b/arch/arm64/crypto/sm4-ce-gcm-core.S @@ -9,6 +9,7 @@ */ #include +#include #include #include "sm4-ce-asm.h" @@ -370,7 +371,7 @@ SYM_FUNC_START(pmull_ghash_update) SYM_FUNC_END(pmull_ghash_update) .align 3 -SYM_FUNC_START(sm4_ce_pmull_gcm_enc) +SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc) /* input: * x0: round key array, CTX * x1: dst @@ -581,7 +582,7 @@ SYM_FUNC_END(sm4_ce_pmull_gcm_enc) #define RH3 v20 .align 3 -SYM_FUNC_START(sm4_ce_pmull_gcm_dec) +SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec) /* input: * x0: round key array, CTX * x1: dst -- cgit From ba2dc1cb5491712a6946d0595cf11ba463f50e64 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 29 Dec 2022 17:45:01 +0100 Subject: gpiolib: Fix using uninitialized lookup-flags on ACPI platforms Commit 8eb1f71e7acc ("gpiolib: consolidate GPIO lookups") refactors fwnode_get_named_gpiod() and gpiod_get_index() into a unified gpiod_find_and_request() helper. The old functions both initialized their local lookupflags variable to GPIO_LOOKUP_FLAGS_DEFAULT, but the new code leaves it uninitialized. This is a problem for at least ACPI platforms, where acpi_find_gpio() only does a bunch of *lookupflags |= GPIO_* statements and thus relies on the variable being initialized. The variable not being initialized leads to: 1. Potentially the wrong flags getting used 2. The check for conflicting lookup flags in gpiod_configure_flags(): "multiple pull-up, pull-down or pull-disable enabled, invalid config" sometimes triggering, making the GPIO unavailable Restore the initialization of lookupflags to GPIO_LOOKUP_FLAGS_DEFAULT to fix this. 
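The failure mode above is the usual hazard of "accumulate with |=" helpers: the callee only ever ORs bits into the caller's variable, so it silently depends on the caller starting from a defined value. A minimal sketch of the pattern and the fix (names are made up, not the gpiolib API):

#include <linux/bits.h>

#define MY_FLAG_PULL_UP		BIT(0)
#define MY_FLAG_PULL_DOWN	BIT(1)
#define MY_FLAGS_DEFAULT	0UL

/* Only ever ORs bits in - never assigns - so it relies on the caller. */
static void my_lookup_flags(unsigned long *flags, bool pull_up)
{
	if (pull_up)
		*flags |= MY_FLAG_PULL_UP;
}

static int my_request(bool pull_up)
{
	/* Must start from a defined value: an uninitialized local here can
	 * both apply bogus flags and trip "conflicting flags" sanity checks. */
	unsigned long flags = MY_FLAGS_DEFAULT;

	my_lookup_flags(&flags, pull_up);
	return 0;
}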
Fixes: 8eb1f71e7acc ("gpiolib: consolidate GPIO lookups") Signed-off-by: Hans de Goede Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 5a66d9616d7c..939c776b9488 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3905,8 +3905,8 @@ static struct gpio_desc *gpiod_find_and_request(struct device *consumer, const char *label, bool platform_lookup_allowed) { + unsigned long lookupflags = GPIO_LOOKUP_FLAGS_DEFAULT; struct gpio_desc *desc = ERR_PTR(-ENOENT); - unsigned long lookupflags; int ret; if (!IS_ERR_OR_NULL(fwnode)) -- cgit From 90fee3dd5bfc1b9f4c8c0ba6cd2a35c9d79ca4de Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Sun, 11 Dec 2022 00:05:58 +0200 Subject: gpio: pca953x: avoid to use uninitialized value pinctrl There is a variable pinctrl declared without initializer. And then has the case (switch operation chose the default case) to directly use this uninitialized value, this is not a safe behavior. So here initialize the pinctrl as 0 to avoid this issue. This is reported by Coverity. Fixes: 13c5d4ce8060 ("gpio: pca953x: Add support for PCAL6534") Signed-off-by: Haibo Chen Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pca953x.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index a59d61cd44b2..5299e5bb76d6 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -474,6 +474,9 @@ static u8 pcal6534_recalc_addr(struct pca953x_chip *chip, int reg, int off) case PCAL6524_DEBOUNCE: pinctrl = ((reg & PCAL_PINCTRL_MASK) >> 1) + 0x1c; break; + default: + pinctrl = 0; + break; } return pinctrl + addr + (off / BANK_SZ); -- cgit From 2788938b794633fc1865c805764bed196e01f97e Mon Sep 17 00:00:00 2001 From: Cixi Geng Date: Thu, 29 Dec 2022 22:55:43 +0800 Subject: gpio: eic-sprd: Make the irqchip immutable Remove the irq_chip from pmic_eic structure, use the various calls by defining the statically irq_chip structure. 
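The conversion follows the usual immutable-irqchip recipe: a file-scope const struct irq_chip flagged IRQCHIP_IMMUTABLE plus GPIOCHIP_IRQ_RESOURCE_HELPERS, mask/unmask paired with gpiochip_disable_irq()/gpiochip_enable_irq(), and gpio_irq_chip_set_chip() instead of assigning irq->chip directly. A condensed, driver-agnostic sketch of that shape (hardware register writes elided, names hypothetical):

#include <linux/gpio/driver.h>
#include <linux/irq.h>

static void my_gpio_irq_mask(struct irq_data *d)
{
	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);

	/* ... mask the interrupt in hardware ... */
	gpiochip_disable_irq(gc, irqd_to_hwirq(d));
}

static void my_gpio_irq_unmask(struct irq_data *d)
{
	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);

	gpiochip_enable_irq(gc, irqd_to_hwirq(d));
	/* ... unmask the interrupt in hardware ... */
}

static const struct irq_chip my_gpio_irqchip = {
	.name		= "my-gpio",
	.irq_mask	= my_gpio_irq_mask,
	.irq_unmask	= my_gpio_irq_unmask,
	.flags		= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_IMMUTABLE,
	GPIOCHIP_IRQ_RESOURCE_HELPERS,
};

/* In probe, instead of pointing girq->chip at a chip embedded in
 * driver data:
 *	gpio_irq_chip_set_chip(girq, &my_gpio_irqchip);
 */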
Signed-off-by: Cixi Geng Reviewed-by: Baolin Wang Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-eic-sprd.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/gpio/gpio-eic-sprd.c b/drivers/gpio/gpio-eic-sprd.c index 8d722e026e9c..84352a6f4973 100644 --- a/drivers/gpio/gpio-eic-sprd.c +++ b/drivers/gpio/gpio-eic-sprd.c @@ -91,7 +91,6 @@ enum sprd_eic_type { struct sprd_eic { struct gpio_chip chip; - struct irq_chip intc; void __iomem *base[SPRD_EIC_MAX_BANK]; enum sprd_eic_type type; spinlock_t lock; @@ -255,6 +254,8 @@ static void sprd_eic_irq_mask(struct irq_data *data) default: dev_err(chip->parent, "Unsupported EIC type.\n"); } + + gpiochip_disable_irq(chip, offset); } static void sprd_eic_irq_unmask(struct irq_data *data) @@ -263,6 +264,8 @@ static void sprd_eic_irq_unmask(struct irq_data *data) struct sprd_eic *sprd_eic = gpiochip_get_data(chip); u32 offset = irqd_to_hwirq(data); + gpiochip_enable_irq(chip, offset); + switch (sprd_eic->type) { case SPRD_EIC_DEBOUNCE: sprd_eic_update(chip, offset, SPRD_EIC_DBNC_IE, 1); @@ -564,6 +567,15 @@ static void sprd_eic_irq_handler(struct irq_desc *desc) chained_irq_exit(ic, desc); } +static const struct irq_chip sprd_eic_irq = { + .name = "sprd-eic", + .irq_ack = sprd_eic_irq_ack, + .irq_mask = sprd_eic_irq_mask, + .irq_unmask = sprd_eic_irq_unmask, + .irq_set_type = sprd_eic_irq_set_type, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; static int sprd_eic_probe(struct platform_device *pdev) { const struct sprd_eic_variant_data *pdata; @@ -626,15 +638,8 @@ static int sprd_eic_probe(struct platform_device *pdev) break; } - sprd_eic->intc.name = dev_name(&pdev->dev); - sprd_eic->intc.irq_ack = sprd_eic_irq_ack; - sprd_eic->intc.irq_mask = sprd_eic_irq_mask; - sprd_eic->intc.irq_unmask = sprd_eic_irq_unmask; - sprd_eic->intc.irq_set_type = sprd_eic_irq_set_type; - sprd_eic->intc.flags = IRQCHIP_SKIP_SET_WAKE; - irq = &sprd_eic->chip.irq; - irq->chip = &sprd_eic->intc; + gpio_irq_chip_set_chip(irq, &sprd_eic_irq); irq->handler = handle_bad_irq; irq->default_type = IRQ_TYPE_NONE; irq->parent_handler = sprd_eic_irq_handler; -- cgit From be43eea7de5a3977ac3d13fbfb9e505fab475e97 Mon Sep 17 00:00:00 2001 From: Cixi Geng Date: Thu, 29 Dec 2022 22:55:44 +0800 Subject: gpio: pmic-eic-sprd: Make the irqchip immutable Remove the irq_chip from pmic_eic structure, use the various calls by defining the statically irq_chip structure. Signed-off-by: Cixi Geng Reviewed-by: Baolin Wang Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pmic-eic-sprd.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/gpio/gpio-pmic-eic-sprd.c b/drivers/gpio/gpio-pmic-eic-sprd.c index e518490c4b68..c3e4d90f6b18 100644 --- a/drivers/gpio/gpio-pmic-eic-sprd.c +++ b/drivers/gpio/gpio-pmic-eic-sprd.c @@ -47,7 +47,6 @@ enum { /** * struct sprd_pmic_eic - PMIC EIC controller * @chip: the gpio_chip structure. - * @intc: the irq_chip structure. * @map: the regmap from the parent device. * @offset: the EIC controller's offset address of the PMIC. * @reg: the array to cache the EIC registers. 
@@ -56,7 +55,6 @@ enum { */ struct sprd_pmic_eic { struct gpio_chip chip; - struct irq_chip intc; struct regmap *map; u32 offset; u8 reg[CACHE_NR_REGS]; @@ -151,15 +149,21 @@ static void sprd_pmic_eic_irq_mask(struct irq_data *data) { struct gpio_chip *chip = irq_data_get_irq_chip_data(data); struct sprd_pmic_eic *pmic_eic = gpiochip_get_data(chip); + u32 offset = irqd_to_hwirq(data); pmic_eic->reg[REG_IE] = 0; pmic_eic->reg[REG_TRIG] = 0; + + gpiochip_disable_irq(chip, offset); } static void sprd_pmic_eic_irq_unmask(struct irq_data *data) { struct gpio_chip *chip = irq_data_get_irq_chip_data(data); struct sprd_pmic_eic *pmic_eic = gpiochip_get_data(chip); + u32 offset = irqd_to_hwirq(data); + + gpiochip_enable_irq(chip, offset); pmic_eic->reg[REG_IE] = 1; pmic_eic->reg[REG_TRIG] = 1; @@ -292,6 +296,17 @@ static irqreturn_t sprd_pmic_eic_irq_handler(int irq, void *data) return IRQ_HANDLED; } +static const struct irq_chip pmic_eic_irq_chip = { + .name = "sprd-pmic-eic", + .irq_mask = sprd_pmic_eic_irq_mask, + .irq_unmask = sprd_pmic_eic_irq_unmask, + .irq_set_type = sprd_pmic_eic_irq_set_type, + .irq_bus_lock = sprd_pmic_eic_bus_lock, + .irq_bus_sync_unlock = sprd_pmic_eic_bus_sync_unlock, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + static int sprd_pmic_eic_probe(struct platform_device *pdev) { struct gpio_irq_chip *irq; @@ -338,16 +353,8 @@ static int sprd_pmic_eic_probe(struct platform_device *pdev) pmic_eic->chip.set = sprd_pmic_eic_set; pmic_eic->chip.get = sprd_pmic_eic_get; - pmic_eic->intc.name = dev_name(&pdev->dev); - pmic_eic->intc.irq_mask = sprd_pmic_eic_irq_mask; - pmic_eic->intc.irq_unmask = sprd_pmic_eic_irq_unmask; - pmic_eic->intc.irq_set_type = sprd_pmic_eic_irq_set_type; - pmic_eic->intc.irq_bus_lock = sprd_pmic_eic_bus_lock; - pmic_eic->intc.irq_bus_sync_unlock = sprd_pmic_eic_bus_sync_unlock; - pmic_eic->intc.flags = IRQCHIP_SKIP_SET_WAKE; - irq = &pmic_eic->chip.irq; - irq->chip = &pmic_eic->intc; + gpio_irq_chip_set_chip(irq, &pmic_eic_irq_chip); irq->threaded = true; ret = devm_gpiochip_add_data(&pdev->dev, &pmic_eic->chip, pmic_eic); -- cgit From 9883ddf9d68db5332f08dfc7283db69f69f8d6d2 Mon Sep 17 00:00:00 2001 From: Cixi Geng Date: Thu, 29 Dec 2022 22:55:45 +0800 Subject: gpio: sprd: Make the irqchip immutable Make the struct irq_chip const, flag it as IRQCHIP_IMMUTABLE, add the new helper functions, and call the appropriate gpiolib functions. 
Signed-off-by: Cixi Geng Reported-by: kernel test robot Reported-by: Julia Lawall Reviewed-by: Baolin Wang Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sprd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-sprd.c b/drivers/gpio/gpio-sprd.c index 9bff63990eee..072b4e653216 100644 --- a/drivers/gpio/gpio-sprd.c +++ b/drivers/gpio/gpio-sprd.c @@ -120,6 +120,7 @@ static void sprd_gpio_irq_mask(struct irq_data *data) u32 offset = irqd_to_hwirq(data); sprd_gpio_update(chip, offset, SPRD_GPIO_IE, 0); + gpiochip_disable_irq(chip, offset); } static void sprd_gpio_irq_ack(struct irq_data *data) @@ -136,6 +137,7 @@ static void sprd_gpio_irq_unmask(struct irq_data *data) u32 offset = irqd_to_hwirq(data); sprd_gpio_update(chip, offset, SPRD_GPIO_IE, 1); + gpiochip_enable_irq(chip, offset); } static int sprd_gpio_irq_set_type(struct irq_data *data, @@ -205,13 +207,14 @@ static void sprd_gpio_irq_handler(struct irq_desc *desc) chained_irq_exit(ic, desc); } -static struct irq_chip sprd_gpio_irqchip = { +static const struct irq_chip sprd_gpio_irqchip = { .name = "sprd-gpio", .irq_ack = sprd_gpio_irq_ack, .irq_mask = sprd_gpio_irq_mask, .irq_unmask = sprd_gpio_irq_unmask, .irq_set_type = sprd_gpio_irq_set_type, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, }; static int sprd_gpio_probe(struct platform_device *pdev) @@ -245,7 +248,7 @@ static int sprd_gpio_probe(struct platform_device *pdev) sprd_gpio->chip.direction_output = sprd_gpio_direction_output; irq = &sprd_gpio->chip.irq; - irq->chip = &sprd_gpio_irqchip; + gpio_irq_chip_set_chip(irq, &sprd_gpio_irqchip); irq->handler = handle_bad_irq; irq->default_type = IRQ_TYPE_NONE; irq->parent_handler = sprd_gpio_irq_handler; -- cgit From b878d3ba9bb41cddb73ba4b56e5552f0a638daca Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 27 Dec 2022 16:10:05 -0800 Subject: thermal: int340x: Add missing attribute for data rate base Commit 473be51142ad ("thermal: int340x: processor_thermal: Add RFIM driver")' added rfi_restriction_data_rate_base string, mmio details and documentation, but missed adding attribute to sysfs. Add missing sysfs attribute. Fixes: 473be51142ad ("thermal: int340x: processor_thermal: Add RFIM driver") Cc: 5.11+ # v5.11+ Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. 
Wysocki --- drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c index 8c42e7662033..92ed1213fe37 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c @@ -172,6 +172,7 @@ static const struct attribute_group fivr_attribute_group = { RFIM_SHOW(rfi_restriction_run_busy, 1) RFIM_SHOW(rfi_restriction_err_code, 1) RFIM_SHOW(rfi_restriction_data_rate, 1) +RFIM_SHOW(rfi_restriction_data_rate_base, 1) RFIM_SHOW(ddr_data_rate_point_0, 1) RFIM_SHOW(ddr_data_rate_point_1, 1) RFIM_SHOW(ddr_data_rate_point_2, 1) @@ -181,11 +182,13 @@ RFIM_SHOW(rfi_disable, 1) RFIM_STORE(rfi_restriction_run_busy, 1) RFIM_STORE(rfi_restriction_err_code, 1) RFIM_STORE(rfi_restriction_data_rate, 1) +RFIM_STORE(rfi_restriction_data_rate_base, 1) RFIM_STORE(rfi_disable, 1) static DEVICE_ATTR_RW(rfi_restriction_run_busy); static DEVICE_ATTR_RW(rfi_restriction_err_code); static DEVICE_ATTR_RW(rfi_restriction_data_rate); +static DEVICE_ATTR_RW(rfi_restriction_data_rate_base); static DEVICE_ATTR_RO(ddr_data_rate_point_0); static DEVICE_ATTR_RO(ddr_data_rate_point_1); static DEVICE_ATTR_RO(ddr_data_rate_point_2); @@ -248,6 +251,7 @@ static struct attribute *dvfs_attrs[] = { &dev_attr_rfi_restriction_run_busy.attr, &dev_attr_rfi_restriction_err_code.attr, &dev_attr_rfi_restriction_data_rate.attr, + &dev_attr_rfi_restriction_data_rate_base.attr, &dev_attr_ddr_data_rate_point_0.attr, &dev_attr_ddr_data_rate_point_1.attr, &dev_attr_ddr_data_rate_point_2.attr, -- cgit From 38b50aa44495d5eb4218f0b82fc2da76505cec53 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 28 Dec 2022 14:56:09 +0200 Subject: RDMA/mlx5: Fix mlx5_ib_get_hw_stats when used for device Currently, when mlx5_ib_get_hw_stats() is used for device (port_num = 0), there is a special handling in order to use the correct counters, but, port_num is being passed down the stack without any change. Also, some functions assume that port_num >=1. As a result, the following oops can occur. BUG: unable to handle page fault for address: ffff89510294f1a8 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP CPU: 8 PID: 1382 Comm: devlink Tainted: G W 6.1.0-rc4_for_upstream_base_2022_11_10_16_12 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:_raw_spin_lock+0xc/0x20 Call Trace: mlx5_ib_get_native_port_mdev+0x73/0xe0 [mlx5_ib] do_get_hw_stats.constprop.0+0x109/0x160 [mlx5_ib] mlx5_ib_get_hw_stats+0xad/0x180 [mlx5_ib] ib_setup_device_attrs+0xf0/0x290 [ib_core] ib_register_device+0x3bb/0x510 [ib_core] ? atomic_notifier_chain_register+0x67/0x80 __mlx5_ib_add+0x2b/0x80 [mlx5_ib] mlx5r_probe+0xb8/0x150 [mlx5_ib] ? auxiliary_match_id+0x6a/0x90 auxiliary_bus_probe+0x3c/0x70 ? driver_sysfs_add+0x6b/0x90 really_probe+0xcd/0x380 __driver_probe_device+0x80/0x170 driver_probe_device+0x1e/0x90 __device_attach_driver+0x7d/0x100 ? driver_allows_async_probing+0x60/0x60 ? driver_allows_async_probing+0x60/0x60 bus_for_each_drv+0x7b/0xc0 __device_attach+0xbc/0x200 bus_probe_device+0x87/0xa0 device_add+0x404/0x940 ? 
dev_set_name+0x53/0x70 __auxiliary_device_add+0x43/0x60 add_adev+0x99/0xe0 [mlx5_core] mlx5_attach_device+0xc8/0x120 [mlx5_core] mlx5_load_one_devl_locked+0xb2/0xe0 [mlx5_core] devlink_reload+0x133/0x250 devlink_nl_cmd_reload+0x480/0x570 ? devlink_nl_pre_doit+0x44/0x2b0 genl_family_rcv_msg_doit.isra.0+0xc2/0x110 genl_rcv_msg+0x180/0x2b0 ? devlink_nl_cmd_region_read_dumpit+0x540/0x540 ? devlink_reload+0x250/0x250 ? devlink_put+0x50/0x50 ? genl_family_rcv_msg_doit.isra.0+0x110/0x110 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1f6/0x2c0 netlink_sendmsg+0x237/0x490 sock_sendmsg+0x33/0x40 __sys_sendto+0x103/0x160 ? handle_mm_fault+0x10e/0x290 ? do_user_addr_fault+0x1c0/0x5f0 __x64_sys_sendto+0x25/0x30 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Fix it by setting port_num to 1 in order to get device status and remove unused variable. Fixes: aac4492ef23a ("IB/mlx5: Update counter implementation for dual port RoCE") Link: https://lore.kernel.org/r/98b82994c3cd3fa593b8a75ed3f3901e208beb0f.1672231736.git.leonro@nvidia.com Signed-off-by: Shay Drory Reviewed-by: Patrisious Haddad Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 945758f39523..3e1272695d99 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -278,7 +278,6 @@ static int do_get_hw_stats(struct ib_device *ibdev, const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1); struct mlx5_core_dev *mdev; int ret, num_counters; - u32 mdev_port_num; if (!stats) return -EINVAL; @@ -299,8 +298,9 @@ static int do_get_hw_stats(struct ib_device *ibdev, } if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { - mdev = mlx5_ib_get_native_port_mdev(dev, port_num, - &mdev_port_num); + if (!port_num) + port_num = 1; + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, NULL); if (!mdev) { /* If port is not affiliated yet, its in down state * which doesn't have any counters yet, so it would be -- cgit From 8de8482fe5732fbef4f5af82bc0c0362c804cd1f Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 28 Dec 2022 14:56:10 +0200 Subject: RDMA/mlx5: Fix validation of max_rd_atomic caps for DC Currently, when modifying DC, we validate max_rd_atomic user attribute against the RC cap, validate against DC. RC and DC QP types have different device limitations. This can cause userspace created DC QPs to malfunction. 
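In essence, the fix that follows factors the check into a helper that picks the DC capabilities when the QP is a DCI. A simplified sketch of the idea (hypothetical helper name, not the exact driver code):

/*
 * Simplified sketch: the device caps are log2 values, so e.g. a
 * log_max_ra_res_dc of 4 allows up to 16 outstanding RDMA reads/atomics.
 */
static bool rd_atomic_within_caps(struct mlx5_ib_dev *dev,
				  enum ib_qp_type qp_type, u8 max_rd_atomic)
{
	int log_cap = qp_type == MLX5_IB_QPT_DCI ?
		      MLX5_CAP_GEN(dev->mdev, log_max_ra_res_dc) :
		      MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp);

	return max_rd_atomic <= (1 << log_cap);
}

/* max_dest_rd_atomic is validated the same way against log_max_ra_req_dc/_qp. */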
Fixes: c32a4f296e1d ("IB/mlx5: Add support for DC Initiator QP") Link: https://lore.kernel.org/r/0c5aee72cea188c3bb770f4207cce7abc9b6fc74.1672231736.git.leonro@nvidia.com Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/qp.c | 49 +++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 40d9410ec303..cf953d23d18d 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4502,6 +4502,40 @@ static bool mlx5_ib_modify_qp_allowed(struct mlx5_ib_dev *dev, return false; } +static int validate_rd_atomic(struct mlx5_ib_dev *dev, struct ib_qp_attr *attr, + int attr_mask, enum ib_qp_type qp_type) +{ + int log_max_ra_res; + int log_max_ra_req; + + if (qp_type == MLX5_IB_QPT_DCI) { + log_max_ra_res = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_res_dc); + log_max_ra_req = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_req_dc); + } else { + log_max_ra_res = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_res_qp); + log_max_ra_req = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_req_qp); + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > log_max_ra_res) { + mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", + attr->max_rd_atomic); + return false; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > log_max_ra_req) { + mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", + attr->max_dest_rd_atomic); + return false; + } + return true; +} + int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { @@ -4589,21 +4623,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } - if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && - attr->max_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) { - mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", - attr->max_rd_atomic); - goto out; - } - - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && - attr->max_dest_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) { - mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", - attr->max_dest_rd_atomic); + if (!validate_rd_atomic(dev, attr, attr_mask, qp_type)) goto out; - } if (cur_state == new_state && cur_state == IB_QPS_RESET) { err = 0; -- cgit From 9807ae69746196ee4bbffe7d22d22ab2b61c6ed0 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 29 Dec 2022 17:33:32 +0100 Subject: net: dsa: qca8k: fix wrong length value for mgmt eth packet The assumption that Documentation was right about how this value work was wrong. It was discovered that the length value of the mgmt header is in step of word size. As an example to process 4 byte of data the correct length to set is 2. To process 8 byte 4, 12 byte 6, 16 byte 8... Odd values will always return the next size on the ack packet. (length of 3 (6 byte) will always return 8 bytes of data) This means that a value of 15 (0xf) actually means reading/writing 32 bytes of data instead of 16 bytes. This behaviour is totally absent and not documented in the switch Documentation. In fact from Documentation the max value that mgmt eth can process is 16 byte of data while in reality it can process 32 bytes at once. To handle this we always round up the length after deviding it for word size. We check if the result is odd and we round another time to align to what the switch will provide in the ack packet. 
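To make the mapping concrete, here is a small worked sketch of the byte-count to length-field conversion described above (hypothetical helper name, mirroring the logic the patch adds; DIV_ROUND_UP() is the kernel's rounding helper):

/*
 * Hypothetical illustration: requested byte count -> mgmt header length
 * field. The field counts 16-bit words, and odd values are bumped because
 * the switch always answers with the next size.
 */
static u32 example_mgmt_eth_len(u32 byte_len)
{
	u32 words = DIV_ROUND_UP(byte_len, sizeof(u16));	/* bytes -> 16-bit words */

	if (words % 2)		/* odd: the switch acks with the next size anyway */
		words++;

	/*
	 * 4 bytes -> 2, 6 -> 4, 16 -> 8, 32 -> 16; the driver then caps 16
	 * to 0xf (15), the largest value the length field can hold (see the
	 * note on the 15 limit right below).
	 */
	return words;
}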
The workaround for the length limit of 15 is still needed as the length reg max value is 0xf(15) Reported-by: Ronald Wahl Tested-by: Ronald Wahl Fixes: 90386223f44e ("net: dsa: qca8k: add support for larger read/write size with mgmt Ethernet") Signed-off-by: Christian Marangi Cc: stable@vger.kernel.org # v5.18+ Signed-off-by: David S. Miller --- drivers/net/dsa/qca/qca8k-8xxx.c | 45 +++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index c5c3b4e92f28..46151320b2a8 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -146,7 +146,16 @@ static void qca8k_rw_reg_ack_handler(struct dsa_switch *ds, struct sk_buff *skb) command = get_unaligned_le32(&mgmt_ethhdr->command); cmd = FIELD_GET(QCA_HDR_MGMT_CMD, command); + len = FIELD_GET(QCA_HDR_MGMT_LENGTH, command); + /* Special case for len of 15 as this is the max value for len and needs to + * be increased before converting it from word to dword. + */ + if (len == 15) + len++; + + /* We can ignore odd value, we always round up them in the alloc function. */ + len *= sizeof(u16); /* Make sure the seq match the requested packet */ if (get_unaligned_le32(&mgmt_ethhdr->seq) == mgmt_eth_data->seq) @@ -193,17 +202,33 @@ static struct sk_buff *qca8k_alloc_mdio_header(enum mdio_cmd cmd, u32 reg, u32 * if (!skb) return NULL; - /* Max value for len reg is 15 (0xf) but the switch actually return 16 byte - * Actually for some reason the steps are: - * 0: nothing - * 1-4: first 4 byte - * 5-6: first 12 byte - * 7-15: all 16 byte + /* Hdr mgmt length value is in step of word size. + * As an example to process 4 byte of data the correct length to set is 2. + * To process 8 byte 4, 12 byte 6, 16 byte 8... + * + * Odd values will always return the next size on the ack packet. + * (length of 3 (6 byte) will always return 8 bytes of data) + * + * This means that a value of 15 (0xf) actually means reading/writing 32 bytes + * of data. + * + * To correctly calculate the length we devide the requested len by word and + * round up. + * On the ack function we can skip the odd check as we already handle the + * case here. + */ + real_len = DIV_ROUND_UP(len, sizeof(u16)); + + /* We check if the result len is odd and we round up another time to + * the next size. (length of 3 will be increased to 4 as switch will always + * return 8 bytes) */ - if (len == 16) - real_len = 15; - else - real_len = len; + if (real_len % sizeof(u16) != 0) + real_len++; + + /* Max reg value is 0xf(15) but switch will always return the next size (32 byte) */ + if (real_len == 16) + real_len--; skb_reset_mac_header(skb); skb_set_network_header(skb, skb->len); -- cgit From d9dba91be71f03cc75bcf39fc0d5d99ff33f1ae0 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 29 Dec 2022 17:33:33 +0100 Subject: net: dsa: tag_qca: fix wrong MGMT_DATA2 size It was discovered that MGMT_DATA2 can contain up to 28 bytes of data instead of the 12 bytes written in the Documentation by accounting the limit of 16 bytes declared in Documentation subtracting the first 4 byte in the packet header. Update the define with the real world value. Tested-by: Ronald Wahl Fixes: c2ee8181fddb ("net: dsa: tag_qca: add define for handling mgmt Ethernet packet") Signed-off-by: Christian Marangi Cc: stable@vger.kernel.org # v5.18+ Signed-off-by: David S. 
Miller --- include/linux/dsa/tag_qca.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h index b1b5720d89a5..ee657452f122 100644 --- a/include/linux/dsa/tag_qca.h +++ b/include/linux/dsa/tag_qca.h @@ -45,8 +45,8 @@ struct sk_buff; QCA_HDR_MGMT_COMMAND_LEN + \ QCA_HDR_MGMT_DATA1_LEN) -#define QCA_HDR_MGMT_DATA2_LEN 12 /* Other 12 byte for the mdio data */ -#define QCA_HDR_MGMT_PADDING_LEN 34 /* Padding to reach the min Ethernet packet */ +#define QCA_HDR_MGMT_DATA2_LEN 28 /* Other 28 byte for the mdio data */ +#define QCA_HDR_MGMT_PADDING_LEN 18 /* Padding to reach the min Ethernet packet */ #define QCA_HDR_MGMT_PKT_LEN (QCA_HDR_MGMT_HEADER_LEN + \ QCA_HDR_LEN + \ -- cgit From 03cb9e6d0b32b768e3d9d473c5c4ca1100877664 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 29 Dec 2022 17:33:34 +0100 Subject: Revert "net: dsa: qca8k: cache lo and hi for mdio write" This reverts commit 2481d206fae7884cd07014fd1318e63af35e99eb. The Documentation is very confusing about the topic. The cache logic for hi and lo is wrong and actually miss some regs to be actually written. What the Documentation actually intended was that it's possible to skip writing hi OR lo if half of the reg is not needed to be written or read. Revert the change in favor of a better and correct implementation. Reported-by: Ronald Wahl Signed-off-by: Christian Marangi Cc: stable@vger.kernel.org # v5.18+ Signed-off-by: David S. Miller --- drivers/net/dsa/qca/qca8k-8xxx.c | 61 ++++++++-------------------------------- drivers/net/dsa/qca/qca8k.h | 5 ---- 2 files changed, 12 insertions(+), 54 deletions(-) diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index 46151320b2a8..fbcd5c2b13ae 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -36,44 +36,6 @@ qca8k_split_addr(u32 regaddr, u16 *r1, u16 *r2, u16 *page) *page = regaddr & 0x3ff; } -static int -qca8k_set_lo(struct qca8k_priv *priv, int phy_id, u32 regnum, u16 lo) -{ - u16 *cached_lo = &priv->mdio_cache.lo; - struct mii_bus *bus = priv->bus; - int ret; - - if (lo == *cached_lo) - return 0; - - ret = bus->write(bus, phy_id, regnum, lo); - if (ret < 0) - dev_err_ratelimited(&bus->dev, - "failed to write qca8k 32bit lo register\n"); - - *cached_lo = lo; - return 0; -} - -static int -qca8k_set_hi(struct qca8k_priv *priv, int phy_id, u32 regnum, u16 hi) -{ - u16 *cached_hi = &priv->mdio_cache.hi; - struct mii_bus *bus = priv->bus; - int ret; - - if (hi == *cached_hi) - return 0; - - ret = bus->write(bus, phy_id, regnum, hi); - if (ret < 0) - dev_err_ratelimited(&bus->dev, - "failed to write qca8k 32bit hi register\n"); - - *cached_hi = hi; - return 0; -} - static int qca8k_mii_read32(struct mii_bus *bus, int phy_id, u32 regnum, u32 *val) { @@ -97,7 +59,7 @@ qca8k_mii_read32(struct mii_bus *bus, int phy_id, u32 regnum, u32 *val) } static void -qca8k_mii_write32(struct qca8k_priv *priv, int phy_id, u32 regnum, u32 val) +qca8k_mii_write32(struct mii_bus *bus, int phy_id, u32 regnum, u32 val) { u16 lo, hi; int ret; @@ -105,9 +67,12 @@ qca8k_mii_write32(struct qca8k_priv *priv, int phy_id, u32 regnum, u32 val) lo = val & 0xffff; hi = (u16)(val >> 16); - ret = qca8k_set_lo(priv, phy_id, regnum, lo); + ret = bus->write(bus, phy_id, regnum, lo); if (ret >= 0) - ret = qca8k_set_hi(priv, phy_id, regnum + 1, hi); + ret = bus->write(bus, phy_id, regnum + 1, hi); + if (ret < 0) + dev_err_ratelimited(&bus->dev, + "failed to write qca8k 32bit 
register\n"); } static int @@ -442,7 +407,7 @@ qca8k_regmap_write(void *ctx, uint32_t reg, uint32_t val) if (ret < 0) goto exit; - qca8k_mii_write32(priv, 0x10 | r2, r1, val); + qca8k_mii_write32(bus, 0x10 | r2, r1, val); exit: mutex_unlock(&bus->mdio_lock); @@ -475,7 +440,7 @@ qca8k_regmap_update_bits(void *ctx, uint32_t reg, uint32_t mask, uint32_t write_ val &= ~mask; val |= write_val; - qca8k_mii_write32(priv, 0x10 | r2, r1, val); + qca8k_mii_write32(bus, 0x10 | r2, r1, val); exit: mutex_unlock(&bus->mdio_lock); @@ -750,14 +715,14 @@ qca8k_mdio_write(struct qca8k_priv *priv, int phy, int regnum, u16 data) if (ret) goto exit; - qca8k_mii_write32(priv, 0x10 | r2, r1, val); + qca8k_mii_write32(bus, 0x10 | r2, r1, val); ret = qca8k_mdio_busy_wait(bus, QCA8K_MDIO_MASTER_CTRL, QCA8K_MDIO_MASTER_BUSY); exit: /* even if the busy_wait timeouts try to clear the MASTER_EN */ - qca8k_mii_write32(priv, 0x10 | r2, r1, 0); + qca8k_mii_write32(bus, 0x10 | r2, r1, 0); mutex_unlock(&bus->mdio_lock); @@ -787,7 +752,7 @@ qca8k_mdio_read(struct qca8k_priv *priv, int phy, int regnum) if (ret) goto exit; - qca8k_mii_write32(priv, 0x10 | r2, r1, val); + qca8k_mii_write32(bus, 0x10 | r2, r1, val); ret = qca8k_mdio_busy_wait(bus, QCA8K_MDIO_MASTER_CTRL, QCA8K_MDIO_MASTER_BUSY); @@ -798,7 +763,7 @@ qca8k_mdio_read(struct qca8k_priv *priv, int phy, int regnum) exit: /* even if the busy_wait timeouts try to clear the MASTER_EN */ - qca8k_mii_write32(priv, 0x10 | r2, r1, 0); + qca8k_mii_write32(bus, 0x10 | r2, r1, 0); mutex_unlock(&bus->mdio_lock); @@ -1968,8 +1933,6 @@ qca8k_sw_probe(struct mdio_device *mdiodev) } priv->mdio_cache.page = 0xffff; - priv->mdio_cache.lo = 0xffff; - priv->mdio_cache.hi = 0xffff; /* Check the detected switch id */ ret = qca8k_read_switch_id(priv); diff --git a/drivers/net/dsa/qca/qca8k.h b/drivers/net/dsa/qca/qca8k.h index 0b7a5cb12321..03514f7a20be 100644 --- a/drivers/net/dsa/qca/qca8k.h +++ b/drivers/net/dsa/qca/qca8k.h @@ -375,11 +375,6 @@ struct qca8k_mdio_cache { * mdio writes */ u16 page; -/* lo and hi can also be cached and from Documentation we can skip one - * extra mdio write if lo or hi is didn't change. - */ - u16 lo; - u16 hi; }; struct qca8k_pcs { -- cgit From cfbd6de588ef659c198083205dc954a6d3ed2aec Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 29 Dec 2022 17:33:35 +0100 Subject: net: dsa: qca8k: introduce single mii read/write lo/hi It may be useful to read/write just the lo or hi half of a reg. This is especially useful for phy poll with the use of mdio master. The mdio master reg is composed by the first 16 bit related to setup and the other half with the returned data or data to write. Refactor the mii function to permit single mii read/write of lo or hi half of the reg. Tested-by: Ronald Wahl Signed-off-by: Christian Marangi Signed-off-by: David S. 
Miller --- drivers/net/dsa/qca/qca8k-8xxx.c | 106 +++++++++++++++++++++++++++++++-------- 1 file changed, 84 insertions(+), 22 deletions(-) diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index fbcd5c2b13ae..92c4bfef7c97 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -37,42 +37,104 @@ qca8k_split_addr(u32 regaddr, u16 *r1, u16 *r2, u16 *page) } static int -qca8k_mii_read32(struct mii_bus *bus, int phy_id, u32 regnum, u32 *val) +qca8k_mii_write_lo(struct mii_bus *bus, int phy_id, u32 regnum, u32 val) { int ret; + u16 lo; - ret = bus->read(bus, phy_id, regnum); - if (ret >= 0) { - *val = ret; - ret = bus->read(bus, phy_id, regnum + 1); - *val |= ret << 16; - } + lo = val & 0xffff; + ret = bus->write(bus, phy_id, regnum, lo); + if (ret < 0) + dev_err_ratelimited(&bus->dev, + "failed to write qca8k 32bit lo register\n"); - if (ret < 0) { + return ret; +} + +static int +qca8k_mii_write_hi(struct mii_bus *bus, int phy_id, u32 regnum, u32 val) +{ + int ret; + u16 hi; + + hi = (u16)(val >> 16); + ret = bus->write(bus, phy_id, regnum, hi); + if (ret < 0) dev_err_ratelimited(&bus->dev, - "failed to read qca8k 32bit register\n"); - *val = 0; - return ret; - } + "failed to write qca8k 32bit hi register\n"); + + return ret; +} + +static int +qca8k_mii_read_lo(struct mii_bus *bus, int phy_id, u32 regnum, u32 *val) +{ + int ret; + + ret = bus->read(bus, phy_id, regnum); + if (ret < 0) + goto err; + *val = ret & 0xffff; return 0; + +err: + dev_err_ratelimited(&bus->dev, + "failed to read qca8k 32bit lo register\n"); + *val = 0; + + return ret; } -static void -qca8k_mii_write32(struct mii_bus *bus, int phy_id, u32 regnum, u32 val) +static int +qca8k_mii_read_hi(struct mii_bus *bus, int phy_id, u32 regnum, u32 *val) { - u16 lo, hi; int ret; - lo = val & 0xffff; - hi = (u16)(val >> 16); + ret = bus->read(bus, phy_id, regnum); + if (ret < 0) + goto err; - ret = bus->write(bus, phy_id, regnum, lo); - if (ret >= 0) - ret = bus->write(bus, phy_id, regnum + 1, hi); + *val = ret << 16; + return 0; + +err: + dev_err_ratelimited(&bus->dev, + "failed to read qca8k 32bit hi register\n"); + *val = 0; + + return ret; +} + +static int +qca8k_mii_read32(struct mii_bus *bus, int phy_id, u32 regnum, u32 *val) +{ + u32 hi, lo; + int ret; + + *val = 0; + + ret = qca8k_mii_read_lo(bus, phy_id, regnum, &lo); if (ret < 0) - dev_err_ratelimited(&bus->dev, - "failed to write qca8k 32bit register\n"); + goto err; + + ret = qca8k_mii_read_hi(bus, phy_id, regnum + 1, &hi); + if (ret < 0) + goto err; + + *val = lo | hi; + +err: + return ret; +} + +static void +qca8k_mii_write32(struct mii_bus *bus, int phy_id, u32 regnum, u32 val) +{ + if (qca8k_mii_write_lo(bus, phy_id, regnum, val) < 0) + return; + + qca8k_mii_write_hi(bus, phy_id, regnum + 1, val); } static int -- cgit From a4165830ca237f2b3318faf62562bce8ce12a389 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 29 Dec 2022 17:33:36 +0100 Subject: net: dsa: qca8k: improve mdio master read/write by using single lo/hi Improve mdio master read/write by using singe mii read/write lo/hi. In a read and write we need to poll the mdio master regs in a busy loop to check for a specific bit present in the upper half of the reg. We can ignore the other half since it won't contain useful data. This will save an additional useless read for each read and write operation. In a read operation the returned data is present in the mdio master reg lower half. 
We can ignore the other half since it won't contain useful data. This will save an additional useless read for each read operation. In a read operation it's needed to just set the hi half of the mdio master reg as the lo half will be replaced by the result. This will save an additional useless write for each read operation. Tested-by: Ronald Wahl Signed-off-by: Christian Marangi Signed-off-by: David S. Miller --- drivers/net/dsa/qca/qca8k-8xxx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index 92c4bfef7c97..2f224b166bbb 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -740,9 +740,9 @@ qca8k_mdio_busy_wait(struct mii_bus *bus, u32 reg, u32 mask) qca8k_split_addr(reg, &r1, &r2, &page); - ret = read_poll_timeout(qca8k_mii_read32, ret1, !(val & mask), 0, + ret = read_poll_timeout(qca8k_mii_read_hi, ret1, !(val & mask), 0, QCA8K_BUSY_WAIT_TIMEOUT * USEC_PER_MSEC, false, - bus, 0x10 | r2, r1, &val); + bus, 0x10 | r2, r1 + 1, &val); /* Check if qca8k_read has failed for a different reason * before returnting -ETIMEDOUT @@ -784,7 +784,7 @@ qca8k_mdio_write(struct qca8k_priv *priv, int phy, int regnum, u16 data) exit: /* even if the busy_wait timeouts try to clear the MASTER_EN */ - qca8k_mii_write32(bus, 0x10 | r2, r1, 0); + qca8k_mii_write_hi(bus, 0x10 | r2, r1 + 1, 0); mutex_unlock(&bus->mdio_lock); @@ -814,18 +814,18 @@ qca8k_mdio_read(struct qca8k_priv *priv, int phy, int regnum) if (ret) goto exit; - qca8k_mii_write32(bus, 0x10 | r2, r1, val); + qca8k_mii_write_hi(bus, 0x10 | r2, r1 + 1, val); ret = qca8k_mdio_busy_wait(bus, QCA8K_MDIO_MASTER_CTRL, QCA8K_MDIO_MASTER_BUSY); if (ret) goto exit; - ret = qca8k_mii_read32(bus, 0x10 | r2, r1, &val); + ret = qca8k_mii_read_lo(bus, 0x10 | r2, r1, &val); exit: /* even if the busy_wait timeouts try to clear the MASTER_EN */ - qca8k_mii_write32(bus, 0x10 | r2, r1, 0); + qca8k_mii_write_hi(bus, 0x10 | r2, r1 + 1, 0); mutex_unlock(&bus->mdio_lock); -- cgit From 6d4cfcf97986cc67635630a2bc1f8d5c92ecdbba Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 29 Dec 2022 15:21:20 -0500 Subject: net: phy: Update documentation for get_rate_matching Now that phylink no longer calls phy_get_rate_matching with PHY_INTERFACE_MODE_NA, phys no longer need to support it. Remove the documentation mandating support. Fixes: 7642cc28fd37 ("net: phylink: fix PHY validation with rate adaption") Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phy.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/phy.h b/include/linux/phy.h index 71eeb4e3b1fd..6378c997ded5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -826,10 +826,7 @@ struct phy_driver { * whether to advertise lower-speed modes for that interface. It is * assumed that if a rate matching mode is supported on an interface, * then that interface's rate can be adapted to all slower link speeds - * supported by the phy. If iface is %PHY_INTERFACE_MODE_NA, and the phy - * supports any kind of rate matching for any interface, then it must - * return that rate matching mode (preferring %RATE_MATCH_PAUSE to - * %RATE_MATCH_CRS). If the interface is not supported, this should + * supported by the phy. If the interface is not supported, this should * return %RATE_MATCH_NONE. 
*/ int (*get_rate_matching)(struct phy_device *phydev, -- cgit From 9c4d7f45d60745a1cea0e841fa5e3444c398d2f1 Mon Sep 17 00:00:00 2001 From: Po-Hsu Lin Date: Fri, 30 Dec 2022 17:18:28 +0800 Subject: selftests: net: fix cleanup_v6() for arp_ndisc_evict_nocarrier The cleanup_v6() will cause the arp_ndisc_evict_nocarrier script exit with 255 (No such file or directory), even the tests are good: # selftests: net: arp_ndisc_evict_nocarrier.sh # run arp_evict_nocarrier=1 test # RTNETLINK answers: File exists # ok # run arp_evict_nocarrier=0 test # RTNETLINK answers: File exists # ok # run all.arp_evict_nocarrier=0 test # RTNETLINK answers: File exists # ok # run ndisc_evict_nocarrier=1 test # ok # run ndisc_evict_nocarrier=0 test # ok # run all.ndisc_evict_nocarrier=0 test # ok not ok 1 selftests: net: arp_ndisc_evict_nocarrier.sh # exit=255 This is because it's trying to modify the parameter for ipv4 instead. Also, tests for ipv6 (run_ndisc_evict_nocarrier_enabled() and run_ndisc_evict_nocarrier_disabled() are working on veth1, reflect this fact in cleanup_v6(). Fixes: f86ca07eb531 ("selftests: net: add arp_ndisc_evict_nocarrier") Signed-off-by: Po-Hsu Lin Signed-off-by: David S. Miller --- tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh b/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh index b5af08af8559..b4ec1eeee6c9 100755 --- a/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh +++ b/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh @@ -24,8 +24,8 @@ cleanup_v6() ip netns del me ip netns del peer - sysctl -w net.ipv4.conf.veth0.ndisc_evict_nocarrier=1 >/dev/null 2>&1 - sysctl -w net.ipv4.conf.all.ndisc_evict_nocarrier=1 >/dev/null 2>&1 + sysctl -w net.ipv6.conf.veth1.ndisc_evict_nocarrier=1 >/dev/null 2>&1 + sysctl -w net.ipv6.conf.all.ndisc_evict_nocarrier=1 >/dev/null 2>&1 } create_ns() -- cgit From 1856628baa17032531916984808d1bdfd62700d4 Mon Sep 17 00:00:00 2001 From: Po-Hsu Lin Date: Fri, 30 Dec 2022 17:18:29 +0800 Subject: selftests: net: return non-zero for failures reported in arp_ndisc_evict_nocarrier Return non-zero return value if there is any failure reported in this script during the test. Otherwise it can only reflect the status of the last command. Fixes: f86ca07eb531 ("selftests: net: add arp_ndisc_evict_nocarrier") Signed-off-by: Po-Hsu Lin Signed-off-by: David S. Miller --- tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh b/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh index b4ec1eeee6c9..4a110bb01e53 100755 --- a/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh +++ b/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh @@ -18,6 +18,7 @@ readonly V4_ADDR1=10.0.10.2 readonly V6_ADDR0=2001:db8:91::1 readonly V6_ADDR1=2001:db8:91::2 nsid=100 +ret=0 cleanup_v6() { @@ -61,7 +62,7 @@ setup_v6() { if [ $? -ne 0 ]; then cleanup_v6 echo "failed" - exit + exit 1 fi # Set veth2 down, which will put veth1 in NOCARRIER state @@ -88,7 +89,7 @@ setup_v4() { if [ $? -ne 0 ]; then cleanup_v4 echo "failed" - exit + exit 1 fi # Set veth1 down, which will put veth0 in NOCARRIER state @@ -115,6 +116,7 @@ run_arp_evict_nocarrier_enabled() { if [ $? 
-eq 0 ];then echo "failed" + ret=1 else echo "ok" fi @@ -134,6 +136,7 @@ run_arp_evict_nocarrier_disabled() { echo "ok" else echo "failed" + ret=1 fi cleanup_v4 @@ -164,6 +167,7 @@ run_ndisc_evict_nocarrier_enabled() { if [ $? -eq 0 ];then echo "failed" + ret=1 else echo "ok" fi @@ -182,6 +186,7 @@ run_ndisc_evict_nocarrier_disabled() { echo "ok" else echo "failed" + ret=1 fi cleanup_v6 @@ -198,6 +203,7 @@ run_ndisc_evict_nocarrier_disabled_all() { echo "ok" else echo "failed" + ret=1 fi cleanup_v6 @@ -218,3 +224,4 @@ if [ "$(id -u)" -ne 0 ];then fi run_all_tests +exit $ret -- cgit From d9d71a89f28d27ac772c429b37d52668d011df7a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 30 Dec 2022 16:33:04 -0600 Subject: net: ipa: use proper endpoint mask for suspend It is now possible for a system to have more than 32 endpoints. As a result, registers related to endpoint suspend are parameterized, with 32 endpoints represented in one more registers. In ipa_interrupt_suspend_control(), the IPA_SUSPEND_EN register offset is determined properly, but the bit mask used still assumes the number of enpoints won't exceed 32. This is a bug. Fix it. Fixes: f298ba785e2d ("net: ipa: add a parameter to suspend registers") Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_interrupt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ipa/ipa_interrupt.c b/drivers/net/ipa/ipa_interrupt.c index a49f66efacb8..d458a35839cc 100644 --- a/drivers/net/ipa/ipa_interrupt.c +++ b/drivers/net/ipa/ipa_interrupt.c @@ -132,10 +132,10 @@ static void ipa_interrupt_suspend_control(struct ipa_interrupt *interrupt, u32 endpoint_id, bool enable) { struct ipa *ipa = interrupt->ipa; + u32 mask = BIT(endpoint_id % 32); u32 unit = endpoint_id / 32; const struct ipa_reg *reg; u32 offset; - u32 mask; u32 val; WARN_ON(!test_bit(endpoint_id, ipa->available)); @@ -148,7 +148,6 @@ static void ipa_interrupt_suspend_control(struct ipa_interrupt *interrupt, offset = ipa_reg_n_offset(reg, unit); val = ioread32(ipa->reg_virt + offset); - mask = BIT(endpoint_id); if (enable) val |= mask; else -- cgit From a3542b0ccd58f9fd42f34afa9daea435279a7c1c Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sat, 31 Dec 2022 16:05:46 -0600 Subject: dt-bindings: net: sun8i-emac: Add phy-supply property This property has always been supported by the Linux driver; see commit 9f93ac8d4085 ("net-next: stmmac: Add dwmac-sun8i"). In fact, the original driver submission includes the phy-supply code but no mention of it in the binding, so the omission appears to be accidental. In addition, the property is documented in the binding for the previous hardware generation, allwinner,sun7i-a20-gmac. Document phy-supply in the binding to fix devicetree validation for the 25+ boards that already use this property. Fixes: 0441bde003be ("dt-bindings: net-next: Add DT bindings documentation for Allwinner dwmac-sun8i") Acked-by: Rob Herring Reviewed-by: Andre Przywara Signed-off-by: Samuel Holland Signed-off-by: David S. 
Miller --- Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml b/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml index 1432fda3b603..47bc2057e629 100644 --- a/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml +++ b/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml @@ -40,6 +40,9 @@ properties: clock-names: const: stmmaceth + phy-supply: + description: PHY regulator + syscon: $ref: /schemas/types.yaml#/definitions/phandle description: -- cgit From 91e2286160edd29d3fea8efff2dcda7df321878d Mon Sep 17 00:00:00 2001 From: Michał Grzelak Date: Thu, 29 Dec 2022 15:22:19 +0100 Subject: dt-bindings: net: marvell,orion-mdio: Fix examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As stated in marvell-orion-mdio.txt deleted in commit 0781434af811f ("dt-bindings: net: orion-mdio: Convert to JSON schema") if 'interrupts' property is present, width of 'reg' should be 0x84. Otherwise, width of 'reg' should be 0x4. Fix 'examples:' and add constraints checking whether 'interrupts' property is present and validate it against fixed values in reg. Signed-off-by: Michał Grzelak Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- .../bindings/net/marvell,orion-mdio.yaml | 30 +++++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml b/Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml index d2906b4a0f59..e35da8b01dc2 100644 --- a/Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml +++ b/Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml @@ -16,9 +16,6 @@ description: | 8k has a second unit which provides an interface with the xMDIO bus. This driver handles these interfaces. -allOf: - - $ref: "mdio.yaml#" - properties: compatible: enum: @@ -39,13 +36,38 @@ required: - compatible - reg +allOf: + - $ref: mdio.yaml# + + - if: + required: + - interrupts + + then: + properties: + reg: + items: + - items: + - $ref: /schemas/types.yaml#/definitions/cell + - const: 0x84 + + else: + properties: + reg: + items: + - items: + - $ref: /schemas/types.yaml#/definitions/cell + - enum: + - 0x4 + - 0x10 + unevaluatedProperties: false examples: - | mdio@d0072004 { compatible = "marvell,orion-mdio"; - reg = <0xd0072004 0x4>; + reg = <0xd0072004 0x84>; #address-cells = <1>; #size-cells = <0>; interrupts = <30>; -- cgit From 88603b6dc419445847923fcb7fe5080067a30f98 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Jan 2023 13:53:16 -0800 Subject: Linux 6.2-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a5133e422f69..c05b4fb7121e 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 2 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- cgit From 029085b8949f5d269ae2bbd14915407dd0c7f902 Mon Sep 17 00:00:00 2001 From: Chengen Du Date: Fri, 30 Dec 2022 11:04:32 +0800 Subject: NFS: Judge the file access cache's timestamp in rcu path If the user's login time is newer than the cache's timestamp, we expect the cache may be stale and need to clear. The stale cache will remain in the list's tail if no other users operate on that inode. Once the user accesses the inode, the stale cache will be returned in rcu path. 
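The check added below compares the two u64 timestamps with the usual wrap-safe idiom: the difference is cast to s64, so the result is positive exactly when login_time is newer than the cached timestamp. A minimal illustration of the idiom (hypothetical helper, same pattern as the kernel's time_after64()):

/* Illustrative only: "a is newer than b" for monotonically increasing
 * 64-bit timestamps, tolerant of wraparound.
 */
static inline bool ts_after(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}

/* ts_after(login_time, cache->timestamp) means the cached entry was recorded
 * before the current login, so the lookup treats it as a miss. */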
Signed-off-by: Chengen Du Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ea1ceffa1d3a..d490d64a9ebd 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -3023,6 +3023,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre * but do it without locking. */ struct nfs_inode *nfsi = NFS_I(inode); + u64 login_time = nfs_access_login_time(current, cred); struct nfs_access_entry *cache; int err = -ECHILD; struct list_head *lh; @@ -3037,6 +3038,8 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre cache = NULL; if (cache == NULL) goto out; + if ((s64)(login_time - cache->timestamp) > 0) + goto out; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; *mask = cache->mask; -- cgit From 5e9a7b9c2ea18551759833146a181b14835bfe39 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Jan 2023 20:17:23 -0500 Subject: NFS: Fix up a sparse warning sparse is warning about an incorrect RCU dereference. fs/nfs/dir.c:2965:56: warning: incorrect type in argument 1 (different address spaces) fs/nfs/dir.c:2965:56: expected struct cred const * fs/nfs/dir.c:2965:56: got struct cred const [noderef] __rcu *const cred Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d490d64a9ebd..f7e4a88d5d92 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2957,12 +2957,14 @@ static u64 nfs_access_login_time(const struct task_struct *task, const struct cred *cred) { const struct task_struct *parent; + const struct cred *pcred; u64 ret; rcu_read_lock(); for (;;) { parent = rcu_dereference(task->real_parent); - if (parent == task || cred_fscmp(parent->cred, cred) != 0) + pcred = rcu_dereference(parent->cred); + if (parent == task || cred_fscmp(pcred, cred) != 0) break; task = parent; } -- cgit From 3b754ed6d1cd90017e66e5cc16f3923e4a952ffc Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Mon, 19 Dec 2022 09:43:05 +0100 Subject: drm/meson: Reduce the FIFO lines held when AFBC is not used Having a bigger number of FIFO lines held after vsync is only useful to SoCs using AFBC to give time to the AFBC decoder to be reset, configured and enabled again. For SoCs not using AFBC this, on the contrary, is causing on some displays issues and a few pixels vertical offset in the displayed image. Conditionally increase the number of lines held after vsync only for SoCs using AFBC, leaving the default value for all the others. 
Fixes: 24e0d4058eff ("drm/meson: hold 32 lines after vsync to give time for AFBC start") Signed-off-by: Carlo Caione Acked-by: Martin Blumenstingl Acked-by: Neil Armstrong [narmstrong: added fixes tag] Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20221216-afbc_s905x-v1-0-033bebf780d9@baylibre.com --- drivers/gpu/drm/meson/meson_viu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_viu.c b/drivers/gpu/drm/meson/meson_viu.c index d4b907889a21..cd399b0b7181 100644 --- a/drivers/gpu/drm/meson/meson_viu.c +++ b/drivers/gpu/drm/meson/meson_viu.c @@ -436,15 +436,14 @@ void meson_viu_init(struct meson_drm *priv) /* Initialize OSD1 fifo control register */ reg = VIU_OSD_DDR_PRIORITY_URGENT | - VIU_OSD_HOLD_FIFO_LINES(31) | VIU_OSD_FIFO_DEPTH_VAL(32) | /* fifo_depth_val: 32*8=256 */ VIU_OSD_WORDS_PER_BURST(4) | /* 4 words in 1 burst */ VIU_OSD_FIFO_LIMITS(2); /* fifo_lim: 2*16=32 */ if (meson_vpu_is_compatible(priv, VPU_COMPATIBLE_G12A)) - reg |= VIU_OSD_BURST_LENGTH_32; + reg |= (VIU_OSD_BURST_LENGTH_32 | VIU_OSD_HOLD_FIFO_LINES(31)); else - reg |= VIU_OSD_BURST_LENGTH_64; + reg |= (VIU_OSD_BURST_LENGTH_64 | VIU_OSD_HOLD_FIFO_LINES(4)); writel_relaxed(reg, priv->io_base + _REG(VIU_OSD1_FIFO_CTRL_STAT)); writel_relaxed(reg, priv->io_base + _REG(VIU_OSD2_FIFO_CTRL_STAT)); -- cgit From 461ab10ef7e6ea9b41a0571a7fc6a72af9549a3c Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 17 Nov 2022 10:43:21 +0800 Subject: ceph: switch to vfs_inode_has_locks() to fix file lock bug For the POSIX locks they are using the same owner, which is the thread id. And multiple POSIX locks could be merged into single one, so when checking whether the 'file' has locks may fail. For a file where some openers use locking and others don't is a really odd usage pattern though. Locks are like stoplights -- they only work if everyone pays attention to them. Just switch ceph_get_caps() to check whether any locks are set on the inode. If there are POSIX/OFD/FLOCK locks on the file at the time, we should set CHECK_FILELOCK, regardless of what fd was used to set the lock. 
Fixes: ff5d913dfc71 ("ceph: return -EIO if read/write against filp that lost file locks") Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- fs/ceph/locks.c | 4 ---- fs/ceph/super.h | 1 - 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 4b159f97fe7b..f75ad432f375 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2913,7 +2913,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got while (true) { flags &= CEPH_FILE_MODE_MASK; - if (atomic_read(&fi->num_locks)) + if (vfs_inode_has_locks(inode)) flags |= CHECK_FILELOCK; _got = 0; ret = try_get_cap_refs(inode, need, want, endoff, diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index f3b461c708a8..6b3b8c299c17 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -32,18 +32,14 @@ void __init ceph_flock_init(void) static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) { - struct ceph_file_info *fi = dst->fl_file->private_data; struct inode *inode = file_inode(dst->fl_file); atomic_inc(&ceph_inode(inode)->i_filelock_ref); - atomic_inc(&fi->num_locks); } static void ceph_fl_release_lock(struct file_lock *fl) { - struct ceph_file_info *fi = fl->fl_file->private_data; struct inode *inode = file_inode(fl->fl_file); struct ceph_inode_info *ci = ceph_inode(inode); - atomic_dec(&fi->num_locks); if (atomic_dec_and_test(&ci->i_filelock_ref)) { /* clear error when all locks are released */ spin_lock(&ci->i_ceph_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 30bdb391a0dc..0ed3be75bb9a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -790,7 +790,6 @@ struct ceph_file_info { struct list_head rw_contexts; u32 filp_gen; - atomic_t num_locks; }; struct ceph_dir_file_info { -- cgit From 8e1858710d9a71d88acd922f2e95d1eddb90eea0 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 17 Nov 2022 10:57:53 +0800 Subject: ceph: avoid use-after-free in ceph_fl_release_lock() When ceph releasing the file_lock it will try to get the inode pointer from the fl->fl_file, which the memory could already be released by another thread in filp_close(). Because in VFS layer the fl->fl_file doesn't increase the file's reference counter. Will switch to use ceph dedicate lock info to track the inode. And in ceph_fl_release_lock() we should skip all the operations if the fl->fl_u.ceph.inode is not set, which should come from the request file_lock. And we will set fl->fl_u.ceph.inode when inserting it to the inode lock list, which is when copying the lock. Link: https://tracker.ceph.com/issues/57986 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/locks.c | 20 ++++++++++++++++++-- include/linux/fs.h | 3 +++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 6b3b8c299c17..9c8dc8a55e7e 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -34,18 +34,34 @@ static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) { struct inode *inode = file_inode(dst->fl_file); atomic_inc(&ceph_inode(inode)->i_filelock_ref); + dst->fl_u.ceph.inode = igrab(inode); } +/* + * Do not use the 'fl->fl_file' in release function, which + * is possibly already released by another thread. 
+ */ static void ceph_fl_release_lock(struct file_lock *fl) { - struct inode *inode = file_inode(fl->fl_file); - struct ceph_inode_info *ci = ceph_inode(inode); + struct inode *inode = fl->fl_u.ceph.inode; + struct ceph_inode_info *ci; + + /* + * If inode is NULL it should be a request file_lock, + * nothing we can do. + */ + if (!inode) + return; + + ci = ceph_inode(inode); if (atomic_dec_and_test(&ci->i_filelock_ref)) { /* clear error when all locks are released */ spin_lock(&ci->i_ceph_lock); ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK; spin_unlock(&ci->i_ceph_lock); } + fl->fl_u.ceph.inode = NULL; + iput(inode); } static const struct file_lock_operations ceph_fl_lock_ops = { diff --git a/include/linux/fs.h b/include/linux/fs.h index 066555ad1bf8..c1769a2c5d70 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1119,6 +1119,9 @@ struct file_lock { int state; /* state of grant or error if -ve */ unsigned int debug_id; } afs; + struct { + struct inode *inode; + } ceph; } fl_u; } __randomize_layout; -- cgit From 694175cd8a1643cde3acb45c9294bca44a8e08e9 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 2 Jan 2023 12:20:39 +0400 Subject: gpio: sifive: Fix refcount leak in sifive_gpio_probe of_irq_find_parent() returns a node pointer with refcount incremented, We should use of_node_put() on it when not needed anymore. Add missing of_node_put() to avoid refcount leak. Fixes: 96868dce644d ("gpio/sifive: Add GPIO driver for SiFive SoCs") Signed-off-by: Miaoqian Lin Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sifive.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index 238f3210970c..bc5660f61c57 100644 --- a/drivers/gpio/gpio-sifive.c +++ b/drivers/gpio/gpio-sifive.c @@ -215,6 +215,7 @@ static int sifive_gpio_probe(struct platform_device *pdev) return -ENODEV; } parent = irq_find_host(irq_parent); + of_node_put(irq_parent); if (!parent) { dev_err(dev, "no IRQ parent domain\n"); return -ENODEV; -- cgit From a2965c7be0522eaa18808684b7b82b248515511b Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 1 Jan 2023 16:57:43 -0500 Subject: net: sched: atm: dont intepret cls results when asked to drop If asked to drop a packet via TC_ACT_SHOT it is unsafe to assume res.class contains a valid pointer Fixes: b0188d4dbe5f ("[NET_SCHED]: sch_atm: Lindent") Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/sch_atm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index f52255fea652..4a981ca90b0b 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -393,10 +393,13 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, result = tcf_classify(skb, NULL, fl, &res, true); if (result < 0) continue; + if (result == TC_ACT_SHOT) + goto done; + flow = (struct atm_flow_data *)res.class; if (!flow) flow = lookup_flow(sch, res.classid); - goto done; + goto drop; } } flow = NULL; -- cgit From caa4b35b4317d5147b3ab0fbdc9c075c7d2e9c12 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 1 Jan 2023 16:57:44 -0500 Subject: net: sched: cbq: dont intepret cls results when asked to drop If asked to drop a packet via TC_ACT_SHOT it is unsafe to assume that res.class contains a valid pointer Sample splat reported by Kyle Zeng [ 5.405624] 0: reclassify loop, rule prio 0, protocol 800 [ 5.406326] ================================================================== [ 5.407240] BUG: KASAN: slab-out-of-bounds in cbq_enqueue+0x54b/0xea0 [ 5.407987] Read of size 1 at addr ffff88800e3122aa by task poc/299 [ 5.408731] [ 5.408897] CPU: 0 PID: 299 Comm: poc Not tainted 5.10.155+ #15 [ 5.409516] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 5.410439] Call Trace: [ 5.410764] dump_stack+0x87/0xcd [ 5.411153] print_address_description+0x7a/0x6b0 [ 5.411687] ? vprintk_func+0xb9/0xc0 [ 5.411905] ? printk+0x76/0x96 [ 5.412110] ? cbq_enqueue+0x54b/0xea0 [ 5.412323] kasan_report+0x17d/0x220 [ 5.412591] ? cbq_enqueue+0x54b/0xea0 [ 5.412803] __asan_report_load1_noabort+0x10/0x20 [ 5.413119] cbq_enqueue+0x54b/0xea0 [ 5.413400] ? __kasan_check_write+0x10/0x20 [ 5.413679] __dev_queue_xmit+0x9c0/0x1db0 [ 5.413922] dev_queue_xmit+0xc/0x10 [ 5.414136] ip_finish_output2+0x8bc/0xcd0 [ 5.414436] __ip_finish_output+0x472/0x7a0 [ 5.414692] ip_finish_output+0x5c/0x190 [ 5.414940] ip_output+0x2d8/0x3c0 [ 5.415150] ? ip_mc_finish_output+0x320/0x320 [ 5.415429] __ip_queue_xmit+0x753/0x1760 [ 5.415664] ip_queue_xmit+0x47/0x60 [ 5.415874] __tcp_transmit_skb+0x1ef9/0x34c0 [ 5.416129] tcp_connect+0x1f5e/0x4cb0 [ 5.416347] tcp_v4_connect+0xc8d/0x18c0 [ 5.416577] __inet_stream_connect+0x1ae/0xb40 [ 5.416836] ? local_bh_enable+0x11/0x20 [ 5.417066] ? lock_sock_nested+0x175/0x1d0 [ 5.417309] inet_stream_connect+0x5d/0x90 [ 5.417548] ? 
__inet_stream_connect+0xb40/0xb40 [ 5.417817] __sys_connect+0x260/0x2b0 [ 5.418037] __x64_sys_connect+0x76/0x80 [ 5.418267] do_syscall_64+0x31/0x50 [ 5.418477] entry_SYSCALL_64_after_hwframe+0x61/0xc6 [ 5.418770] RIP: 0033:0x473bb7 [ 5.418952] Code: 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2a 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 18 89 54 24 0c 48 89 34 24 89 [ 5.420046] RSP: 002b:00007fffd20eb0f8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a [ 5.420472] RAX: ffffffffffffffda RBX: 00007fffd20eb578 RCX: 0000000000473bb7 [ 5.420872] RDX: 0000000000000010 RSI: 00007fffd20eb110 RDI: 0000000000000007 [ 5.421271] RBP: 00007fffd20eb150 R08: 0000000000000001 R09: 0000000000000004 [ 5.421671] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001 [ 5.422071] R13: 00007fffd20eb568 R14: 00000000004fc740 R15: 0000000000000002 [ 5.422471] [ 5.422562] Allocated by task 299: [ 5.422782] __kasan_kmalloc+0x12d/0x160 [ 5.423007] kasan_kmalloc+0x5/0x10 [ 5.423208] kmem_cache_alloc_trace+0x201/0x2e0 [ 5.423492] tcf_proto_create+0x65/0x290 [ 5.423721] tc_new_tfilter+0x137e/0x1830 [ 5.423957] rtnetlink_rcv_msg+0x730/0x9f0 [ 5.424197] netlink_rcv_skb+0x166/0x300 [ 5.424428] rtnetlink_rcv+0x11/0x20 [ 5.424639] netlink_unicast+0x673/0x860 [ 5.424870] netlink_sendmsg+0x6af/0x9f0 [ 5.425100] __sys_sendto+0x58d/0x5a0 [ 5.425315] __x64_sys_sendto+0xda/0xf0 [ 5.425539] do_syscall_64+0x31/0x50 [ 5.425764] entry_SYSCALL_64_after_hwframe+0x61/0xc6 [ 5.426065] [ 5.426157] The buggy address belongs to the object at ffff88800e312200 [ 5.426157] which belongs to the cache kmalloc-128 of size 128 [ 5.426955] The buggy address is located 42 bytes to the right of [ 5.426955] 128-byte region [ffff88800e312200, ffff88800e312280) [ 5.427688] The buggy address belongs to the page: [ 5.427992] page:000000009875fabc refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0xe312 [ 5.428562] flags: 0x100000000000200(slab) [ 5.428812] raw: 0100000000000200 dead000000000100 dead000000000122 ffff888007843680 [ 5.429325] raw: 0000000000000000 0000000000100010 00000001ffffffff ffff88800e312401 [ 5.429875] page dumped because: kasan: bad access detected [ 5.430214] page->mem_cgroup:ffff88800e312401 [ 5.430471] [ 5.430564] Memory state around the buggy address: [ 5.430846] ffff88800e312180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 5.431267] ffff88800e312200: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fc [ 5.431705] >ffff88800e312280: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 5.432123] ^ [ 5.432391] ffff88800e312300: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fc [ 5.432810] ffff88800e312380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 5.433229] ================================================================== [ 5.433648] Disabling lock debugging due to kernel taint Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Kyle Zeng Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/sch_cbq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 6568e17c4c63..36db5f6782f2 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -230,6 +230,8 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) result = tcf_classify(skb, NULL, fl, &res, true); if (!fl || result < 0) goto fallback; + if (result == TC_ACT_SHOT) + return NULL; cl = (void *)res.class; if (!cl) { @@ -250,8 +252,6 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; - case TC_ACT_SHOT: - return NULL; case TC_ACT_RECLASSIFY: return cbq_reclassify(skb, cl); } -- cgit From 43d253781f6321c6a07a5fe4ee72103a679a5f6b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Jan 2023 23:17:37 -0800 Subject: net: sched: htb: fix htb_classify() kernel-doc Fix W=1 kernel-doc warning: net/sched/sch_htb.c:214: warning: expecting prototype for htb_classify(). Prototype was for HTB_DIRECT() instead by moving the HTB_DIRECT() macro above the function. Add kernel-doc notation for function parameters as well. Signed-off-by: Randy Dunlap Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index e5b4bbf3ce3d..2238edece1a4 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -199,8 +199,14 @@ static unsigned long htb_search(struct Qdisc *sch, u32 handle) { return (unsigned long)htb_find(handle, sch); } + +#define HTB_DIRECT ((struct htb_class *)-1L) + /** * htb_classify - classify a packet into class + * @skb: the socket buffer + * @sch: the active queue discipline + * @qerr: pointer for returned status code * * It returns NULL if the packet should be dropped or -1 if the packet * should be passed directly thru. In all other cases leaf class is returned. @@ -211,8 +217,6 @@ static unsigned long htb_search(struct Qdisc *sch, u32 handle) * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful * then finish and return direct queue. */ -#define HTB_DIRECT ((struct htb_class *)-1L) - static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { -- cgit From 06bf62944144a92d83dd14fd1378d2a288259561 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 2 Jan 2023 08:55:56 +0200 Subject: vxlan: Fix memory leaks in error path The memory allocated by vxlan_vnigroup_init() is not freed in the error path, leading to memory leaks [1]. Fix by calling vxlan_vnigroup_uninit() in the error path. The leaks can be reproduced by annotating gro_cells_init() with ALLOW_ERROR_INJECTION() and then running: # echo "100" > /sys/kernel/debug/fail_function/probability # echo "1" > /sys/kernel/debug/fail_function/times # echo "gro_cells_init" > /sys/kernel/debug/fail_function/inject # printf %#x -12 > /sys/kernel/debug/fail_function/gro_cells_init/retval # ip link add name vxlan0 type vxlan dstport 4789 external vnifilter RTNETLINK answers: Cannot allocate memory [1] unreferenced object 0xffff88810db84a00 (size 512): comm "ip", pid 330, jiffies 4295010045 (age 66.016s) hex dump (first 32 bytes): f8 d5 76 0e 81 88 ff ff 01 00 00 00 00 00 00 02 ..v............. 03 00 04 00 48 00 00 00 00 00 00 01 04 00 01 00 ....H........... 
backtrace: [] kmalloc_trace+0x2a/0x60 [] vxlan_vnigroup_init+0x4c/0x160 [] vxlan_init+0x1ae/0x280 [] register_netdevice+0x57a/0x16d0 [] __vxlan_dev_create+0x7c7/0xa50 [] vxlan_newlink+0xd6/0x130 [] __rtnl_newlink+0x112b/0x18a0 [] rtnl_newlink+0x6c/0xa0 [] rtnetlink_rcv_msg+0x43f/0xd40 [] netlink_rcv_skb+0x170/0x440 [] netlink_unicast+0x53f/0x810 [] netlink_sendmsg+0x958/0xe70 [] ____sys_sendmsg+0x78f/0xa90 [] ___sys_sendmsg+0x13a/0x1e0 [] __sys_sendmsg+0x11c/0x1f0 [] do_syscall_64+0x38/0x80 unreferenced object 0xffff88810e76d5f8 (size 192): comm "ip", pid 330, jiffies 4295010045 (age 66.016s) hex dump (first 32 bytes): 04 00 00 00 00 00 00 00 db e1 4f e7 00 00 00 00 ..........O..... 08 d6 76 0e 81 88 ff ff 08 d6 76 0e 81 88 ff ff ..v.......v..... backtrace: [] __kmalloc_node+0x4e/0x90 [] kvmalloc_node+0xa6/0x1f0 [] bucket_table_alloc.isra.0+0x83/0x460 [] rhashtable_init+0x43b/0x7c0 [] vxlan_vnigroup_init+0x6c/0x160 [] vxlan_init+0x1ae/0x280 [] register_netdevice+0x57a/0x16d0 [] __vxlan_dev_create+0x7c7/0xa50 [] vxlan_newlink+0xd6/0x130 [] __rtnl_newlink+0x112b/0x18a0 [] rtnl_newlink+0x6c/0xa0 [] rtnetlink_rcv_msg+0x43f/0xd40 [] netlink_rcv_skb+0x170/0x440 [] netlink_unicast+0x53f/0x810 [] netlink_sendmsg+0x958/0xe70 [] ____sys_sendmsg+0x78f/0xa90 Fixes: f9c4bb0b245c ("vxlan: vni filtering support on collect metadata device") Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- drivers/net/vxlan/vxlan_core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 92224b36787a..b1b179effe2a 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2917,16 +2917,23 @@ static int vxlan_init(struct net_device *dev) vxlan_vnigroup_init(vxlan); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; + if (!dev->tstats) { + err = -ENOMEM; + goto err_vnigroup_uninit; + } err = gro_cells_init(&vxlan->gro_cells, dev); - if (err) { - free_percpu(dev->tstats); - return err; - } + if (err) + goto err_free_percpu; return 0; + +err_free_percpu: + free_percpu(dev->tstats); +err_vnigroup_uninit: + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) + vxlan_vnigroup_uninit(vxlan); + return err; } static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni) -- cgit From 588ab2dc25f60efeb516b4abedb6c551949cc185 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 2 Jan 2023 13:12:15 +0100 Subject: net: sparx5: Fix reading of the MAC address There is an issue with the checking of the return value of 'of_get_mac_address', which returns 0 on success and negative value on failure. The driver interpretated the result the opposite way. Therefore if there was a MAC address defined in the DT, then the driver was generating a random MAC address otherwise it would use address 0. Fix this by checking correctly the return value of 'of_get_mac_address' Fixes: b74ef9f9cb91 ("net: sparx5: Do not use mac_addr uninitialized in mchp_sparx5_probe()") Signed-off-by: Horatiu Vultur Signed-off-by: David S. 
Miller --- drivers/net/ethernet/microchip/sparx5/sparx5_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.c b/drivers/net/ethernet/microchip/sparx5/sparx5_main.c index d25f4f09faa0..3c5d4fe99373 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.c @@ -834,7 +834,7 @@ static int mchp_sparx5_probe(struct platform_device *pdev) if (err) goto cleanup_config; - if (!of_get_mac_address(np, sparx5->base_mac)) { + if (of_get_mac_address(np, sparx5->base_mac)) { dev_info(sparx5->dev, "MAC addr was not set, use random MAC\n"); eth_random_addr(sparx5->base_mac); sparx5->base_mac[5] = 0; -- cgit From a31d47be64b9b74f8cfedffe03e0a8a1f9e51f23 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 30 Dec 2022 13:24:37 +0100 Subject: netfilter: ipset: fix hash:net,port,net hang with /0 subnet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hash:net,port,net set type supports /0 subnets. However, the patch commit 5f7b51bf09baca8e titled "netfilter: ipset: Limit the maximal range of consecutive elements to add/delete" did not take into account it and resulted in an endless loop. The bug is actually older but the patch 5f7b51bf09baca8e brings it out earlier. Handle /0 subnets properly in hash:net,port,net set types. Fixes: 5f7b51bf09ba ("netfilter: ipset: Limit the maximal range of consecutive elements to add/delete") Reported-by: Марк Коренберг Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_netportnet.c | 40 +++++++++++++++------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 19bcdb3141f6..005a7ce87217 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -173,17 +173,26 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } +static u32 +hash_netportnet4_range_to_cidr(u32 from, u32 to, u8 *cidr) +{ + if (from == 0 && to == UINT_MAX) { + *cidr = 0; + return to; + } + return ip_set_range_to_cidr(from, to, cidr); +} + static int hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netportnet4 *h = set->data; + struct hash_netportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2, ipn; - u64 n = 0, m = 0; + u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; int ret; @@ -285,19 +294,6 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); - n++; - } while (ipn++ < ip_to); - ipn = ip2_from; - do { - ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); - m++; - } while (ipn++ < ip2_to); - - if (n*m*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); @@ -310,13 +306,19 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], do { e.ip[0] = htonl(ip); - ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); + ip = hash_netportnet4_range_to_cidr(ip, 
ip_to, &e.cidr[0]); for (; p <= port_to; p++) { e.port = htons(p); do { + i++; e.ip[1] = htonl(ip2); - ip2 = ip_set_range_to_cidr(ip2, ip2_to, - &e.cidr[1]); + if (i > IPSET_MAX_RANGE) { + hash_netportnet4_data_next(&h->next, + &e); + return -ERANGE; + } + ip2 = hash_netportnet4_range_to_cidr(ip2, + ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; -- cgit From 5e29dc36bd5e2166b834ceb19990d9e68a734d7d Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 30 Dec 2022 13:24:38 +0100 Subject: netfilter: ipset: Rework long task execution when adding/deleting entries When adding/deleting large number of elements in one step in ipset, it can take a reasonable amount of time and can result in soft lockup errors. The patch 5f7b51bf09ba ("netfilter: ipset: Limit the maximal range of consecutive elements to add/delete") tried to fix it by limiting the max elements to process at all. However it was not enough, it is still possible that we get hung tasks. Lowering the limit is not reasonable, so the approach in this patch is as follows: rely on the method used at resizing sets and save the state when we reach a smaller internal batch limit, unlock/lock and proceed from the saved state. Thus we can avoid long continuous tasks and at the same time removed the limit to add/delete large number of elements in one step. The nfnl mutex is held during the whole operation which prevents one to issue other ipset commands in parallel. Fixes: 5f7b51bf09ba ("netfilter: ipset: Limit the maximal range of consecutive elements to add/delete") Reported-by: syzbot+9204e7399656300bf271@syzkaller.appspotmail.com Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 2 +- net/netfilter/ipset/ip_set_core.c | 7 ++++--- net/netfilter/ipset/ip_set_hash_ip.c | 14 +++++++------- net/netfilter/ipset/ip_set_hash_ipmark.c | 13 +++++++------ net/netfilter/ipset/ip_set_hash_ipport.c | 13 +++++++------ net/netfilter/ipset/ip_set_hash_ipportip.c | 13 +++++++------ net/netfilter/ipset/ip_set_hash_ipportnet.c | 13 ++++++++----- net/netfilter/ipset/ip_set_hash_net.c | 17 +++++++---------- net/netfilter/ipset/ip_set_hash_netiface.c | 15 ++++++--------- net/netfilter/ipset/ip_set_hash_netnet.c | 23 +++++++---------------- net/netfilter/ipset/ip_set_hash_netport.c | 19 +++++++------------ 11 files changed, 68 insertions(+), 81 deletions(-) diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ab934ad951a8..e8c350a3ade1 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -197,7 +197,7 @@ struct ip_set_region { }; /* Max range where every element is added/deleted in one step */ -#define IPSET_MAX_RANGE (1<<20) +#define IPSET_MAX_RANGE (1<<14) /* The max revision number supported by any set type + 1 */ #define IPSET_REVISION_MAX 9 diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e7ba5b6dd2b7..46ebee9400da 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1698,9 +1698,10 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); ip_set_unlock(set); retried = true; - } while (ret == -EAGAIN && - set->variant->resize && - (ret = set->variant->resize(set, retried)) == 0); + } while (ret == -ERANGE || + (ret == -EAGAIN && + set->variant->resize && + (ret = set->variant->resize(set, 
retried)) == 0)); if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) return 0; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index e30513cefd90..c9f4e3859663 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -100,11 +100,11 @@ static int hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ip4 *h = set->data; + struct hash_ip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, hosts; + u32 ip = 0, ip_to = 0, hosts, i = 0; int ret = 0; if (tb[IPSET_ATTR_LINENO]) @@ -149,14 +149,14 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); - /* 64bit division is not allowed on 32bit */ - if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to;) { + for (; ip <= ip_to; i++) { e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_ip4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 153de3457423..a22ec1a6f6ec 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -97,11 +97,11 @@ static int hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipmark4 *h = set->data; + struct hash_ipmark4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0; + u32 ip, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -148,13 +148,14 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } - if (((u64)ip_to - ip + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to; ip++) { + for (; ip <= ip_to; ip++, i++) { e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_ipmark4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 2ffbd0b78a8c..e977b5a9c48d 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -112,11 +112,11 @@ static int hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipport4 *h = set->data; + struct hash_ipport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; @@ -184,17 +184,18 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? 
ntohs(h->next.port) : port; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_ipport4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 334fb1ad0e86..39a01934b153 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -108,11 +108,11 @@ static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipportip4 *h = set->data; + struct hash_ipportip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip, ip_to = 0, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; @@ -180,17 +180,18 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_ipportip4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7df94f437f60..5c6de605a9fb 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -160,12 +160,12 @@ static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_ipportnet4 *h = set->data; + struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2; + u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; u8 cidr; int ret; @@ -253,9 +253,6 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } - if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; - ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); @@ -282,9 +279,15 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], for (; p <= port_to; p++) { e.port = htons(p); do { + i++; e.ip2 = htonl(ip2); ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr); e.cidr = cidr - 1; + if (i > IPSET_MAX_RANGE) { + hash_ipportnet4_data_next(&h->next, + &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 1422739d9aa2..ce0a9ce5a91f 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -136,11 +136,11 @@ static int hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_net4 *h 
= set->data; + struct hash_net4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, ipn, n = 0; + u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -188,19 +188,16 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); - n++; - } while (ipn++ < ip_to); - - if (n > IPSET_MAX_RANGE) - return -ERANGE; if (retried) ip = ntohl(h->next.ip); do { + i++; e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_net4_data_next(&h->next, &e); + return -ERANGE; + } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 9810f5bf63f5..031073286236 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -202,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0, ipn, n = 0; + u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -256,19 +256,16 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); - n++; - } while (ipn++ < ip_to); - - if (n > IPSET_MAX_RANGE) - return -ERANGE; if (retried) ip = ntohl(h->next.ip); do { + i++; e.ip = htonl(ip); + if (i > IPSET_MAX_RANGE) { + hash_netiface4_data_next(&h->next, &e); + return -ERANGE; + } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index cdfb78c6e0d3..8fbe649c9dd3 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -166,13 +166,12 @@ static int hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netnet4 *h = set->data; + struct hash_netnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0; - u32 ip2 = 0, ip2_from = 0, ip2_to = 0, ipn; - u64 n = 0, m = 0; + u32 ip2 = 0, ip2_from = 0, ip2_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -248,19 +247,6 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); - n++; - } while (ipn++ < ip_to); - ipn = ip2_from; - do { - ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); - m++; - } while (ipn++ < ip2_to); - - if (n*m > IPSET_MAX_RANGE) - return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); @@ -273,7 +259,12 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.ip[0] = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); do { + i++; e.ip[1] = htonl(ip2); + if (i > IPSET_MAX_RANGE) { + hash_netnet4_data_next(&h->next, &e); + return -ERANGE; + } ip2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, 
&ext, flags); if (ret && !ip_set_eexist(ret, flags)) diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 09cf72eb37f8..d1a0628df4ef 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -154,12 +154,11 @@ static int hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - const struct hash_netport4 *h = set->data; + struct hash_netport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 port, port_to, p = 0, ip = 0, ip_to = 0, ipn; - u64 n = 0; + u32 port, port_to, p = 0, ip = 0, ip_to = 0, i = 0; bool with_ports = false; u8 cidr; int ret; @@ -236,14 +235,6 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); } - ipn = ip; - do { - ipn = ip_set_range_to_cidr(ipn, ip_to, &cidr); - n++; - } while (ipn++ < ip_to); - - if (n*(port_to - port + 1) > IPSET_MAX_RANGE) - return -ERANGE; if (retried) { ip = ntohl(h->next.ip); @@ -255,8 +246,12 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], e.ip = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; - for (; p <= port_to; p++) { + for (; p <= port_to; p++, i++) { e.port = htons(p); + if (i > IPSET_MAX_RANGE) { + hash_netport4_data_next(&h->next, &e); + return -ERANGE; + } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; -- cgit From a764da46cd15f8b40292d2c0b29c4bf9a3e66c7e Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 9 Nov 2022 17:19:05 +0800 Subject: drm/virtio: Fix memory leak in virtio_gpu_object_create() The virtio_gpu_object_shmem_init() will alloc memory and save it in @ents, so when virtio_gpu_array_alloc() fails, this memory should be freed, this patch fixes it. 
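The fix below uses the kernel's standard goto-based unwind pattern: each acquired resource gets a cleanup label, and a failure jumps to the label that releases everything acquired so far. A minimal sketch of that pattern, with made-up function name and sizes (kmalloc()/kfree()/GFP_KERNEL are the only real APIs used here):

#include <linux/slab.h>

/* Illustrative sketch only; not code from the driver. */
static int create_thing_sketch(void)
{
	void *ents, *objs;

	ents = kmalloc(128, GFP_KERNEL);	/* first allocation */
	if (!ents)
		return -ENOMEM;

	objs = kmalloc(256, GFP_KERNEL);	/* second allocation may fail */
	if (!objs)
		goto err_free_ents;		/* must not leak 'ents' */

	/* in real code, hand 'ents' and 'objs' off to the caller here */
	return 0;

err_free_ents:
	kfree(ents);
	return -ENOMEM;
}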
Fixes: e7fef0923303 ("drm/virtio: Simplify error handling of virtio_gpu_object_create()") Signed-off-by: Xiu Jianfeng Reviewed-by: Dmitry Osipenko Signed-off-by: Dmitry Osipenko Link: https://patchwork.freedesktop.org/patch/msgid/20221109091905.55451-1-xiujianfeng@huawei.com --- drivers/gpu/drm/virtio/virtgpu_object.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/virtio/virtgpu_object.c b/drivers/gpu/drm/virtio/virtgpu_object.c index 8d7728181de0..c7e74cf13022 100644 --- a/drivers/gpu/drm/virtio/virtgpu_object.c +++ b/drivers/gpu/drm/virtio/virtgpu_object.c @@ -184,7 +184,7 @@ int virtio_gpu_object_create(struct virtio_gpu_device *vgdev, struct virtio_gpu_object_array *objs = NULL; struct drm_gem_shmem_object *shmem_obj; struct virtio_gpu_object *bo; - struct virtio_gpu_mem_entry *ents; + struct virtio_gpu_mem_entry *ents = NULL; unsigned int nents; int ret; @@ -210,7 +210,7 @@ int virtio_gpu_object_create(struct virtio_gpu_device *vgdev, ret = -ENOMEM; objs = virtio_gpu_array_alloc(1); if (!objs) - goto err_put_id; + goto err_free_entry; virtio_gpu_array_add_obj(objs, &bo->base.base); ret = virtio_gpu_array_lock_resv(objs); @@ -239,6 +239,8 @@ int virtio_gpu_object_create(struct virtio_gpu_device *vgdev, err_put_objs: virtio_gpu_array_put_free(objs); +err_free_entry: + kvfree(ents); err_put_id: virtio_gpu_resource_id_put(vgdev, bo->hw_res_handle); err_free_gem: -- cgit From 03dec92c4f788c54a7c01b40a018f601eb8a6c52 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 23 Nov 2022 03:13:03 +0300 Subject: drm/scheduler: Fix lockup in drm_sched_entity_kill() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_sched_entity_kill() is invoked twice by drm_sched_entity_destroy() while the userspace process is exiting or being killed: the first time when the sched entity is flushed, and a second time when the entity is released. This causes a lockup within wait_for_completion(entity_idle) due to how the completion API works. Calling wait_for_completion() more times than complete() was invoked is an error condition that causes a lockup, because the completion code internally uses a counter for complete/wait calls. complete_all() must be used instead in such cases. This patch fixes a lockup of the Panfrost driver that is reproducible by killing any application in the middle of a 3D drawing operation. Fixes: 2fdb8a8f07c2 ("drm/scheduler: rework entity flush, kill and fini") Signed-off-by: Dmitry Osipenko Reviewed-by: Christian König Tested-by: Guilherme G. Piccoli # Steam Deck Link: https://patchwork.freedesktop.org/patch/msgid/20221123001303.533968-1-dmitry.osipenko@collabora.com --- drivers/gpu/drm/scheduler/sched_entity.c | 2 +- drivers/gpu/drm/scheduler/sched_main.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index fe09e5be79bd..15d04a0ec623 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -81,7 +81,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity, init_completion(&entity->entity_idle); /* We start in an idle state.
*/ - complete(&entity->entity_idle); + complete_all(&entity->entity_idle); spin_lock_init(&entity->rq_lock); spsc_queue_init(&entity->job_queue); diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 31f3a1267be4..fd22d753b4ed 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -987,7 +987,7 @@ static int drm_sched_main(void *param) sched_job = drm_sched_entity_pop_job(entity); if (!sched_job) { - complete(&entity->entity_idle); + complete_all(&entity->entity_idle); continue; } @@ -998,7 +998,7 @@ static int drm_sched_main(void *param) trace_drm_run_job(sched_job, entity); fence = sched->ops->run_job(sched_job); - complete(&entity->entity_idle); + complete_all(&entity->entity_idle); drm_sched_fence_scheduled(s_fence); if (!IS_ERR_OR_NULL(fence)) { -- cgit From 0a6564ebd953c4590663c9a3c99a3ea9920ade6f Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 29 Dec 2022 13:09:00 +0400 Subject: perf tools: Fix resources leak in perf_data__open_dir() In perf_data__open_dir(), opendir() opens the directory stream. Add missing closedir() to release it after use. Fixes: eb6176709b235b96 ("perf data: Add perf_data__open_dir_data function") Reviewed-by: Adrian Hunter Signed-off-by: Miaoqian Lin Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20221229090903.1402395-1-linmq006@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index a7f68c309545..fc16299c915f 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -132,6 +132,7 @@ int perf_data__open_dir(struct perf_data *data) file->size = st.st_size; } + closedir(dir); if (!files) return -EINVAL; @@ -140,6 +141,7 @@ int perf_data__open_dir(struct perf_data *data) return 0; out_err: + closedir(dir); close_dir(files, nr); return ret; } -- cgit From f685dd7a8025f2554f73748cfdb8143a21fb92c7 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Mon, 2 Jan 2023 14:57:30 +0100 Subject: fbdev: matroxfb: G200eW: Increase max memory from 1 MB to 16 MB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 62d89a7d49af ("video: fbdev: matroxfb: set maxvram of vbG200eW to the same as vbG200 to avoid black screen") accidently decreases the maximum memory size for the Matrox G200eW (102b:0532) from 8 MB to 1 MB by missing one zero. This caused the driver initialization to fail with the messages below, as the minimum required VRAM size is 2 MB: [ 9.436420] matroxfb: Matrox MGA-G200eW (PCI) detected [ 9.444502] matroxfb: cannot determine memory size [ 9.449316] matroxfb: probe of 0000:0a:03.0 failed with error -1 So, add the missing 0 to make it the intended 16 MB. Successfully tested on the Dell PowerEdge R910/0KYD3D, BIOS 2.10.0 08/29/2013, that the warning is gone. While at it, add a leading 0 to the maxdisplayable entry, so it’s aligned properly. The value could probably also be increased from 8 MB to 16 MB, as the G200 uses the same values, but I have not checked any datasheet. Note, matroxfb is obsolete and superseded by the maintained DRM driver mga200, which is used by default on most systems where both drivers are available. Therefore, on most systems it was only a cosmetic issue. 
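For reference, the arithmetic behind the two constants: 0x100000 = 2^20 bytes = 1 MB, whereas the intended 0x1000000 = 2^24 bytes = 16 MB. Since the minimum required VRAM size is 2 MB, the truncated value made the memory-size detection above fail.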
Fixes: 62d89a7d49af ("video: fbdev: matroxfb: set maxvram of vbG200eW to the same as vbG200 to avoid black screen") Link: https://lore.kernel.org/linux-fbdev/972999d3-b75d-5680-fcef-6e6905c52ac5@suse.de/T/#mb6953a9995ebd18acc8552f99d6db39787aec775 Cc: it+linux-fbdev@molgen.mpg.de Cc: Z. Liu Cc: Rich Felker Cc: stable@vger.kernel.org Signed-off-by: Paul Menzel Signed-off-by: Helge Deller --- drivers/video/fbdev/matrox/matroxfb_base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/fbdev/matrox/matroxfb_base.c b/drivers/video/fbdev/matrox/matroxfb_base.c index 0d3cee7ae726..a043a737ea9f 100644 --- a/drivers/video/fbdev/matrox/matroxfb_base.c +++ b/drivers/video/fbdev/matrox/matroxfb_base.c @@ -1378,8 +1378,8 @@ static struct video_board vbG200 = { .lowlevel = &matrox_G100 }; static struct video_board vbG200eW = { - .maxvram = 0x100000, - .maxdisplayable = 0x800000, + .maxvram = 0x1000000, + .maxdisplayable = 0x0800000, .accelID = FB_ACCEL_MATROX_MGAG200, .lowlevel = &matrox_G100 }; -- cgit From a8f54d940196c8bd9aced9c82557fdc63baefb02 Mon Sep 17 00:00:00 2001 From: Eric Lin Date: Sat, 31 Dec 2022 05:27:31 +0000 Subject: perf tools riscv: Fix build error on riscv due to missing header for 'struct perf_sample' Since the definition of 'struct perf_sample' has been moved to sample.h, we need to include this header file to fix the build error as follows: arch/riscv/util/unwind-libdw.c: In function 'libdw__arch_set_initial_registers': arch/riscv/util/unwind-libdw.c:12:50: error: invalid use of undefined type 'struct perf_sample' 12 | struct regs_dump *user_regs = &ui->sample->user_regs; | ^~ Fixes: 9823147da6c893d9 ("perf tools: Move 'struct perf_sample' to a separate header file to disentangle headers") Signed-off-by: Eric Lin Cc: Alexander Shishkin Cc: greentime.hu@sifive.com Cc: Jiri Olsa Cc: linux-riscv@lists.infradead.org Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Vincent Chen Link: https://lore.kernel.org/r/20221231052731.24908-1-eric.lin@sifive.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/riscv/util/unwind-libdw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/arch/riscv/util/unwind-libdw.c b/tools/perf/arch/riscv/util/unwind-libdw.c index 19536e172850..54a198714eb8 100644 --- a/tools/perf/arch/riscv/util/unwind-libdw.c +++ b/tools/perf/arch/riscv/util/unwind-libdw.c @@ -4,7 +4,7 @@ #include #include "../../util/unwind-libdw.h" #include "../../util/perf_regs.h" -#include "../../util/event.h" +#include "../../util/sample.h" bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg) { -- cgit From 55c41f2e4f7e81e48f3ecc9fba1e316e770213f2 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 26 Dec 2022 08:57:03 +0000 Subject: perf help: Use HAVE_LIBTRACEEVENT to filter out unsupported commands Commands such as kmem, kwork, lock, sched, trace and timechart depend on libtraceevent, these commands need to be isolated using HAVE_LIBTRACEEVENT macro when cmdlist generation. 
The output of the generate-cmdlist.sh script is as follows: # ./util/generate-cmdlist.sh /* Automatically generated by ./util/generate-cmdlist.sh */ struct cmdname_help { char name[16]; char help[80]; }; static struct cmdname_help common_cmds[] = { {"annotate", "Read perf.data (created by perf record) and display annotated code"}, {"archive", "Create archive with object files with build-ids found in perf.data file"}, {"bench", "General framework for benchmark suites"}, {"buildid-cache", "Manage build-id cache."}, {"buildid-list", "List the buildids in a perf.data file"}, {"c2c", "Shared Data C2C/HITM Analyzer."}, {"config", "Get and set variables in a configuration file."}, {"daemon", "Run record sessions on background"}, {"data", "Data file related processing"}, {"diff", "Read perf.data files and display the differential profile"}, {"evlist", "List the event names in a perf.data file"}, {"ftrace", "simple wrapper for kernel's ftrace functionality"}, {"inject", "Filter to augment the events stream with additional information"}, {"iostat", "Show I/O performance metrics"}, {"kallsyms", "Searches running kernel for symbols"}, {"kvm", "Tool to trace/measure kvm guest os"}, {"list", "List all symbolic event types"}, {"mem", "Profile memory accesses"}, {"record", "Run a command and record its profile into perf.data"}, {"report", "Read perf.data (created by perf record) and display the profile"}, {"script", "Read perf.data (created by perf record) and display trace output"}, {"stat", "Run a command and gather performance counter statistics"}, {"test", "Runs sanity tests."}, {"top", "System profiling tool."}, {"version", "display the version of perf binary"}, #ifdef HAVE_LIBELF_SUPPORT {"probe", "Define new dynamic tracepoints"}, #endif /* HAVE_LIBELF_SUPPORT */ #if defined(HAVE_LIBTRACEEVENT) && (defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE_SUPPORT)) {"trace", "strace inspired tool"}, #endif /* HAVE_LIBTRACEEVENT && (HAVE_LIBAUDIT_SUPPORT || HAVE_SYSCALL_TABLE_SUPPORT) */ #ifdef HAVE_LIBTRACEEVENT {"kmem", "Tool to trace/measure kernel memory properties"}, {"kwork", "Tool to trace/measure kernel work properties (latencies)"}, {"lock", "Analyze lock events"}, {"sched", "Tool to trace/measure scheduler properties (latencies)"}, {"timechart", "Tool to visualize total system behavior during a workload"}, #endif /* HAVE_LIBTRACEEVENT */ }; Fixes: 378ef0f5d9d7f465 ("perf build: Use libtraceevent from the system") Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20221226085703.95081-1-yangjihong1@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/command-list.txt | 10 +++++----- tools/perf/util/generate-cmdlist.sh | 19 +++++++++++++++++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt index 8fcab5ad00c5..e8d2762adade 100644 --- a/tools/perf/command-list.txt +++ b/tools/perf/command-list.txt @@ -16,20 +16,20 @@ perf-ftrace mainporcelain common perf-inject mainporcelain common perf-iostat mainporcelain common perf-kallsyms mainporcelain common -perf-kmem mainporcelain common +perf-kmem mainporcelain traceevent perf-kvm mainporcelain common -perf-kwork mainporcelain common +perf-kwork mainporcelain traceevent perf-list mainporcelain common -perf-lock mainporcelain common +perf-lock mainporcelain traceevent perf-mem mainporcelain common perf-probe mainporcelain full 
perf-record mainporcelain common perf-report mainporcelain common -perf-sched mainporcelain common +perf-sched mainporcelain traceevent perf-script mainporcelain common perf-stat mainporcelain common perf-test mainporcelain common -perf-timechart mainporcelain common +perf-timechart mainporcelain traceevent perf-top mainporcelain common perf-trace mainporcelain audit perf-version mainporcelain common diff --git a/tools/perf/util/generate-cmdlist.sh b/tools/perf/util/generate-cmdlist.sh index c3cef36d4176..1b5140e5ce99 100755 --- a/tools/perf/util/generate-cmdlist.sh +++ b/tools/perf/util/generate-cmdlist.sh @@ -38,7 +38,7 @@ do done echo "#endif /* HAVE_LIBELF_SUPPORT */" -echo "#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE_SUPPORT)" +echo "#if defined(HAVE_LIBTRACEEVENT) && (defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE_SUPPORT))" sed -n -e 's/^perf-\([^ ]*\)[ ].* audit*/\1/p' command-list.txt | sort | while read cmd @@ -51,5 +51,20 @@ do p }' "Documentation/perf-$cmd.txt" done -echo "#endif /* HAVE_LIBELF_SUPPORT */" +echo "#endif /* HAVE_LIBTRACEEVENT && (HAVE_LIBAUDIT_SUPPORT || HAVE_SYSCALL_TABLE_SUPPORT) */" + +echo "#ifdef HAVE_LIBTRACEEVENT" +sed -n -e 's/^perf-\([^ ]*\)[ ].* traceevent.*/\1/p' command-list.txt | +sort | +while read cmd +do + sed -n ' + /^NAME/,/perf-'"$cmd"'/H + ${ + x + s/.*perf-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/ + p + }' "Documentation/perf-$cmd.txt" +done +echo "#endif /* HAVE_LIBTRACEEVENT */" echo "};" -- cgit From 92d43bd3bc9728c1fb114d7011d46f5ea9489e28 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 8 Nov 2022 15:14:20 +0100 Subject: drm/imx: ipuv3-plane: Fix overlay plane width ipu_src_rect_width() was introduced to support odd screen resolutions such as 1366x768 by internally rounding up primary plane width to a multiple of 8 and compensating with reduced horizontal blanking. This also caused overlay plane width to be rounded up, which was not intended. Fix overlay plane width by limiting the rounding up to the primary plane. drm_rect_width(&new_state->src) >> 16 is the same value as drm_rect_width(dst) because there is no plane scaling support. 
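As background for the last sentence above: DRM stores plane source rectangles in 16.16 fixed-point, so shifting the source width right by 16 yields whole pixels. A small illustrative example (values made up):

	/* 1280 pixels stored as 16.16 fixed point: 1280 << 16 == 0x05000000 */
	u32 src_w_fixed = 1280 << 16;
	u32 width_px = src_w_fixed >> 16;	/* back to 1280 whole pixels */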
Fixes: 94dfec48fca7 ("drm/imx: Add 8 pixel alignment fix") Reviewed-by: Lucas Stach Link: https://lore.kernel.org/r/20221108141420.176696-1-p.zabel@pengutronix.de Signed-off-by: Philipp Zabel Link: https://patchwork.freedesktop.org/patch/msgid/20221108141420.176696-1-p.zabel@pengutronix.de Tested-by: Ian Ray (cherry picked from commit 4333472f8d7befe62359fecb1083cd57a6e07bfc) Signed-off-by: Philipp Zabel --- drivers/gpu/drm/imx/ipuv3-plane.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/imx/ipuv3-plane.c b/drivers/gpu/drm/imx/ipuv3-plane.c index dba4f7d81d69..80142d9a4a55 100644 --- a/drivers/gpu/drm/imx/ipuv3-plane.c +++ b/drivers/gpu/drm/imx/ipuv3-plane.c @@ -614,6 +614,11 @@ static void ipu_plane_atomic_update(struct drm_plane *plane, break; } + if (ipu_plane->dp_flow == IPU_DP_FLOW_SYNC_BG) + width = ipu_src_rect_width(new_state); + else + width = drm_rect_width(&new_state->src) >> 16; + eba = drm_plane_state_to_eba(new_state, 0); /* @@ -622,8 +627,7 @@ static void ipu_plane_atomic_update(struct drm_plane *plane, */ if (ipu_state->use_pre) { axi_id = ipu_chan_assign_axi_id(ipu_plane->dma); - ipu_prg_channel_configure(ipu_plane->ipu_ch, axi_id, - ipu_src_rect_width(new_state), + ipu_prg_channel_configure(ipu_plane->ipu_ch, axi_id, width, drm_rect_height(&new_state->src) >> 16, fb->pitches[0], fb->format->format, fb->modifier, &eba); @@ -678,9 +682,8 @@ static void ipu_plane_atomic_update(struct drm_plane *plane, break; } - ipu_dmfc_config_wait4eot(ipu_plane->dmfc, ALIGN(drm_rect_width(dst), 8)); + ipu_dmfc_config_wait4eot(ipu_plane->dmfc, width); - width = ipu_src_rect_width(new_state); height = drm_rect_height(&new_state->src) >> 16; info = drm_format_info(fb->format->format); ipu_calculate_bursts(width, info->cpp[0], fb->pitches[0], @@ -744,8 +747,7 @@ static void ipu_plane_atomic_update(struct drm_plane *plane, ipu_cpmem_set_burstsize(ipu_plane->ipu_ch, 16); ipu_cpmem_zero(ipu_plane->alpha_ch); - ipu_cpmem_set_resolution(ipu_plane->alpha_ch, - ipu_src_rect_width(new_state), + ipu_cpmem_set_resolution(ipu_plane->alpha_ch, width, drm_rect_height(&new_state->src) >> 16); ipu_cpmem_set_format_passthrough(ipu_plane->alpha_ch, 8); ipu_cpmem_set_high_priority(ipu_plane->alpha_ch); -- cgit From f24fb53984cfba42ff72a47466eabfd772da647b Mon Sep 17 00:00:00 2001 From: Ahelenia Ziemiańska Date: Tue, 27 Dec 2022 21:58:00 +0100 Subject: perf tools: Don't include signature in version strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This explodes the build if HEAD is signed, since the generated version is gpg: Signature made Mon 26 Dec 2022 20:34:48 CET, then a few more lines, then the SHA. 
Signed-off-by: Ahelenia Ziemiańska Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/7c9637711271f50ec2341fb8a7c29585335dab04.1672174189.git.nabijaczleweli@nabijaczleweli.xyz Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/Makefile | 2 +- tools/perf/util/PERF-VERSION-GEN | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index 6e7b88917ca0..ba5d942e4c6a 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -267,7 +267,7 @@ $(OUTPUT)%.xml : %.txt $(ASCIIDOC) -b docbook -d manpage \ $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) \ -aperf_date=$(shell git log -1 --pretty="format:%cd" \ - --date=short $<) \ + --date=short --no-show-signature $<) \ -o $@+ $< && \ mv $@+ $@ diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN index 3cc42821d9b3..d7dc7c28508c 100755 --- a/tools/perf/util/PERF-VERSION-GEN +++ b/tools/perf/util/PERF-VERSION-GEN @@ -19,7 +19,7 @@ TAG= if test -d ../../.git -o -f ../../.git then TAG=$(MAKEFLAGS= make -sC ../.. kernelversion) - CID=$(git log -1 --abbrev=12 --pretty=format:"%h" 2>/dev/null) && CID="-g$CID" + CID=$(git log -1 --abbrev=12 --pretty=format:"%h" --no-show-signature 2>/dev/null) && CID="-g$CID" elif test -f ../../PERF-VERSION-FILE then TAG=$(cut -d' ' -f3 ../../PERF-VERSION-FILE | sed -e 's/\"//g') -- cgit From cad853374d85fe678d721512cecfabd7636e51f3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 13 Dec 2022 13:08:26 -0500 Subject: nfsd: fix handling of readdir in v4root vs. mount upcall timeout If v4 READDIR operation hits a mountpoint and gets back an error, then it will include that entry in the reply and set RDATTR_ERROR for it to the error. That's fine for "normal" exported filesystems, but on the v4root, we need to be more careful to only expose the existence of dentries that lead to exports. If the mountd upcall times out while checking to see whether a mountpoint on the v4root is exported, then we have no recourse other than to fail the whole operation. Cc: Steve Dickson Link: https://bugzilla.kernel.org/show_bug.cgi?id=216777 Reported-by: JianHong Yin Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever Cc: --- fs/nfsd/nfs4xdr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2b4ae858c89b..ebb4d02a42ce 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3629,6 +3629,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, case nfserr_noent: xdr_truncate_encode(xdr, start_offset); goto skip_entry; + case nfserr_jukebox: + /* + * The pseudoroot should only display dentries that lead to + * exports. If we get EJUKEBOX here, then we can't tell whether + * this entry should be included. Just fail the whole READDIR + * with NFS4ERR_DELAY in that case, and hope that the situation + * will resolve itself by the client's next attempt. 
+ */ + if (cd->rd_fhp->fh_export->ex_flags & NFSEXP_V4ROOT) + goto fail; + fallthrough; default: /* * If the client requested the RDATTR_ERROR attribute, -- cgit From 77fe30fed1c0cd282eadb0728999ea3d3350e0b0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 2 Jan 2023 13:06:45 -0300 Subject: perf tools: Fix segfault when trying to process tracepoints in perf.data and not linked with libtraceevent When we have a perf.data file with tracepoints, such as: # perf evlist -f probe_perf:lzma_decompress_to_file # Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events # We end up segfaulting when using perf built with NO_LIBTRACEEVENT=1 by trying to find an evsel with a NULL 'event_name' variable: (gdb) run report --stdio -f Starting program: /root/bin/perf report --stdio -f Program received signal SIGSEGV, Segmentation fault. 0x000000000055219d in find_evsel (evlist=0xfda7b0, event_name=0x0) at util/sort.c:2830 warning: Source file is more recent than executable. 2830 if (event_name[0] == '%') { Missing separate debuginfos, use: dnf debuginfo-install bzip2-libs-1.0.8-11.fc36.x86_64 cyrus-sasl-lib-2.1.27-18.fc36.x86_64 elfutils-debuginfod-client-0.188-3.fc36.x86_64 elfutils-libelf-0.188-3.fc36.x86_64 elfutils-libs-0.188-3.fc36.x86_64 glibc-2.35-20.fc36.x86_64 keyutils-libs-1.6.1-4.fc36.x86_64 krb5-libs-1.19.2-12.fc36.x86_64 libbrotli-1.0.9-7.fc36.x86_64 libcap-2.48-4.fc36.x86_64 libcom_err-1.46.5-2.fc36.x86_64 libcurl-7.82.0-12.fc36.x86_64 libevent-2.1.12-6.fc36.x86_64 libgcc-12.2.1-4.fc36.x86_64 libidn2-2.3.4-1.fc36.x86_64 libnghttp2-1.51.0-1.fc36.x86_64 libpsl-0.21.1-5.fc36.x86_64 libselinux-3.3-4.fc36.x86_64 libssh-0.9.6-4.fc36.x86_64 libstdc++-12.2.1-4.fc36.x86_64 libunistring-1.0-1.fc36.x86_64 libunwind-1.6.2-2.fc36.x86_64 libxcrypt-4.4.33-4.fc36.x86_64 libzstd-1.5.2-2.fc36.x86_64 numactl-libs-2.0.14-5.fc36.x86_64 opencsd-1.2.0-1.fc36.x86_64 openldap-2.6.3-1.fc36.x86_64 openssl-libs-3.0.5-2.fc36.x86_64 slang-2.3.2-11.fc36.x86_64 xz-libs-5.2.5-9.fc36.x86_64 zlib-1.2.11-33.fc36.x86_64 (gdb) bt #0 0x000000000055219d in find_evsel (evlist=0xfda7b0, event_name=0x0) at util/sort.c:2830 #1 0x0000000000552416 in add_dynamic_entry (evlist=0xfda7b0, tok=0xffb6eb "trace", level=2) at util/sort.c:2976 #2 0x0000000000552d26 in sort_dimension__add (list=0xf93e00 , tok=0xffb6eb "trace", evlist=0xfda7b0, level=2) at util/sort.c:3193 #3 0x0000000000552e1c in setup_sort_list (list=0xf93e00 , str=0xffb6eb "trace", evlist=0xfda7b0) at util/sort.c:3227 #4 0x00000000005532fa in __setup_sorting (evlist=0xfda7b0) at util/sort.c:3381 #5 0x0000000000553cdc in setup_sorting (evlist=0xfda7b0) at util/sort.c:3608 #6 0x000000000042eb9f in cmd_report (argc=0, argv=0x7fffffffe470) at builtin-report.c:1596 #7 0x00000000004aee7e in run_builtin (p=0xf64ca0 , argc=3, argv=0x7fffffffe470) at perf.c:330 #8 0x00000000004af0f2 in handle_internal_command (argc=3, argv=0x7fffffffe470) at perf.c:384 #9 0x00000000004af241 in run_argv (argcp=0x7fffffffe29c, argv=0x7fffffffe290) at perf.c:428 #10 0x00000000004af5fc in main (argc=3, argv=0x7fffffffe470) at perf.c:562 (gdb) So check if we have tracepoint events in add_dynamic_entry() and bail out instead: # perf report --stdio -f This perf binary isn't linked with libtraceevent, can't process probe_perf:lzma_decompress_to_file Error: Unknown --sort key: `trace' # Fixes: 378ef0f5d9d7f465 ("perf build: Use libtraceevent from the system") Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: 
http://lore.kernel.org/lkml/Y7MDb7kRaHZB6APC@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index e188f74698dd..37662cdec5ee 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2971,6 +2971,18 @@ static int add_dynamic_entry(struct evlist *evlist, const char *tok, ret = add_all_matching_fields(evlist, field_name, raw_trace, level); goto out; } +#else + evlist__for_each_entry(evlist, evsel) { + if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT) { + pr_err("%s %s", ret ? "," : "This perf binary isn't linked with libtraceevent, can't process", evsel__name(evsel)); + ret = -ENOTSUP; + } + } + + if (ret) { + pr_err("\n"); + goto out; + } #endif evsel = find_evsel(evlist, event_name); -- cgit From d00dd2f2645dca04cf399d8fc692f3f69b6dd996 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 22 Nov 2022 12:51:22 +0100 Subject: x86/kexec: Fix double-free of elf header buffer After b3e34a47f989 ("x86/kexec: fix memory leak of elf header buffer"), freeing image->elf_headers in the error path of crash_load_segments() is not needed because kimage_file_post_load_cleanup() will take care of that later. And not clearing it could result in a double-free. Drop the superfluous vfree() call at the error path of crash_load_segments(). Fixes: b3e34a47f989 ("x86/kexec: fix memory leak of elf header buffer") Signed-off-by: Takashi Iwai Signed-off-by: Borislav Petkov (AMD) Acked-by: Baoquan He Acked-by: Vlastimil Babka Cc: Link: https://lore.kernel.org/r/20221122115122.13937-1-tiwai@suse.de --- arch/x86/kernel/crash.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 9730c88530fc..305514431f26 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -401,10 +401,8 @@ int crash_load_segments(struct kimage *image) kbuf.buf_align = ELF_CORE_HEADER_ALIGN; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; ret = kexec_add_buffer(&kbuf); - if (ret) { - vfree((void *)image->elf_headers); + if (ret) return ret; - } image->elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", image->elf_load_addr, kbuf.bufsz, kbuf.memsz); -- cgit From 0226635c304cfd5c9db9b78c259cb713819b057e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 2 Jan 2023 23:05:33 +0900 Subject: fs/ntfs3: don't hold ni_lock when calling truncate_setsize() syzbot is reporting hung task at do_user_addr_fault() [1], for there is a silent deadlock between PG_locked bit and ni_lock lock. Since filemap_update_page() calls filemap_read_folio() after calling folio_trylock() which will set PG_locked bit, ntfs_truncate() must not call truncate_setsize() which will wait for PG_locked bit to be cleared when holding ni_lock lock. 
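Sketched as two tasks, the circular wait described above looks roughly like this (the second leg, the reader needing ni_lock, is inferred from the report rather than spelled out in the patch):

	/* task A: ntfs_truncate()              task B: filemap_update_page()             */
	/* ni_lock(ni);                         folio_trylock(folio);   // sets PG_locked  */
	/* truncate_setsize(inode, new_size);   filemap_read_folio(...);                   */
	/*   ...waits for PG_locked               ...assumed to need ni_lock, waits for A  */

Moving truncate_setsize() out from under ni_lock, as done below, breaks the cycle.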
Link: https://lore.kernel.org/all/00000000000060d41f05f139aa44@google.com/ Link: https://syzkaller.appspot.com/bug?extid=bed15dbf10294aa4f2ae [1] Reported-by: syzbot Debugged-by: Linus Torvalds Co-developed-by: Hillf Danton Signed-off-by: Hillf Danton Signed-off-by: Tetsuo Handa Fixes: 4342306f0f0d ("fs/ntfs3: Add file operations and implementation") Signed-off-by: Linus Torvalds --- fs/ntfs3/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index e5399ebc3a2b..d294cd975688 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -390,10 +390,10 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size) new_valid = ntfs_up_block(sb, min_t(u64, ni->i_valid, new_size)); - ni_lock(ni); - truncate_setsize(inode, new_size); + ni_lock(ni); + down_write(&ni->file.run_lock); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size, &new_valid, ni->mi.sbi->options->prealloc, NULL); -- cgit From af82425c6a2d2f347c79b63ce74fca6dc6be157f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 2 Jan 2023 16:49:46 -0700 Subject: io_uring/io-wq: free worker if task_work creation is canceled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we cancel the task_work, the worker will never come into existence. As this is the last reference to it, ensure that we get it freed appropriately. Cc: stable@vger.kernel.org Reported-by: 진호 Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 6f1d0e5df23a..992dcd9f8c4c 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -1230,6 +1230,7 @@ static void io_wq_cancel_tw_create(struct io_wq *wq) worker = container_of(cb, struct io_worker, create_work); io_worker_cancel_cb(worker); + kfree(worker); } } -- cgit From 9c807965483f42df1d053b7436eedd6cf28ece6f Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Mon, 2 Jan 2023 12:53:35 +0300 Subject: drivers/net/bonding/bond_3ad: return when there's no aggregator Otherwise we would dereference a NULL aggregator pointer when calling __set_agg_ports_ready on the line below. Found by Linux Verification Center (linuxtesting.org) with the SVACE static analysis tool. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Daniil Tatianin Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/bonding/bond_3ad.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 455b555275f1..c99ffe6c683a 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -1549,6 +1549,7 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) slave_err(bond->dev, port->slave->dev, "Port %d did not find a suitable aggregator\n", port->actor_port_number); + return; } } /* if all aggregator's ports are READY_N == TRUE, set ready=TRUE -- cgit From 4af1b64f80fbe1275fb02c5f1c0cef099a4a231f Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Tue, 3 Jan 2023 09:20:12 +0530 Subject: octeontx2-pf: Fix lmtst ID used in aura free The current code uses a per_cpu pointer to get the lmtst_id mapped to the core on which aura_free() is executed. Using a per_cpu pointer without disabling preemption causes a mismatch between the lmtst_id and the core on which the pointer gets freed. This patch fixes the issue by disabling preemption around aura_free.
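For context, a minimal sketch of the pattern the fix applies: get_cpu() returns the current CPU id and disables preemption, so the task cannot migrate off that core until put_cpu(). The per-CPU variable below is illustrative, not the driver's real symbol:

#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(u64, lmt_slot);		/* made-up per-CPU resource */

static void use_cpu_local_slot(void)
{
	int cpu = get_cpu();			/* disables preemption */
	u64 slot = per_cpu(lmt_slot, cpu);

	/* ... use 'slot'; the task is guaranteed to still be on 'cpu' here ... */
	(void)slot;

	put_cpu();				/* re-enables preemption */
}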
Fixes: ef6c8da71eaf ("octeontx2-pf: cn10K: Reserve LMTST lines per core") Signed-off-by: Sunil Goutham Signed-off-by: Geetha sowjanya Signed-off-by: David S. Miller --- .../ethernet/marvell/octeontx2/nic/otx2_common.c | 30 +++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 9e10e7471b88..88f8772a61cd 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -1012,6 +1012,7 @@ static void otx2_pool_refill_task(struct work_struct *work) rbpool = cq->rbpool; free_ptrs = cq->pool_ptrs; + get_cpu(); while (cq->pool_ptrs) { if (otx2_alloc_rbuf(pfvf, rbpool, &bufptr)) { /* Schedule a WQ if we fails to free atleast half of the @@ -1031,6 +1032,7 @@ static void otx2_pool_refill_task(struct work_struct *work) pfvf->hw_ops->aura_freeptr(pfvf, qidx, bufptr + OTX2_HEAD_ROOM); cq->pool_ptrs--; } + put_cpu(); cq->refill_task_sched = false; } @@ -1368,6 +1370,7 @@ int otx2_sq_aura_pool_init(struct otx2_nic *pfvf) if (err) goto fail; + get_cpu(); /* Allocate pointers and free them to aura/pool */ for (qidx = 0; qidx < hw->tot_tx_queues; qidx++) { pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, qidx); @@ -1376,18 +1379,24 @@ int otx2_sq_aura_pool_init(struct otx2_nic *pfvf) sq = &qset->sq[qidx]; sq->sqb_count = 0; sq->sqb_ptrs = kcalloc(num_sqbs, sizeof(*sq->sqb_ptrs), GFP_KERNEL); - if (!sq->sqb_ptrs) - return -ENOMEM; + if (!sq->sqb_ptrs) { + err = -ENOMEM; + goto err_mem; + } for (ptr = 0; ptr < num_sqbs; ptr++) { - if (otx2_alloc_rbuf(pfvf, pool, &bufptr)) - return -ENOMEM; + err = otx2_alloc_rbuf(pfvf, pool, &bufptr); + if (err) + goto err_mem; pfvf->hw_ops->aura_freeptr(pfvf, pool_id, bufptr); sq->sqb_ptrs[sq->sqb_count++] = (u64)bufptr; } } - return 0; +err_mem: + put_cpu(); + return err ? -ENOMEM : 0; + fail: otx2_mbox_reset(&pfvf->mbox.mbox, 0); otx2_aura_pool_free(pfvf); @@ -1426,18 +1435,21 @@ int otx2_rq_aura_pool_init(struct otx2_nic *pfvf) if (err) goto fail; + get_cpu(); /* Allocate pointers and free them to aura/pool */ for (pool_id = 0; pool_id < hw->rqpool_cnt; pool_id++) { pool = &pfvf->qset.pool[pool_id]; for (ptr = 0; ptr < num_ptrs; ptr++) { - if (otx2_alloc_rbuf(pfvf, pool, &bufptr)) - return -ENOMEM; + err = otx2_alloc_rbuf(pfvf, pool, &bufptr); + if (err) + goto err_mem; pfvf->hw_ops->aura_freeptr(pfvf, pool_id, bufptr + OTX2_HEAD_ROOM); } } - - return 0; +err_mem: + put_cpu(); + return err ? -ENOMEM : 0; fail: otx2_mbox_reset(&pfvf->mbox.mbox, 0); otx2_aura_pool_free(pfvf); -- cgit From 7dc61838541928895abae6d2355258e02a251bba Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 3 Jan 2023 01:50:38 -0500 Subject: net: dpaa: Fix dtsec check for PCS availability We want to fail if the PCS is not available, not if it is available. Fix this condition. Fixes: 5d93cfcf7360 ("net: dpaa: Convert to phylink") Reported-by: Christian Zigotzky Signed-off-by: Sean Anderson Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/fman/fman_dtsec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/fman/fman_dtsec.c b/drivers/net/ethernet/freescale/fman/fman_dtsec.c index d00bae15a901..d528ca681b6f 100644 --- a/drivers/net/ethernet/freescale/fman/fman_dtsec.c +++ b/drivers/net/ethernet/freescale/fman/fman_dtsec.c @@ -1430,7 +1430,7 @@ int dtsec_initialization(struct mac_device *mac_dev, dtsec->dtsec_drv_param->tx_pad_crc = true; phy_node = of_parse_phandle(mac_node, "tbi-handle", 0); - if (!phy_node || of_device_is_available(phy_node)) { + if (!phy_node || !of_device_is_available(phy_node)) { of_node_put(phy_node); err = -EINVAL; dev_err_probe(mac_dev->dev, err, -- cgit From c7dd13805f8b8fc1ce3b6d40f6aff47e66b72ad2 Mon Sep 17 00:00:00 2001 From: Szymon Heidrich Date: Tue, 3 Jan 2023 10:17:09 +0100 Subject: usb: rndis_host: Secure rndis_query check against int overflow Variables off and len typed as uint32 in rndis_query function are controlled by incoming RNDIS response message thus their value may be manipulated. Setting off to a unexpectetly large value will cause the sum with len and 8 to overflow and pass the implemented validation step. Consequently the response pointer will be referring to a location past the expected buffer boundaries allowing information leakage e.g. via RNDIS_OID_802_3_PERMANENT_ADDRESS OID. Fixes: ddda08624013 ("USB: rndis_host, various cleanups") Signed-off-by: Szymon Heidrich Signed-off-by: David S. Miller --- drivers/net/usb/rndis_host.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c index f79333fe1783..7b3739b29c8f 100644 --- a/drivers/net/usb/rndis_host.c +++ b/drivers/net/usb/rndis_host.c @@ -255,7 +255,8 @@ static int rndis_query(struct usbnet *dev, struct usb_interface *intf, off = le32_to_cpu(u.get_c->offset); len = le32_to_cpu(u.get_c->len); - if (unlikely((8 + off + len) > CONTROL_BUFFER_SIZE)) + if (unlikely((off > CONTROL_BUFFER_SIZE - 8) || + (len > CONTROL_BUFFER_SIZE - 8 - off))) goto response_error; if (*reply_len != -1 && len != *reply_len) -- cgit From 406504c7b0405d74d74c15a667cd4c4620c3e7a9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 20 Dec 2022 14:03:52 +0000 Subject: KVM: arm64: Fix S1PTW handling on RO memslots A recent development on the EFI front has resulted in guests having their page tables baked in the firmware binary, and mapped into the IPA space as part of a read-only memslot. Not only is this legitimate, but it also results in added security, so thumbs up. It is possible to take an S1PTW translation fault if the S1 PTs are unmapped at stage-2. However, KVM unconditionally treats S1PTW as a write to correctly handle hardware AF/DB updates to the S1 PTs. Furthermore, KVM injects an exception into the guest for S1PTW writes. In the aforementioned case this results in the guest taking an abort it won't recover from, as the S1 PTs mapping the vectors suffer from the same problem. So clearly our handling is... wrong. Instead, switch to a two-pronged approach: - On S1PTW translation fault, handle the fault as a read - On S1PTW permission fault, handle the fault as a write This is of no consequence to SW that *writes* to its PTs (the write will trigger a non-S1PTW fault), and SW that uses RO PTs will not use HW-assisted AF/DB anyway, as that'd be wrong. 
Only in the case described in c4ad98e4b72c ("KVM: arm64: Assume write fault on S1PTW permission fault on instruction fetch") do we end-up with two back-to-back faults (page being evicted and faulted back). I don't think this is a case worth optimising for. Fixes: c4ad98e4b72c ("KVM: arm64: Assume write fault on S1PTW permission fault on instruction fetch") Reviewed-by: Oliver Upton Reviewed-by: Ard Biesheuvel Regression-tested-by: Ard Biesheuvel Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org --- arch/arm64/include/asm/kvm_emulate.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 9bdba47f7e14..0d40c48d8132 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -373,8 +373,26 @@ static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) { - if (kvm_vcpu_abt_iss1tw(vcpu)) - return true; + if (kvm_vcpu_abt_iss1tw(vcpu)) { + /* + * Only a permission fault on a S1PTW should be + * considered as a write. Otherwise, page tables baked + * in a read-only memslot will result in an exception + * being delivered in the guest. + * + * The drawback is that we end-up faulting twice if the + * guest is using any of HW AF/DB: a translation fault + * to map the page containing the PT (read only at + * first), then a permission fault to allow the flags + * to be set. + */ + switch (kvm_vcpu_trap_get_fault_type(vcpu)) { + case ESR_ELx_FSC_PERM: + return true; + default: + return false; + } + } if (kvm_vcpu_trap_is_iabt(vcpu)) return false; -- cgit From b8f8d190fa8fa1909dda12d771df67125d6fbf0c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 22 Dec 2022 09:26:31 +0000 Subject: KVM: arm64: Document the behaviour of S1PTW faults on RO memslots Although the KVM API says that a write to a RO memslot must result in a KVM_EXIT_MMIO describing the write, the arm64 architecture doesn't provide the *data* written by a Stage-1 page table walk (we only get the address). Since there isn't much userspace can do with so little information anyway, document the fact that such an access results in a guest exception, not an exit. This is consistent with the guest being terminally broken anyway. Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- Documentation/virt/kvm/api.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 0dd5d8733dd5..42db72a0cbe6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1354,6 +1354,14 @@ the memory region are automatically reflected into the guest. For example, an mmap() that affects the region will be made visible immediately. Another example is madvise(MADV_DROP). +Note: On arm64, a write generated by the page-table walker (to update +the Access and Dirty flags, for example) never results in a +KVM_EXIT_MMIO exit when the slot has the KVM_MEM_READONLY flag. This +is because KVM cannot provide the data that would be written by the +page-table walker, making it impossible to emulate the access. +Instead, an abort (data abort if the cause of the page-table update +was a load or a store, instruction abort if it was an instruction +fetch) is injected in the guest. 
4.36 KVM_SET_TSS_ADDR --------------------- -- cgit From b0803ba72b558957fdcfe845939ee788b7ce5919 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 20 Dec 2022 14:49:30 +0000 Subject: KVM: arm64: Convert FSC_* over to ESR_ELx_FSC_* The former is an AArch32 legacy, so let's move over to the verbose (and strictly identical) version. This involves moving some of the #defines that were private to KVM into the more generic esr.h. Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/esr.h | 9 +++++++++ arch/arm64/include/asm/kvm_arm.h | 15 --------------- arch/arm64/include/asm/kvm_emulate.h | 20 ++++++++++---------- arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +- arch/arm64/kvm/hyp/include/hyp/switch.h | 2 +- arch/arm64/kvm/mmu.c | 21 ++++++++++++--------- 6 files changed, 33 insertions(+), 36 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 15b34fbfca66..206de10524e3 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -114,6 +114,15 @@ #define ESR_ELx_FSC_ACCESS (0x08) #define ESR_ELx_FSC_FAULT (0x04) #define ESR_ELx_FSC_PERM (0x0C) +#define ESR_ELx_FSC_SEA_TTW0 (0x14) +#define ESR_ELx_FSC_SEA_TTW1 (0x15) +#define ESR_ELx_FSC_SEA_TTW2 (0x16) +#define ESR_ELx_FSC_SEA_TTW3 (0x17) +#define ESR_ELx_FSC_SECC (0x18) +#define ESR_ELx_FSC_SECC_TTW0 (0x1c) +#define ESR_ELx_FSC_SECC_TTW1 (0x1d) +#define ESR_ELx_FSC_SECC_TTW2 (0x1e) +#define ESR_ELx_FSC_SECC_TTW3 (0x1f) /* ISS field definitions for Data Aborts */ #define ESR_ELx_ISV_SHIFT (24) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 0df3fc3a0173..26b0c97df986 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -319,21 +319,6 @@ BIT(18) | \ GENMASK(16, 15)) -/* For compatibility with fault code shared with 32-bit */ -#define FSC_FAULT ESR_ELx_FSC_FAULT -#define FSC_ACCESS ESR_ELx_FSC_ACCESS -#define FSC_PERM ESR_ELx_FSC_PERM -#define FSC_SEA ESR_ELx_FSC_EXTABT -#define FSC_SEA_TTW0 (0x14) -#define FSC_SEA_TTW1 (0x15) -#define FSC_SEA_TTW2 (0x16) -#define FSC_SEA_TTW3 (0x17) -#define FSC_SECC (0x18) -#define FSC_SECC_TTW0 (0x1c) -#define FSC_SECC_TTW1 (0x1d) -#define FSC_SECC_TTW2 (0x1e) -#define FSC_SECC_TTW3 (0x1f) - /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) /* diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 0d40c48d8132..193583df2d9c 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -349,16 +349,16 @@ static __always_inline u8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *v static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu) { switch (kvm_vcpu_trap_get_fault(vcpu)) { - case FSC_SEA: - case FSC_SEA_TTW0: - case FSC_SEA_TTW1: - case FSC_SEA_TTW2: - case FSC_SEA_TTW3: - case FSC_SECC: - case FSC_SECC_TTW0: - case FSC_SECC_TTW1: - case FSC_SECC_TTW2: - case FSC_SECC_TTW3: + case ESR_ELx_FSC_EXTABT: + case ESR_ELx_FSC_SEA_TTW0: + case ESR_ELx_FSC_SEA_TTW1: + case ESR_ELx_FSC_SEA_TTW2: + case ESR_ELx_FSC_SEA_TTW3: + case ESR_ELx_FSC_SECC: + case ESR_ELx_FSC_SECC_TTW0: + case ESR_ELx_FSC_SECC_TTW1: + case ESR_ELx_FSC_SECC_TTW2: + case ESR_ELx_FSC_SECC_TTW3: return true; default: return false; diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 1b8a2dcd712f..9ddcfe2c3e57 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -60,7 +60,7 @@ static 
inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) */ if (!(esr & ESR_ELx_S1PTW) && (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { + (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM)) { if (!__translate_far_to_hpfar(far, &hpfar)) return false; } else { diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 3330d1b76bdd..07d37ff88a3f 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -367,7 +367,7 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { bool valid; - valid = kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && + valid = kvm_vcpu_trap_get_fault_type(vcpu) == ESR_ELx_FSC_FAULT && kvm_vcpu_dabt_isvalid(vcpu) && !kvm_vcpu_abt_issea(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 31d7fa4c7c14..a3ee3b605c9b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1212,7 +1212,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); VM_BUG_ON(write_fault && exec_fault); - if (fault_status == FSC_PERM && !write_fault && !exec_fault) { + if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) { kvm_err("Unexpected L2 read permission error\n"); return -EFAULT; } @@ -1277,7 +1277,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * only exception to this is when dirty logging is enabled at runtime * and a write fault needs to collapse a block entry into a table. */ - if (fault_status != FSC_PERM || (logging_active && write_fault)) { + if (fault_status != ESR_ELx_FSC_PERM || + (logging_active && write_fault)) { ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm)); if (ret) @@ -1342,7 +1343,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * backed by a THP and thus use block mapping if possible. */ if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { - if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE) + if (fault_status == ESR_ELx_FSC_PERM && + fault_granule > PAGE_SIZE) vma_pagesize = fault_granule; else vma_pagesize = transparent_hugepage_adjust(kvm, memslot, @@ -1350,7 +1352,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, &fault_ipa); } - if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { + if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ if (kvm_vma_mte_allowed(vma)) { sanitise_mte_tags(kvm, pfn, vma_pagesize); @@ -1376,7 +1378,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. 
*/ - if (fault_status == FSC_PERM && vma_pagesize == fault_granule) + if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule) ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); else ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, @@ -1441,7 +1443,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); - if (fault_status == FSC_FAULT) { + if (fault_status == ESR_ELx_FSC_FAULT) { /* Beyond sanitised PARange (which is the IPA limit) */ if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { kvm_inject_size_fault(vcpu); @@ -1476,8 +1478,9 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) kvm_vcpu_get_hfar(vcpu), fault_ipa); /* Check the stage-2 fault is trans. fault or write fault */ - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && - fault_status != FSC_ACCESS) { + if (fault_status != ESR_ELx_FSC_FAULT && + fault_status != ESR_ELx_FSC_PERM && + fault_status != ESR_ELx_FSC_ACCESS) { kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), @@ -1539,7 +1542,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) /* Userspace should not be able to register out-of-bounds IPAs */ VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); - if (fault_status == FSC_ACCESS) { + if (fault_status == ESR_ELx_FSC_ACCESS) { handle_access_fault(vcpu, fault_ipa); ret = 1; goto out_unlock; -- cgit From 69555549cfa42e10f2fdd2699ed4e34d9d4f392b Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 23 Nov 2022 03:13:03 +0300 Subject: drm/scheduler: Fix lockup in drm_sched_entity_kill() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The drm_sched_entity_kill() is invoked twice by drm_sched_entity_destroy() while userspace process is exiting or being killed. First time it's invoked when sched entity is flushed and second time when entity is released. This causes a lockup within wait_for_completion(entity_idle) due to how completion API works. Calling wait_for_completion() more times than complete() was invoked is a error condition that causes lockup because completion internally uses counter for complete/wait calls. The complete_all() must be used instead in such cases. This patch fixes lockup of Panfrost driver that is reproducible by killing any application in a middle of 3d drawing operation. Fixes: 2fdb8a8f07c2 ("drm/scheduler: rework entity flush, kill and fini") Signed-off-by: Dmitry Osipenko Reviewed-by: Christian König Tested-by: Guilherme G. Piccoli # Steam Deck Link: https://patchwork.freedesktop.org/patch/msgid/20221123001303.533968-1-dmitry.osipenko@collabora.com --- drivers/gpu/drm/scheduler/sched_entity.c | 2 +- drivers/gpu/drm/scheduler/sched_main.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index fe09e5be79bd..15d04a0ec623 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -81,7 +81,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity, init_completion(&entity->entity_idle); /* We start in an idle state. 
*/ - complete(&entity->entity_idle); + complete_all(&entity->entity_idle); spin_lock_init(&entity->rq_lock); spsc_queue_init(&entity->job_queue); diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 31f3a1267be4..fd22d753b4ed 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -987,7 +987,7 @@ static int drm_sched_main(void *param) sched_job = drm_sched_entity_pop_job(entity); if (!sched_job) { - complete(&entity->entity_idle); + complete_all(&entity->entity_idle); continue; } @@ -998,7 +998,7 @@ static int drm_sched_main(void *param) trace_drm_run_job(sched_job, entity); fence = sched->ops->run_job(sched_job); - complete(&entity->entity_idle); + complete_all(&entity->entity_idle); drm_sched_fence_scheduled(s_fence); if (!IS_ERR_OR_NULL(fence)) { -- cgit From b963c1d6261eb7fba1ba14074fb447521be84add Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Jan 2023 10:21:01 -0300 Subject: perf test record_probe_libc_inet_pton: Fix failure due to extra inet_pton() backtrace in glibc >= 2.35 Starting with glibc 2.35 there are extra inet_pton() calls when doing a IPv6 ping as in one of the 'perf test' entry, which makes it fail: # perf test inet_pton 89: probe libc's inet_pton & backtrace it with ping : FAILED! # If we look at what this script is expecting (commenting out the removal of the temporary files in it): # cat /tmp/expected.aT6 ping[][0-9 \.:]+probe_libc:inet_pton: \([[:xdigit:]]+\) .*inet_pton\+0x[[:xdigit:]]+[[:space:]]\(/usr/lib64/libc.so.6|inlined\)$ getaddrinfo\+0x[[:xdigit:]]+[[:space:]]\(/usr/lib64/libc.so.6\)$ .*(\+0x[[:xdigit:]]+|\[unknown\])[[:space:]]\(.*/bin/ping.*\)$ # And looking at what we are getting out of 'perf script', to match with the above: # cat /tmp/perf.script.IUC ping 623883 [006] 265438.471610: probe_libc:inet_pton: (7f32bcf314c0) 1314c0 __GI___inet_pton+0x0 (/usr/lib64/libc.so.6) 29510 __libc_start_call_main+0x80 (/usr/lib64/libc.so.6) ping 623883 [006] 265438.471664: probe_libc:inet_pton: (7f32bcf314c0) 1314c0 __GI___inet_pton+0x0 (/usr/lib64/libc.so.6) fa6c6 getaddrinfo+0x126 (/usr/lib64/libc.so.6) 491e [unknown] (/usr/bin/ping) # We see that its just the first call to inet_pton() that didn't came thru getaddrinfo(), so if we ignore the first the script matches what it expects, testing that using 'perf probe' + 'perf record' + 'perf script' with callchains on userspace targets is producing the expected results. Since we don't have a 'perf script --skip' to help us here, use tac + grep to do that, resulting in a one liner that makes this script work on both older glibc versions as well as with 2.35. 
With it, on fedora 36, x86, glibc 2.35: # perf test inet_pton 90: probe libc's inet_pton & backtrace it with ping : Ok # perf test -v inet_pton 90: probe libc's inet_pton & backtrace it with ping : --- start --- test child forked, pid 627197 ping 627220 1 267956.962402: probe_libc:inet_pton_1: (7f488bf314c0) 1314c0 __GI___inet_pton+0x0 (/usr/lib64/libc.so.6) fa6c6 getaddrinfo+0x126 (/usr/lib64/libc.so.6) 491e n (/usr/bin/ping) test child finished with 0 ---- end ---- probe libc's inet_pton & backtrace it with ping: Ok # And on Ubuntu 22.04.1 LTS on a Libre Computer ROC-RK3399-PC arm64 system: Before this patch it works (see that the script used has no 'tac' to remove the first event): root@roc-rk3399-pc:~# dpkg -l | grep libc-bin ii libc-bin 2.35-0ubuntu3.1 arm64 GNU C Library: Binaries root@roc-rk3399-pc:~# grep -w tac ~acme/libexec/perf-core/tests/shell/record+probe_libc_inet_pton.sh root@roc-rk3399-pc:~# perf test inet_pton 86: probe libc's inet_pton & backtrace it with ping : Ok root@roc-rk3399-pc:~# perf test -v inet_pton 86: probe libc's inet_pton & backtrace it with ping : --- start --- test child forked, pid 1375 ping 1399 [000] 4114.417450: probe_libc:inet_pton: (ffffb3e26120) 106120 inet_pton+0x0 (/usr/lib/aarch64-linux-gnu/libc.so.6) d18bc getaddrinfo+0xec (/usr/lib/aarch64-linux-gnu/libc.so.6) 2b68 [unknown] (/usr/bin/ping) test child finished with 0 ---- end ---- probe libc's inet_pton & backtrace it with ping: Ok root@roc-rk3399-pc:~# And after it continues to work: root@roc-rk3399-pc:~# grep -w tac ~acme/libexec/perf-core/tests/shell/record+probe_libc_inet_pton.sh perf script -i $perf_data | tac | grep -m1 ^ping -B9 | tac > $perf_script root@roc-rk3399-pc:~# perf test inet_pton 86: probe libc's inet_pton & backtrace it with ping : Ok root@roc-rk3399-pc:~# perf test -v inet_pton 86: probe libc's inet_pton & backtrace it with ping : --- start --- test child forked, pid 6995 ping 7019 [005] 4832.160741: probe_libc:inet_pton: (ffffa62e6120) 106120 inet_pton+0x0 (/usr/lib/aarch64-linux-gnu/libc.so.6) d18bc getaddrinfo+0xec (/usr/lib/aarch64-linux-gnu/libc.so.6) 2b68 [unknown] (/usr/bin/ping) test child finished with 0 ---- end ---- probe libc's inet_pton & backtrace it with ping: Ok root@roc-rk3399-pc:~# Reported-by: Thomas Richter Cc: Adrian Hunter Cc: Heiko Carstens Cc: Ian Rogers Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/Y7QyPkPlDYip3cZH@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record+probe_libc_inet_pton.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh index 34c400ccbe04..216b6b64caa3 100755 --- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh +++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh @@ -57,7 +57,7 @@ trace_libc_inet_pton_backtrace() { perf_data=`mktemp -u /tmp/perf.data.XXX` perf_script=`mktemp -u /tmp/perf.script.XXX` perf record -e $event_name/$eventattr/ -o $perf_data ping -6 -c 1 ::1 > /dev/null 2>&1 - perf script -i $perf_data > $perf_script + perf script -i $perf_data | tac | grep -m1 ^ping -B9 | tac > $perf_script exec 3<$perf_script exec 4<$expected -- cgit From 946c2923e76327343e4460e8bb7ec7b4d4589397 Mon Sep 17 00:00:00 2001 From: Tanmay Bhushan <007047221b@gmail.com> Date: Sat, 31 Dec 2022 16:05:01 +0100 Subject: btrfs: fix ASSERT em->len condition in btrfs_get_extent The 
em->len value is supposed to be verified in the assertion condition though we expect it to be same as the sectorsize. Fixes: a196a8944f77 ("btrfs: do not reset extent map members for inline extents read") Reviewed-by: Anand Jain Signed-off-by: Tanmay Bhushan <007047221b@gmail.com> Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bfcbe64eb8b3..940b404c8f28 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7092,7 +7092,7 @@ next: * Other members are not utilized for inline extents. */ ASSERT(em->block_start == EXTENT_MAP_INLINE); - ASSERT(em->len = fs_info->sectorsize); + ASSERT(em->len == fs_info->sectorsize); ret = read_inline_extent(inode, path, page); if (ret < 0) -- cgit From 77177ed17d24ba060117bdb6efb8a01da7531676 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Dec 2022 07:32:23 +0800 Subject: btrfs: add error message for metadata level mismatch From a recent regression report, we found that after commit 947a629988f1 ("btrfs: move tree block parentness check into validate_extent_buffer()") if we have a level mismatch (false alert though), there is no error message at all. This makes later debugging harder. This patch will add the proper error message for such case. Link: https://lore.kernel.org/linux-btrfs/CABXGCsNzVxo4iq-tJSGm_kO1UggHXgq6CdcHDL=z5FL4njYXSQ@mail.gmail.com/ Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0888d484df80..9940cc39dbc9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -530,6 +530,9 @@ static int validate_extent_buffer(struct extent_buffer *eb, } if (found_level != check->level) { + btrfs_err(fs_info, + "level verify failed on logical %llu mirror %u wanted %u found %u", + eb->start, eb->read_mirror, check->level, found_level); ret = -EIO; goto out; } -- cgit From 1d854e4fbabb0cb12ca4a7fcd784eb67a65de5f8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Dec 2022 07:32:24 +0800 Subject: btrfs: fix false alert on bad tree level check [BUG] There is a bug report that on a RAID0 NVMe btrfs system, under heavy write load the filesystem can flip RO randomly. With extra debugging, it shows some tree blocks failed to pass their level checks, and if that happens at critical path of a transaction, we abort the transaction: BTRFS error (device nvme0n1p3): level verify failed on logical 5446121209856 mirror 1 wanted 0 found 1 BTRFS error (device nvme0n1p3: state A): Transaction aborted (error -5) BTRFS: error (device nvme0n1p3: state A) in btrfs_finish_ordered_io:3343: errno=-5 IO failure BTRFS info (device nvme0n1p3: state EA): forced readonly [CAUSE] The reporter has already bisected to commit 947a629988f1 ("btrfs: move tree block parentness check into validate_extent_buffer()"). And with extra debugging, it shows we can have btrfs_tree_parent_check filled with all zeros in the following call trace: submit_one_bio+0xd4/0xe0 submit_extent_page+0x142/0x550 read_extent_buffer_pages+0x584/0x9c0 ? __pfx_end_bio_extent_readpage+0x10/0x10 ? folio_unlock+0x1d/0x50 btrfs_read_extent_buffer+0x98/0x150 read_tree_block+0x43/0xa0 read_block_for_search+0x266/0x370 btrfs_search_slot+0x351/0xd30 ? lock_is_held_type+0xe8/0x140 btrfs_lookup_csum+0x63/0x150 btrfs_csum_file_blocks+0x197/0x6c0 ? sched_clock_cpu+0x9f/0xc0 ? lock_release+0x14b/0x440 ? 
_raw_read_unlock+0x29/0x50 btrfs_finish_ordered_io+0x441/0x860 btrfs_work_helper+0xfe/0x400 ? lock_is_held_type+0xe8/0x140 process_one_work+0x294/0x5b0 worker_thread+0x4f/0x3a0 ? __pfx_worker_thread+0x10/0x10 kthread+0xf5/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 Currently we only copy the btrfs_tree_parent_check structure into bbio at read_extent_buffer_pages() after we have assembled the bbio. But as shown above, submit_extent_page() itself can already submit the bbio, leaving the bbio->parent_check uninitialized, and cause the false alert. [FIX] Instead of copying @check into bbio after bbio is assembled, we pass @check in btrfs_bio_ctrl::parent_check, and copy the content of parent_check in submit_one_bio() for metadata read. By this we should be able to pass the needed info for metadata endio verification, and fix the false alert. Reported-by: Mikhail Gavrilov Link: https://lore.kernel.org/linux-btrfs/CABXGCsNzVxo4iq-tJSGm_kO1UggHXgq6CdcHDL=z5FL4njYXSQ@mail.gmail.com/ Fixes: 947a629988f1 ("btrfs: move tree block parentness check into validate_extent_buffer()") Tested-by: Mikhail Gavrilov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 83dd3aa59663..9bd32daa9b9a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -103,6 +103,15 @@ struct btrfs_bio_ctrl { u32 len_to_oe_boundary; btrfs_bio_end_io_t end_io_func; + /* + * This is for metadata read, to provide the extra needed verification + * info. This has to be provided for submit_one_bio(), as + * submit_one_bio() can submit a bio if it ends at stripe boundary. If + * no such parent_check is provided, the metadata can hit false alert at + * endio time. + */ + struct btrfs_tree_parent_check *parent_check; + /* * Tell writepage not to lock the state bits for this range, it still * does the unlocking. @@ -133,13 +142,24 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - if (!is_data_inode(&inode->vfs_inode)) + if (!is_data_inode(&inode->vfs_inode)) { + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { + /* + * For metadata read, we should have the parent_check, + * and copy it to bbio for metadata verification. 
+ */ + ASSERT(bio_ctrl->parent_check); + memcpy(&btrfs_bio(bio)->parent_check, + bio_ctrl->parent_check, + sizeof(struct btrfs_tree_parent_check)); + } btrfs_submit_metadata_bio(inode, bio, mirror_num); - else if (btrfs_op(bio) == BTRFS_MAP_WRITE) + } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) { btrfs_submit_data_write_bio(inode, bio, mirror_num); - else + } else { btrfs_submit_data_read_bio(inode, bio, mirror_num, bio_ctrl->compress_type); + } /* The bio is owned by the end_io handler now */ bio_ctrl->bio = NULL; @@ -4829,6 +4849,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .mirror_num = mirror_num, + .parent_check = check, }; int ret = 0; @@ -4878,7 +4899,6 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } - memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) { free_extent_state(cached_state); @@ -4905,6 +4925,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, unsigned long num_reads = 0; struct btrfs_bio_ctrl bio_ctrl = { .mirror_num = mirror_num, + .parent_check = check, }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) @@ -4996,7 +5017,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, } } - memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) -- cgit From 2f2e84ca60660402bd81d0859703567c59556e6a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 23 Dec 2022 18:28:53 +0000 Subject: btrfs: fix off-by-one in delalloc search during lseek MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During lseek, when searching for delalloc in a range that represents a hole and that range has a length of 1 byte, we end up not doing the actual delalloc search in the inode's io tree, resulting in not correctly reporting the offset with data or a hole. This actually only happens when the start offset is 0 because with any other start offset we round it down by sector size. Reproducer: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt/sdc $ xfs_io -f -c "pwrite -q 0 1" /mnt/sdc/foo $ xfs_io -c "seek -d 0" /mnt/sdc/foo Whence Result DATA EOF It should have reported an offset of 0 instead of EOF. Fix this by updating btrfs_find_delalloc_in_range() and count_range_bits() to deal with inclusive ranges properly. These functions are already supposed to work with inclusive end offsets, they just got it wrong in a couple places due to off-by-one mistakes. A test case for fstests will be added later. 
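The inclusive-range convention at stake here can be shown in isolation. The sketch below is a hypothetical helper, not the btrfs code: with a range [start, end] where both bounds are valid offsets, a one-byte range has start == end, so the sanity check may only reject end < start and the walk must use <=.

static bool walk_range_inclusive(u64 start, u64 end)
{
	u64 cur = start;

	/* 'end <= start' here would wrongly reject one-byte ranges. */
	if (WARN_ON(end < start))
		return false;

	/* 'cur < end' here would skip the last unit of the range. */
	while (cur <= end) {
		/* ... inspect the unit at 'cur' ... */
		cur++;
	}
	return true;
}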
Reported-by: Joan Bruguera Micó Link: https://lore.kernel.org/linux-btrfs/20221223020509.457113-1-joanbrugueram@gmail.com/ Fixes: b6e833567ea1 ("btrfs: make hole and data seeking a lot more efficient") CC: stable@vger.kernel.org # 6.1 Tested-by: Joan Bruguera Micó Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/file.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 9ae9cd1e7035..3c7766dfaa69 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1551,7 +1551,7 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 last = 0; int found = 0; - if (WARN_ON(search_end <= cur_start)) + if (WARN_ON(search_end < cur_start)) return 0; spin_lock(&tree->lock); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 91b00eb2440e..834bbcb91102 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3354,7 +3354,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, bool search_io_tree = true; bool ret = false; - while (cur_offset < end) { + while (cur_offset <= end) { u64 delalloc_start; u64 delalloc_end; bool delalloc; -- cgit From d73a27b86fc722c28a26ec64002e3a7dc86d1c07 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 1 Jan 2023 09:02:21 +0800 Subject: btrfs: handle case when repair happens with dev-replace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [BUG] There is a bug report that a BUG_ON() in btrfs_repair_io_failure() (originally repair_io_failure() in v6.0 kernel) got triggered when replacing a unreliable disk: BTRFS warning (device sda1): csum failed root 257 ino 2397453 off 39624704 csum 0xb0d18c75 expected csum 0x4dae9c5e mirror 3 kernel BUG at fs/btrfs/extent_io.c:2380! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 9 PID: 3614331 Comm: kworker/u257:2 Tainted: G OE 6.0.0-5-amd64 #1 Debian 6.0.10-2 Hardware name: Micro-Star International Co., Ltd. MS-7C60/TRX40 PRO WIFI (MS-7C60), BIOS 2.70 07/01/2021 Workqueue: btrfs-endio btrfs_end_bio_work [btrfs] RIP: 0010:repair_io_failure+0x24a/0x260 [btrfs] Call Trace: clean_io_failure+0x14d/0x180 [btrfs] end_bio_extent_readpage+0x412/0x6e0 [btrfs] ? __switch_to+0x106/0x420 process_one_work+0x1c7/0x380 worker_thread+0x4d/0x380 ? rescuer_thread+0x3a0/0x3a0 kthread+0xe9/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 [CAUSE] Before the BUG_ON(), we got some read errors from the replace target first, note the mirror number (3, which is beyond RAID1 duplication, thus it's read from the replace target device). Then at the BUG_ON() location, we are trying to writeback the repaired sectors back the failed device. The check looks like this: ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, mirror_num); if (ret) goto out_counter_dec; BUG_ON(mirror_num != bioc->mirror_num); But inside btrfs_map_block(), we can modify bioc->mirror_num especially for dev-replace: if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && !need_full_stripe(op) && dev_replace->tgtdev != NULL) { ret = get_extra_mirror_from_replace(fs_info, logical, *length, dev_replace->srcdev->devid, &mirror_num, &physical_to_patch_in_first_stripe); patch_the_first_stripe_for_dev_replace = 1; } Thus if we're repairing the replace target device, we're going to trigger that BUG_ON(). 
But in reality, the read failure from the replace target device may be that, our replace hasn't reached the range we're reading, thus we're reading garbage, but with replace running, the range would be properly filled later. Thus in that case, we don't need to do anything but let the replace routine to handle it. [FIX] Instead of a BUG_ON(), just skip the repair if we're repairing the device replace target device. Reported-by: 小太 Link: https://lore.kernel.org/linux-btrfs/CACsxjPYyJGQZ+yvjzxA1Nn2LuqkYqTCcUH43S=+wXhyf8S00Ag@mail.gmail.com/ CC: stable@vger.kernel.org # 6.0+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index b8fb7ef6b520..8affc88b0e0a 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -329,7 +329,16 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, &map_length, &bioc, mirror_num); if (ret) goto out_counter_dec; - BUG_ON(mirror_num != bioc->mirror_num); + /* + * This happens when dev-replace is also running, and the + * mirror_num indicates the dev-replace target. + * + * In this case, we don't need to do anything, as the read + * error just means the replace progress hasn't reached our + * read range, and later replace routine would handle it well. + */ + if (mirror_num != bioc->mirror_num) + goto out_counter_dec; } sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; -- cgit From 39f501d68ec1ed5cd5c66ac6ec2a7131c517bb92 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 26 Dec 2022 09:00:40 +0800 Subject: btrfs: always report error in run_one_delayed_ref() Currently we have a btrfs_debug() for run_one_delayed_ref() failure, but if end users hit such problem, there will be no chance that btrfs_debug() is enabled. This can lead to very little useful info for debugging. This patch will: - Add extra info for error reporting Including: * logical bytenr * num_bytes * type * action * ref_mod - Replace the btrfs_debug() with btrfs_err() - Move the error reporting into run_one_delayed_ref() This is to avoid use-after-free, the @node can be freed in the caller. This error should only be triggered at most once. As if run_one_delayed_ref() failed, we trigger the error message, then causing the call chain to error out: btrfs_run_delayed_refs() `- btrfs_run_delayed_refs() `- btrfs_run_delayed_refs_for_head() `- run_one_delayed_ref() And we will abort the current transaction in btrfs_run_delayed_refs(). If we have to run delayed refs for the abort transaction, run_one_delayed_ref() will just cleanup the refs and do nothing, thus no new error messages would be output. 
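The placement of the message is the point: it has to be emitted while the object is still guaranteed to be alive. A rough sketch of that pattern with hypothetical names, not the btrfs code itself:

static int run_one_item(struct example_ctx *ctx, struct example_item *item)
{
	int ret = example_process(ctx, item);

	/* Log here, while 'item' is still valid and its fields are usable. */
	if (ret < 0)
		pr_err("failed to process item %llu type %u: %d\n",
		       item->id, item->type, ret);

	/* The caller may drop its reference to 'item' right after this. */
	return ret;
}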
Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 892d78c1853c..72ba13b027a9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1713,6 +1713,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, BUG(); if (ret && insert_reserved) btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + if (ret < 0) + btrfs_err(trans->fs_info, +"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", + node->bytenr, node->num_bytes, node->type, + node->action, node->ref_mod, ret); return ret; } @@ -1954,8 +1959,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, if (ret) { unselect_delayed_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); - btrfs_debug(fs_info, "run_one_delayed_ref returned %d", - ret); return ret; } -- cgit From 2ba48b20049b5a76f34a85f853c9496d1b10533a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 22 Dec 2022 07:59:17 +0800 Subject: btrfs: fix compat_ro checks against remount [BUG] Even with commit 81d5d61454c3 ("btrfs: enhance unsupported compat RO flags handling"), btrfs can still mount a fs with unsupported compat_ro flags read-only, then remount it RW: # btrfs ins dump-super /dev/loop0 | grep compat_ro_flags -A 3 compat_ro_flags 0x403 ( FREE_SPACE_TREE | FREE_SPACE_TREE_VALID | unknown flag: 0x400 ) # mount /dev/loop0 /mnt/btrfs mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/loop0, missing codepage or helper program, or other error. dmesg(1) may have more information after failed mount system call. ^^^ RW mount failed as expected ^^^ # dmesg -t | tail -n5 loop0: detected capacity change from 0 to 1048576 BTRFS: device fsid cb5b82f5-0fdd-4d81-9b4b-78533c324afa devid 1 transid 7 /dev/loop0 scanned by mount (1146) BTRFS info (device loop0): using crc32c (crc32c-intel) checksum algorithm BTRFS info (device loop0): using free space tree BTRFS error (device loop0): cannot mount read-write because of unknown compat_ro features (0x403) BTRFS error (device loop0): open_ctree failed # mount /dev/loop0 -o ro /mnt/btrfs # mount -o remount,rw /mnt/btrfs ^^^ RW remount succeeded unexpectedly ^^^ [CAUSE] Currently we use btrfs_check_features() to check compat_ro flags against our current mount flags. That function get reused between open_ctree() and btrfs_remount(). But for btrfs_remount(), the super block we passed in still has the old mount flags, thus btrfs_check_features() still believes we're mounting read-only. [FIX] Replace the existing @sb argument with @is_rw_mount. As originally we only use @sb to determine if the mount is RW. Now it's callers' responsibility to determine if the mount is RW, and since there are only two callers, the check is pretty simple: - caller in open_ctree() Just pass !sb_rdonly(). - caller in btrfs_remount() Pass !(*flags & SB_RDONLY), as our check should be against the new flags. Now we can correctly reject the RW remount: # mount /dev/loop0 -o ro /mnt/btrfs # mount -o remount,rw /mnt/btrfs mount: /mnt/btrfs: mount point not mounted or bad option. dmesg(1) may have more information after failed mount system call. 
# dmesg -t | tail -n 1 BTRFS error (device loop0: state M): cannot mount read-write because of unknown compat_ro features (0x403) Reported-by: Chung-Chiang Cheng Fixes: 81d5d61454c3 ("btrfs: enhance unsupported compat RO flags handling") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 +++++--- fs/btrfs/disk-io.h | 2 +- fs/btrfs/super.c | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9940cc39dbc9..8aeaada1fcae 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3384,6 +3384,8 @@ out: /* * Do various sanity and dependency checks of different features. * + * @is_rw_mount: If the mount is read-write. + * * This is the place for less strict checks (like for subpage or artificial * feature dependencies). * @@ -3394,7 +3396,7 @@ out: * (space cache related) can modify on-disk format like free space tree and * screw up certain feature dependencies. */ -int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb) +int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) { struct btrfs_super_block *disk_super = fs_info->super_copy; u64 incompat = btrfs_super_incompat_flags(disk_super); @@ -3433,7 +3435,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb) if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - if (compat_ro_unsupp && !sb_rdonly(sb)) { + if (compat_ro_unsupp && is_rw_mount) { btrfs_err(fs_info, "cannot mount read-write because of unknown compat_ro features (0x%llx)", compat_ro); @@ -3636,7 +3638,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } - ret = btrfs_check_features(fs_info, sb); + ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); if (ret < 0) { err = ret; goto fail_alloc; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 363935cfc084..f2f295eb6103 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -50,7 +50,7 @@ int __cold open_ctree(struct super_block *sb, void __cold close_ctree(struct btrfs_fs_info *fs_info); int btrfs_validate_super(struct btrfs_fs_info *fs_info, struct btrfs_super_block *sb, int mirror_num); -int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb); +int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d5de18d6517e..433ce221dc5c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1705,7 +1705,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; - ret = btrfs_check_features(fs_info, sb); + ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY)); if (ret < 0) goto restore; -- cgit From 55d235361fccef573990dfa5724ab453866e7816 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 3 Jan 2023 10:24:11 -0500 Subject: x86/asm: Fix an assembler warning with current binutils Fix a warning: "found `movsd'; assuming `movsl' was meant" Signed-off-by: Mikulas Patocka Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org --- arch/x86/lib/iomap_copy_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S index a1f9416bf67a..6ff2f56cb0f7 100644 --- a/arch/x86/lib/iomap_copy_64.S +++ b/arch/x86/lib/iomap_copy_64.S @@ -10,6 +10,6 @@ */ SYM_FUNC_START(__iowrite32_copy) movl %edx,%ecx - rep movsd + rep movsl RET SYM_FUNC_END(__iowrite32_copy) -- cgit From b3d83066cbebc76dbac8a5fca931f64b4c6fff34 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 30 Dec 2022 23:43:32 +0800 Subject: f2fs: fix to avoid NULL pointer dereference in f2fs_issue_flush() With below two cases, it will cause NULL pointer dereference when accessing SM_I(sbi)->fcc_info in f2fs_issue_flush(). a) If kthread_run() fails in f2fs_create_flush_cmd_control(), it will release SM_I(sbi)->fcc_info, - mount -o noflush_merge /dev/vda /mnt/f2fs - mount -o remount,flush_merge /dev/vda /mnt/f2fs -- kthread_run() fails - dd if=/dev/zero of=/mnt/f2fs/file bs=4k count=1 conv=fsync b) we will never allocate memory for SM_I(sbi)->fcc_info w/ below testcase, - mount -o ro /dev/vda /mnt/f2fs - mount -o rw,remount /dev/vda /mnt/f2fs - dd if=/dev/zero of=/mnt/f2fs/file bs=4k count=1 conv=fsync In order to fix this issue, let change as below: - fix error path handling in f2fs_create_flush_cmd_control(). - allocate SM_I(sbi)->fcc_info even if readonly is on. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 25ddea478fc1..c3f8c8208eec 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -663,8 +663,7 @@ init_thread: if (IS_ERR(fcc->f2fs_issue_flush)) { int err = PTR_ERR(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->fcc_info = NULL; + fcc->f2fs_issue_flush = NULL; return err; } @@ -5138,11 +5137,9 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) init_f2fs_rwsem(&sm_info->curseg_lock); - if (!f2fs_readonly(sbi->sb)) { - err = f2fs_create_flush_cmd_control(sbi); - if (err) - return err; - } + err = f2fs_create_flush_cmd_control(sbi); + if (err) + return err; err = create_discard_cmd_control(sbi); if (err) -- cgit From fe59109ae5c0b34a8c7c07f693fc501b12b57787 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 16 Dec 2022 14:05:44 -0800 Subject: f2fs: initialize extent_cache parameter This can avoid confusing tracepoint values. 
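The reasoning generalizes beyond extent_cache: an on-stack struct is not zero-initialized, so members a given path never writes hold stack garbage, and anything that dumps the whole struct (a tracepoint, for instance) will print it. A small illustration with a hypothetical struct, not the f2fs definition:

struct demo_extent_info {
	unsigned int fofs;
	unsigned int len;
	unsigned long long age;
};

static unsigned long long demo_lookup_age(void)
{
	/* '= {}' zero-fills every member, including ones no path sets. */
	struct demo_extent_info ei = {};

	/* A path that only ever fills in fofs and len... */
	ei.fofs = 16;
	ei.len = 1;

	/* ...still yields a well-defined 0 here, not uninitialized data. */
	return ei.age;
}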
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/segment.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6e43e19c7d1c..97e816590cd9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2183,7 +2183,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; - struct extent_info ei = {0, }; + struct extent_info ei = {}; bool from_dnode = true; int i; int ret = 0; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 1bd38a78ebba..3aa2f8296045 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -938,7 +938,7 @@ out: static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { - struct extent_info ei; + struct extent_info ei = {}; if (!__may_extent_tree(dn->inode, type)) return; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a6c401279886..ecbc8c135b49 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2559,7 +2559,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, }; + struct extent_info ei = {}; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c3f8c8208eec..ae3c4e5474ef 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3160,7 +3160,7 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_info ei; + struct extent_info ei = {}; if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { if (!ei.age) -- cgit From ed2724765e58e3310d3de48f4a1761631b3dd640 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 16 Dec 2022 14:41:54 -0800 Subject: f2fs: don't mix to use union values in extent_info Let's explicitly use the defined values in block_age case only. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 3aa2f8296045..cc3fed04dd6f 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -881,7 +881,8 @@ static unsigned long long __calculate_block_age(unsigned long long new, } /* This returns a new age and allocated blocks in ei */ -static int __get_new_block_age(struct inode *inode, struct extent_info *ei) +static int __get_new_block_age(struct inode *inode, struct extent_info *ei, + block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t f_size = i_size_read(inode); @@ -894,7 +895,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei) * block here. 
*/ if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && - ei->blk == NEW_ADDR) + blkaddr == NEW_ADDR) return -EINVAL; if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { @@ -915,14 +916,14 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei) return 0; } - f2fs_bug_on(sbi, ei->blk == NULL_ADDR); + f2fs_bug_on(sbi, blkaddr == NULL_ADDR); /* the data block was allocated for the first time */ - if (ei->blk == NEW_ADDR) + if (blkaddr == NEW_ADDR) goto out; - if (__is_valid_data_blkaddr(ei->blk) && - !f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE)) { + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_bug_on(sbi, 1); return -EINVAL; } @@ -953,8 +954,7 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ else ei.blk = dn->data_blkaddr; } else if (type == EX_BLOCK_AGE) { - ei.blk = dn->data_blkaddr; - if (__get_new_block_age(dn->inode, &ei)) + if (__get_new_block_age(dn->inode, &ei, dn->data_blkaddr)) return; } __update_extent_tree_range(dn->inode, &ei, type); -- cgit From 22a341b43036415718f2d50f5f98b2f891fe17e9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 16 Dec 2022 16:36:36 -0800 Subject: f2fs: should use a temp extent_info for lookup Otherwise, __lookup_extent_tree() will override the given extent_info which will be used by caller. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index cc3fed04dd6f..7b191ff65631 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -888,6 +888,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei, loff_t f_size = i_size_read(inode); unsigned long long cur_blocks = atomic64_read(&sbi->allocated_data_blocks); + struct extent_info tei = *ei; /* only fofs and len are valid */ /* * When I/O is not aligned to a PAGE_SIZE, update will happen to the last @@ -898,17 +899,17 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei, blkaddr == NEW_ADDR) return -EINVAL; - if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { + if (__lookup_extent_tree(inode, ei->fofs, &tei, EX_BLOCK_AGE)) { unsigned long long cur_age; - if (cur_blocks >= ei->last_blocks) - cur_age = cur_blocks - ei->last_blocks; + if (cur_blocks >= tei.last_blocks) + cur_age = cur_blocks - tei.last_blocks; else /* allocated_data_blocks overflow */ - cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks; + cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks; - if (ei->age) - ei->age = __calculate_block_age(cur_age, ei->age); + if (tei.age) + ei->age = __calculate_block_age(cur_age, tei.age); else ei->age = cur_age; ei->last_blocks = cur_blocks; -- cgit From df9d44b645b83fffccfb4e28c1f93376585fdec8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2022 16:14:10 -0800 Subject: f2fs: let's avoid panic if extent_tree is not created This patch avoids the below panic. 
pc : __lookup_extent_tree+0xd8/0x760 lr : f2fs_do_write_data_page+0x104/0x87c sp : ffffffc010cbb3c0 x29: ffffffc010cbb3e0 x28: 0000000000000000 x27: ffffff8803e7f020 x26: ffffff8803e7ed40 x25: ffffff8803e7f020 x24: ffffffc010cbb460 x23: ffffffc010cbb480 x22: 0000000000000000 x21: 0000000000000000 x20: ffffffff22e90900 x19: 0000000000000000 x18: ffffffc010c5d080 x17: 0000000000000000 x16: 0000000000000020 x15: ffffffdb1acdbb88 x14: ffffff888759e2b0 x13: 0000000000000000 x12: ffffff802da49000 x11: 000000000a001200 x10: ffffff8803e7ed40 x9 : ffffff8023195800 x8 : ffffff802da49078 x7 : 0000000000000001 x6 : 0000000000000000 x5 : 0000000000000006 x4 : ffffffc010cbba28 x3 : 0000000000000000 x2 : ffffffc010cbb480 x1 : 0000000000000000 x0 : ffffff8803e7ed40 Call trace: __lookup_extent_tree+0xd8/0x760 f2fs_do_write_data_page+0x104/0x87c f2fs_write_single_data_page+0x420/0xb60 f2fs_write_cache_pages+0x418/0xb1c __f2fs_write_data_pages+0x428/0x58c f2fs_write_data_pages+0x30/0x40 do_writepages+0x88/0x190 __writeback_single_inode+0x48/0x448 writeback_sb_inodes+0x468/0x9e8 __writeback_inodes_wb+0xb8/0x2a4 wb_writeback+0x33c/0x740 wb_do_writeback+0x2b4/0x400 wb_workfn+0xe4/0x34c process_one_work+0x24c/0x5bc worker_thread+0x3e8/0xa50 kthread+0x150/0x1b4 Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 7b191ff65631..342af24b2f8c 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -546,7 +546,8 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_node *en; bool ret = false; - f2fs_bug_on(sbi, !et); + if (!et) + return false; trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); -- cgit From 72bb8f8cc088730c4d84117a6906f458c2fc64bb Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 1 Jan 2023 17:29:04 +0100 Subject: x86/insn: Avoid namespace clash by separating instruction decoder MMIO type from MMIO trace type Both and define various MMIO_ enum constants, whose namespace overlaps. Rename the ones to have a INSN_ prefix, so that the headers can be used from the same source file. Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230101162910.710293-2-Jason@zx2c4.com --- arch/x86/coco/tdx/tdx.c | 26 +++++++++++++------------- arch/x86/include/asm/insn-eval.h | 18 +++++++++--------- arch/x86/kernel/sev.c | 18 +++++++++--------- arch/x86/lib/insn-eval.c | 20 ++++++++++---------- 4 files changed, 41 insertions(+), 41 deletions(-) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index cfd4c95b9f04..669d9e4f2901 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -386,8 +386,8 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) { unsigned long *reg, val, vaddr; char buffer[MAX_INSN_SIZE]; + enum insn_mmio_type mmio; struct insn insn = {}; - enum mmio_type mmio; int size, extend_size; u8 extend_val = 0; @@ -402,10 +402,10 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) return -EINVAL; mmio = insn_decode_mmio(&insn, &size); - if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED)) + if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED)) return -EINVAL; - if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { + if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { reg = insn_get_modrm_reg_ptr(&insn, regs); if (!reg) return -EINVAL; @@ -426,23 +426,23 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) /* Handle writes first */ switch (mmio) { - case MMIO_WRITE: + case INSN_MMIO_WRITE: memcpy(&val, reg, size); if (!mmio_write(size, ve->gpa, val)) return -EIO; return insn.length; - case MMIO_WRITE_IMM: + case INSN_MMIO_WRITE_IMM: val = insn.immediate.value; if (!mmio_write(size, ve->gpa, val)) return -EIO; return insn.length; - case MMIO_READ: - case MMIO_READ_ZERO_EXTEND: - case MMIO_READ_SIGN_EXTEND: + case INSN_MMIO_READ: + case INSN_MMIO_READ_ZERO_EXTEND: + case INSN_MMIO_READ_SIGN_EXTEND: /* Reads are handled below */ break; - case MMIO_MOVS: - case MMIO_DECODE_FAILED: + case INSN_MMIO_MOVS: + case INSN_MMIO_DECODE_FAILED: /* * MMIO was accessed with an instruction that could not be * decoded or handled properly. It was likely not using io.h @@ -459,15 +459,15 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) return -EIO; switch (mmio) { - case MMIO_READ: + case INSN_MMIO_READ: /* Zero-extend for 32-bit operation */ extend_size = size == 4 ? 
sizeof(*reg) : 0; break; - case MMIO_READ_ZERO_EXTEND: + case INSN_MMIO_READ_ZERO_EXTEND: /* Zero extend based on operand size */ extend_size = insn.opnd_bytes; break; - case MMIO_READ_SIGN_EXTEND: + case INSN_MMIO_READ_SIGN_EXTEND: /* Sign extend based on operand size */ extend_size = insn.opnd_bytes; if (size == 1 && val & BIT(7)) diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index f07faa61c7f3..54368a43abf6 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -32,16 +32,16 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); -enum mmio_type { - MMIO_DECODE_FAILED, - MMIO_WRITE, - MMIO_WRITE_IMM, - MMIO_READ, - MMIO_READ_ZERO_EXTEND, - MMIO_READ_SIGN_EXTEND, - MMIO_MOVS, +enum insn_mmio_type { + INSN_MMIO_DECODE_FAILED, + INSN_MMIO_WRITE, + INSN_MMIO_WRITE_IMM, + INSN_MMIO_READ, + INSN_MMIO_READ_ZERO_EXTEND, + INSN_MMIO_READ_SIGN_EXTEND, + INSN_MMIO_MOVS, }; -enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes); +enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes); #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index a428c62330d3..679026a640ef 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1536,32 +1536,32 @@ static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct insn *insn = &ctxt->insn; + enum insn_mmio_type mmio; unsigned int bytes = 0; - enum mmio_type mmio; enum es_result ret; u8 sign_byte; long *reg_data; mmio = insn_decode_mmio(insn, &bytes); - if (mmio == MMIO_DECODE_FAILED) + if (mmio == INSN_MMIO_DECODE_FAILED) return ES_DECODE_FAILED; - if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { + if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); if (!reg_data) return ES_DECODE_FAILED; } switch (mmio) { - case MMIO_WRITE: + case INSN_MMIO_WRITE: memcpy(ghcb->shared_buffer, reg_data, bytes); ret = vc_do_mmio(ghcb, ctxt, bytes, false); break; - case MMIO_WRITE_IMM: + case INSN_MMIO_WRITE_IMM: memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); ret = vc_do_mmio(ghcb, ctxt, bytes, false); break; - case MMIO_READ: + case INSN_MMIO_READ: ret = vc_do_mmio(ghcb, ctxt, bytes, true); if (ret) break; @@ -1572,7 +1572,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) memcpy(reg_data, ghcb->shared_buffer, bytes); break; - case MMIO_READ_ZERO_EXTEND: + case INSN_MMIO_READ_ZERO_EXTEND: ret = vc_do_mmio(ghcb, ctxt, bytes, true); if (ret) break; @@ -1581,7 +1581,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) memset(reg_data, 0, insn->opnd_bytes); memcpy(reg_data, ghcb->shared_buffer, bytes); break; - case MMIO_READ_SIGN_EXTEND: + case INSN_MMIO_READ_SIGN_EXTEND: ret = vc_do_mmio(ghcb, ctxt, bytes, true); if (ret) break; @@ -1600,7 +1600,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) memset(reg_data, sign_byte, insn->opnd_bytes); memcpy(reg_data, ghcb->shared_buffer, bytes); break; - case MMIO_MOVS: + case INSN_MMIO_MOVS: ret = vc_handle_mmio_movs(ctxt, bytes); break; default: diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 21104c41cba0..558a605929db 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c 
@@ -1595,16 +1595,16 @@ bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs, * Returns: * * Type of the instruction. Size of the memory operand is stored in - * @bytes. If decode failed, MMIO_DECODE_FAILED returned. + * @bytes. If decode failed, INSN_MMIO_DECODE_FAILED returned. */ -enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) +enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes) { - enum mmio_type type = MMIO_DECODE_FAILED; + enum insn_mmio_type type = INSN_MMIO_DECODE_FAILED; *bytes = 0; if (insn_get_opcode(insn)) - return MMIO_DECODE_FAILED; + return INSN_MMIO_DECODE_FAILED; switch (insn->opcode.bytes[0]) { case 0x88: /* MOV m8,r8 */ @@ -1613,7 +1613,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0x89: /* MOV m16/m32/m64, r16/m32/m64 */ if (!*bytes) *bytes = insn->opnd_bytes; - type = MMIO_WRITE; + type = INSN_MMIO_WRITE; break; case 0xc6: /* MOV m8, imm8 */ @@ -1622,7 +1622,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0xc7: /* MOV m16/m32/m64, imm16/imm32/imm64 */ if (!*bytes) *bytes = insn->opnd_bytes; - type = MMIO_WRITE_IMM; + type = INSN_MMIO_WRITE_IMM; break; case 0x8a: /* MOV r8, m8 */ @@ -1631,7 +1631,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0x8b: /* MOV r16/r32/r64, m16/m32/m64 */ if (!*bytes) *bytes = insn->opnd_bytes; - type = MMIO_READ; + type = INSN_MMIO_READ; break; case 0xa4: /* MOVS m8, m8 */ @@ -1640,7 +1640,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0xa5: /* MOVS m16/m32/m64, m16/m32/m64 */ if (!*bytes) *bytes = insn->opnd_bytes; - type = MMIO_MOVS; + type = INSN_MMIO_MOVS; break; case 0x0f: /* Two-byte instruction */ @@ -1651,7 +1651,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0xb7: /* MOVZX r32/r64, m16 */ if (!*bytes) *bytes = 2; - type = MMIO_READ_ZERO_EXTEND; + type = INSN_MMIO_READ_ZERO_EXTEND; break; case 0xbe: /* MOVSX r16/r32/r64, m8 */ @@ -1660,7 +1660,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) case 0xbf: /* MOVSX r32/r64, m16 */ if (!*bytes) *bytes = 2; - type = MMIO_READ_SIGN_EXTEND; + type = INSN_MMIO_READ_SIGN_EXTEND; break; } break; -- cgit From c07311b5509f6035f1dd828db3e90ff4859cf3b9 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 28 Dec 2022 06:34:54 -0500 Subject: perf/x86/rapl: Treat Tigerlake like Icelake Since Tigerlake seems to have inherited its cstates and other RAPL power caps from Icelake, assume it also follows Icelake for its RAPL events. 
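As a brief illustration of why two new table entries are enough here: RAPL model selection is table-driven, with each supported CPU model mapped to an existing model description (model_skl in the diff below). The following is a minimal userspace sketch of that lookup pattern; the struct layout, model numbers and names are invented for the example and are not the kernel's.

#include <stdio.h>

/* stand-in for the kernel's RAPL model description */
struct rapl_model { const char *name; };

struct cpu_id { unsigned int model; const struct rapl_model *data; };

static const struct rapl_model model_skl = { .name = "skl-style RAPL events" };

static const struct cpu_id match_table[] = {
	{ 0x9e, &model_skl },	/* an existing entry */
	{ 0x8c, &model_skl },	/* "Tigerlake-L": new entry, reuses model_skl */
	{ 0x8d, &model_skl },	/* "Tigerlake":   new entry, reuses model_skl */
	{ 0, NULL },		/* sentinel */
};

/* walk the table until the sentinel, return the matching description */
static const struct rapl_model *match_cpu(unsigned int model)
{
	const struct cpu_id *id;

	for (id = match_table; id->data; id++)
		if (id->model == model)
			return id->data;
	return NULL;	/* unknown model: RAPL support stays disabled */
}

int main(void)
{
	const struct rapl_model *rm = match_cpu(0x8d);

	printf("%s\n", rm ? rm->name : "unsupported");
	return 0;
}

The two entries added in the diff below reuse model_skl in exactly this way.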
Signed-off-by: Chris Wilson Signed-off-by: Rodrigo Vivi Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Zhang Rui Link: https://lore.kernel.org/r/20221228113454.1199118-1-rodrigo.vivi@intel.com --- arch/x86/events/rapl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index a829492bca4c..ae5779ea4417 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -800,6 +800,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &model_skl), -- cgit From f89fb55714b620ff1352141a9f9315611f16573e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 2 Jan 2023 23:09:16 -0800 Subject: perf build: Don't propagate subdir to submakes for install_headers subdir is added to the OUTPUT which fails as part of building install_headers when passed from "make -C tools perf_install". Committer testing: The original reporter (see the Link: below) had trouble with this: $ make -C tools perf_install That ended up with errors like this: /var/home/acme/git/perf-urgent/tools/scripts/Makefile.include:17: *** output directory "/var/home/acme/git/perf-urgent/tools/perf/libperf/perf/" does not exist. Stop. With this patch applied we now get it installed at: INSTALL /var/home/acme/git/perf-urgent/tools/perf/libperf/include/perf/bpf_perf.h As expected: $ ls -la /var/home/acme/git/perf-urgent/tools/perf/libperf/include/perf/bpf_perf.h -rw-r--r--. 
1 acme acme 1146 Jan 3 15:42 /var/home/acme/git/perf-urgent/tools/perf/libperf/include/perf/bpf_perf.h And if we clean tools with: $ make -C tools clean it gets cleaned up: $ ls -la /var/home/acme/git/perf-urgent/tools/perf/libperf/include/perf/bpf_perf.h ls: cannot access '/var/home/acme/git/perf-urgent/tools/perf/libperf/include/perf/bpf_perf.h': No such file or directory $ Fixes: 746bd29e348f99b4 ("perf build: Use tools/lib headers from install path") Reported-by: Torsten Hilbrich Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/fa4b3115-d555-3d7f-54d1-018002e99350@secunet.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 13e7d26e77f0..1e32c93b8042 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -819,7 +819,7 @@ $(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h) $(LIBAPI): FORCE | $(LIBAPI_OUTPUT) $(Q)$(MAKE) -C $(LIBAPI_DIR) O=$(LIBAPI_OUTPUT) \ - DESTDIR=$(LIBAPI_DESTDIR) prefix= \ + DESTDIR=$(LIBAPI_DESTDIR) prefix= subdir= \ $@ install_headers $(LIBAPI)-clean: @@ -828,7 +828,7 @@ $(LIBAPI)-clean: $(LIBBPF): FORCE | $(LIBBPF_OUTPUT) $(Q)$(MAKE) -C $(LIBBPF_DIR) FEATURES_DUMP=$(FEATURE_DUMP_EXPORT) \ - O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \ + O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= subdir= \ $@ install_headers $(LIBBPF)-clean: @@ -837,7 +837,7 @@ $(LIBBPF)-clean: $(LIBPERF): FORCE | $(LIBPERF_OUTPUT) $(Q)$(MAKE) -C $(LIBPERF_DIR) O=$(LIBPERF_OUTPUT) \ - DESTDIR=$(LIBPERF_DESTDIR) prefix= \ + DESTDIR=$(LIBPERF_DESTDIR) prefix= subdir= \ $@ install_headers $(LIBPERF)-clean: @@ -846,7 +846,7 @@ $(LIBPERF)-clean: $(LIBSUBCMD): FORCE | $(LIBSUBCMD_OUTPUT) $(Q)$(MAKE) -C $(LIBSUBCMD_DIR) O=$(LIBSUBCMD_OUTPUT) \ - DESTDIR=$(LIBSUBCMD_DESTDIR) prefix= \ + DESTDIR=$(LIBSUBCMD_DESTDIR) prefix= subdir= \ $@ install_headers $(LIBSUBCMD)-clean: @@ -855,7 +855,7 @@ $(LIBSUBCMD)-clean: $(LIBSYMBOL): FORCE | $(LIBSYMBOL_OUTPUT) $(Q)$(MAKE) -C $(LIBSYMBOL_DIR) O=$(LIBSYMBOL_OUTPUT) \ - DESTDIR=$(LIBSYMBOL_DESTDIR) prefix= \ + DESTDIR=$(LIBSYMBOL_DESTDIR) prefix= subdir= \ $@ install_headers $(LIBSYMBOL)-clean: -- cgit From d8d85ce86dc82de4f88b821a78f533b9d5b22a45 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 30 Dec 2022 11:26:27 +0100 Subject: perf lock contention: Fix core dump related to not finding the "__sched_text_end" symbol on s/390 The test case perf lock contention dumps core on s390. Run the following commands: # ./perf lock record -- ./perf bench sched messaging # Running 'sched/messaging' benchmark: # 20 sender and receiver processes per group # 10 groups == 400 processes run Total time: 2.799 [sec] [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.073 MB perf.data (100 samples) ] # # ./perf lock contention Segmentation fault (core dumped) # The function call stack is lengthy, here are the top 5 functions: # gdb ./perf core.24048 GNU gdb (GDB) Fedora Linux 12.1-6.fc37 Core was generated by `./perf lock contention'. Program terminated with signal SIGSEGV, Segmentation fault. 
#0 0x00000000011dd25c in machine__is_lock_function (machine=0x3029e28, addr=1789230) at util/machine.c:3356 3356 machine->sched.text_end = kmap->unmap_ip(kmap, sym->start); (gdb) where #0 0x00000000011dd25c in machine__is_lock_function (machine=0x3029e28, addr=1789230) at util/machine.c:3356 #1 0x000000000109f244 in callchain_id (evsel=0x30313e0, sample=0x3ffea4f77d0) at builtin-lock.c:957 #2 0x000000000109e094 in get_key_by_aggr_mode (key=0x3ffea4f7290, addr=27758136, evsel=0x30313e0, sample=0x3ffea4f77d0) at builtin-lock.c:586 #3 0x000000000109f4d0 in report_lock_contention_begin_event (evsel=0x30313e0, sample=0x3ffea4f77d0) at builtin-lock.c:1004 #4 0x00000000010a00ae in evsel__process_contention_begin (evsel=0x30313e0, sample=0x3ffea4f77d0) at builtin-lock.c:1254 #5 0x00000000010a0e14 in process_sample_event (tool=0x3ffea4f8480, event=0x3ff85601ef8, sample=0x3ffea4f77d0, evsel=0x30313e0, machine=0x3029e28) at builtin-lock.c:1464 ..... The issue is in function machine__is_lock_function() in file ./util/machine.c line 3355: /* should not fail from here */ sym = machine__find_kernel_symbol_by_name(machine, "__sched_text_end", &kmap); machine->sched.text_end = kmap->unmap_ip(kmap, sym->start) On s390 the symbol __sched_text_end is *NOT* in the symbol list and the resulting pointer sym is set to NULL. The sym->start is then a NULL pointer access and generates the core dump. The reason why __sched_text_end is not in the symbol list on s390 is simple: When the symbol list is created at perf start up with function calls dso__load +--> dso__load_vmlinux_path +--> dso__load_vmlinux +--> dso__load_sym +--> dso__load_sym_internal (reads kernel symbols) +--> symbols__fixup_end +--> symbols__fixup_duplicate The issue is in function symbols__fixup_duplicate(). It deletes all symbols which have the same address. On s390: # nm -g ~/linux/vmlinux| fgrep c68390 0000000000c68390 T __cpuidle_text_start 0000000000c68390 T __sched_text_end # two symbols have identical addresses and __sched_text_end is considered duplicate (in ascending sort order) and removed from the symbol list. Therefore it is missing and an invalid pointer reference occurs. The code checks for symbol __sched_text_start and when it exists assumes symbol __sched_text_end is also in the symbol table. However this is not the case on s390. The same situation exists for symbol __lock_text_start: 0000000000c68770 T __cpuidle_text_end 0000000000c68770 T __lock_text_start This symbol is also removed from the symbol table but used in function machine__is_lock_function(). To fix this and keep duplicate symbols in the symbol table, set symbol_conf.allow_aliases to true. This prevents the removal of duplicate symbols in function symbols__fixup_duplicate().
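To make the described de-duplication concrete, here is a small, self-contained userspace sketch (not perf's code) of what happens to same-address symbols when aliases are not allowed; the addresses and names come from the nm output above, everything else is invented for illustration.

#include <stdio.h>
#include <string.h>

struct sym { unsigned long addr; const char *name; };

/* symbol list sorted by address, as after fixing up symbol ends */
static struct sym syms[] = {
	{ 0xc68390, "__cpuidle_text_start" },
	{ 0xc68390, "__sched_text_end" },
	{ 0xc68770, "__cpuidle_text_end" },
	{ 0xc68770, "__lock_text_start" },
};
static int nr_syms = 4;

/* drop entries sharing an address unless aliases are allowed */
static void fixup_duplicate(int allow_aliases)
{
	int i, j = 0;

	if (allow_aliases)
		return;
	for (i = 0; i < nr_syms; i++)
		if (i == 0 || syms[i].addr != syms[j - 1].addr)
			syms[j++] = syms[i];
	nr_syms = j;
}

static struct sym *find_by_name(const char *name)
{
	for (int i = 0; i < nr_syms; i++)
		if (!strcmp(syms[i].name, name))
			return &syms[i];
	return NULL;	/* callers must handle this, unlike the crashing code */
}

int main(void)
{
	fixup_duplicate(0);	/* passing 1 (allow_aliases) keeps both names */
	printf("__sched_text_end is %s\n",
	       find_by_name("__sched_text_end") ? "found" : "missing");
	return 0;
}

The patch keeps both aliases by setting symbol_conf.allow_aliases before symbol__init(), as the diff below shows.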
Output After: # ./perf lock contention contended total wait max wait avg wait type caller 48 124.39 ms 123.99 ms 2.59 ms rwsem:W unlink_anon_vmas+0x24a 47 83.68 ms 83.26 ms 1.78 ms rwsem:W free_pgtables+0x132 5 41.22 us 10.55 us 8.24 us rwsem:W free_pgtables+0x140 4 40.12 us 20.55 us 10.03 us rwsem:W copy_process+0x1ac8 # Fixes: 0d2997f750d1de39 ("perf lock: Look up callchain for the contended locks") Signed-off-by: Thomas Richter Acked-by: Namhyung Kim Cc: Heiko Carstens Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: https://lore.kernel.org/r/20221230102627.2410847-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 718b82bfcdff..506c2fe42d52 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -1670,6 +1670,7 @@ static int __cmd_report(bool display_info) /* for lock function check */ symbol_conf.sort_by_name = true; + symbol_conf.allow_aliases = true; symbol__init(&session->header.env); if (!data.is_pipe) { @@ -1757,6 +1758,7 @@ static int __cmd_contention(int argc, const char **argv) /* for lock function check */ symbol_conf.sort_by_name = true; + symbol_conf.allow_aliases = true; symbol__init(&session->header.env); if (use_bpf) { -- cgit From cf97eb7e47d4671084c7e114c5d88a3d0540ecbd Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Tue, 20 Dec 2022 17:11:24 -0500 Subject: drm/amdkfd: Fix kernel warning during topology setup This patch fixes the following kernel warning seen during driver load by correctly initializing the p2plink attr before creating the sysfs file: [ +0.002865] ------------[ cut here ]------------ [ +0.002327] kobject: '(null)' (0000000056260cfb): is not initialized, yet kobject_put() is being called. [ +0.004780] WARNING: CPU: 32 PID: 1006 at lib/kobject.c:718 kobject_put+0xaa/0x1c0 [ +0.001361] Call Trace: [ +0.001234] [ +0.001067] kfd_remove_sysfs_node_entry+0x24a/0x2d0 [amdgpu] [ +0.003147] kfd_topology_update_sysfs+0x3d/0x750 [amdgpu] [ +0.002890] kfd_topology_add_device+0xbd7/0xc70 [amdgpu] [ +0.002844] ? lock_release+0x13c/0x2e0 [ +0.001936] ? smu_cmn_send_smc_msg_with_param+0x1e8/0x2d0 [amdgpu] [ +0.003313] ? amdgpu_dpm_get_mclk+0x54/0x60 [amdgpu] [ +0.002703] kgd2kfd_device_init.cold+0x39f/0x4ed [amdgpu] [ +0.002930] amdgpu_amdkfd_device_init+0x13d/0x1f0 [amdgpu] [ +0.002944] amdgpu_device_init.cold+0x1464/0x17b4 [amdgpu] [ +0.002970] ? pci_bus_read_config_word+0x43/0x80 [ +0.002380] amdgpu_driver_load_kms+0x15/0x100 [amdgpu] [ +0.002744] amdgpu_pci_probe+0x147/0x370 [amdgpu] [ +0.002522] local_pci_probe+0x40/0x80 [ +0.001896] work_for_cpu_fn+0x10/0x20 [ +0.001892] process_one_work+0x26e/0x5a0 [ +0.002029] worker_thread+0x1fd/0x3e0 [ +0.001890] ? process_one_work+0x5a0/0x5a0 [ +0.002115] kthread+0xea/0x110 [ +0.001618] ? 
kthread_complete_and_exit+0x20/0x20 [ +0.002422] ret_from_fork+0x1f/0x30 [ +0.001808] [ +0.001103] irq event stamp: 59837 [ +0.001718] hardirqs last enabled at (59849): [] __up_console_sem+0x52/0x60 [ +0.004414] hardirqs last disabled at (59860): [] __up_console_sem+0x37/0x60 [ +0.004414] softirqs last enabled at (59654): [] irq_exit_rcu+0xd7/0x130 [ +0.004205] softirqs last disabled at (59649): [] irq_exit_rcu+0xd7/0x130 [ +0.004203] ---[ end trace 0000000000000000 ]--- Fixes: 0f28cca87e9a ("drm/amdkfd: Extend KFD device topology to surface peer-to-peer links") Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index bceb1a5b2518..3fdaba56be6f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -801,7 +801,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, p2plink->attr.name = "properties"; p2plink->attr.mode = KFD_SYSFS_FILE_MODE; - sysfs_attr_init(&iolink->attr); + sysfs_attr_init(&p2plink->attr); ret = sysfs_create_file(p2plink->kobj, &p2plink->attr); if (ret < 0) return ret; -- cgit From f3c23bea598ab7e8e4b8c5ca66598921310f718e Mon Sep 17 00:00:00 2001 From: Samson Tam Date: Mon, 5 Dec 2022 11:08:40 -0500 Subject: drm/amd/display: Uninitialized variables causing 4k60 UCLK to stay at DPM1 and not DPM0 [Why] SwathSizePerSurfaceY[] and SwathSizePerSurfaceC[] values are uninitialized because we are using += instead of = operator. [How] Assign values in loop with = operator. Acked-by: Aurabindo Pillai Signed-off-by: Samson Tam Reviewed-by: Aric Cyr Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.0.x, 6.1.x --- drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.c index 5af601cff1a0..b53feeaf5cf1 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.c @@ -6257,12 +6257,12 @@ bool dml32_CalculateDETSwathFillLatencyHiding(unsigned int NumberOfActiveSurface double SwathSizePerSurfaceC[DC__NUM_DPP__MAX]; bool NotEnoughDETSwathFillLatencyHiding = false; - /* calculate sum of single swath size for all pipes in bytes*/ + /* calculate sum of single swath size for all pipes in bytes */ for (k = 0; k < NumberOfActiveSurfaces; k++) { - SwathSizePerSurfaceY[k] += SwathHeightY[k] * SwathWidthY[k] * BytePerPixelInDETY[k] * NumOfDPP[k]; + SwathSizePerSurfaceY[k] = SwathHeightY[k] * SwathWidthY[k] * BytePerPixelInDETY[k] * NumOfDPP[k]; if (SwathHeightC[k] != 0) - SwathSizePerSurfaceC[k] += SwathHeightC[k] * SwathWidthC[k] * BytePerPixelInDETC[k] * NumOfDPP[k]; + SwathSizePerSurfaceC[k] = SwathHeightC[k] * SwathWidthC[k] * BytePerPixelInDETC[k] * NumOfDPP[k]; else SwathSizePerSurfaceC[k] = 0; -- cgit From 2a12187d5853d9fd5102278cecef7dac7c8ce7ea Mon Sep 17 00:00:00 2001 From: Andreas Rammhold Date: Fri, 23 Dec 2022 12:27:47 +0100 Subject: of/fdt: run soc memory setup when early_init_dt_scan_memory fails If memory has been found early_init_dt_scan_memory now returns 1. 
If it hasn't found any memory, it will return 0, allowing other memory setup mechanisms to carry on. Previously, early_init_dt_scan_memory always returned 0 and did not distinguish whether any memory setup had been done or not. Any code path after the early_init_dt_scan_memory call in the ramips plat_mem_setup code wouldn't be executed anymore, making early_init_dt_scan_memory the only way to initialize the memory. Some boards, including my mt7621 based Cudy X6 board, depend on memory initialization being done via the soc_info.mem_detect function pointer. Those wouldn't be able to obtain memory and panic the kernel during early bootup with the message "early_init_dt_alloc_memory_arch: Failed to allocate 12416 bytes align=0x40". Fixes: 1f012283e936 ("of/fdt: Rework early_init_dt_scan_memory() to call directly") Cc: stable@vger.kernel.org Signed-off-by: Andreas Rammhold Link: https://lore.kernel.org/r/20221223112748.2935235-1-andreas@rammhold.de Signed-off-by: Rob Herring --- arch/mips/ralink/of.c | 2 +- drivers/of/fdt.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/mips/ralink/of.c b/arch/mips/ralink/of.c index 01c132bc33d5..4d06de77d92a 100644 --- a/arch/mips/ralink/of.c +++ b/arch/mips/ralink/of.c @@ -64,7 +64,7 @@ void __init plat_mem_setup(void) dtb = get_fdt(); __dt_setup_arch(dtb); - if (!early_init_dt_scan_memory()) + if (early_init_dt_scan_memory()) return; if (soc_info.mem_detect) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index b2272bccf85c..02cc4a285cb9 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1099,7 +1099,7 @@ u64 __init dt_mem_next_cell(int s, const __be32 **cellp) */ int __init early_init_dt_scan_memory(void) { - int node; + int node, found_memory = 0; const void *fdt = initial_boot_params; fdt_for_each_subnode(node, fdt, 0) { @@ -1139,6 +1139,8 @@ int __init early_init_dt_scan_memory(void) early_init_dt_add_memory_arch(base, size); + found_memory = 1; + if (!hotpluggable) continue; @@ -1147,7 +1149,7 @@ int __init early_init_dt_scan_memory(void) base, base + size); } } - return 0; + return found_memory; } int __init early_init_dt_scan_chosen(char *cmdline) -- cgit From 1d7a4a40bf76e2305c8beed4a019bf58af6121ac Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 23 Dec 2022 14:21:59 +0100 Subject: dt-bindings: soundwire: qcom,soundwire: correct sizes related to number of ports There are several properties depending on the number of ports. Some of them had a maximum limit of 5 and some of 8.
SM8450 AudioReach comes with 8 ports, so fix the limits: sm8450-sony-xperia-nagara-pdx224.dtb: soundwire-controller@3250000: qcom,ports-word-length: 'oneOf' conditional failed, one must be fixed: [[255, 255, 255, 255, 255, 255, 255, 255]] is too short [255, 255, 255, 255, 255, 255, 255, 255] is too long Fixes: febc50b82bc9 ("dt-bindings: soundwire: Convert text bindings to DT Schema") Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20221223132159.81211-1-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- .../devicetree/bindings/soundwire/qcom,soundwire.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/soundwire/qcom,soundwire.yaml b/Documentation/devicetree/bindings/soundwire/qcom,soundwire.yaml index bcbfa71536cd..3efdc192ab01 100644 --- a/Documentation/devicetree/bindings/soundwire/qcom,soundwire.yaml +++ b/Documentation/devicetree/bindings/soundwire/qcom,soundwire.yaml @@ -80,7 +80,7 @@ properties: or applicable for the respective data port. More info in MIPI Alliance SoundWire 1.0 Specifications. minItems: 3 - maxItems: 5 + maxItems: 8 qcom,ports-sinterval-low: $ref: /schemas/types.yaml#/definitions/uint8-array @@ -124,7 +124,7 @@ properties: or applicable for the respective data port. More info in MIPI Alliance SoundWire 1.0 Specifications. minItems: 3 - maxItems: 5 + maxItems: 8 qcom,ports-block-pack-mode: $ref: /schemas/types.yaml#/definitions/uint8-array @@ -154,7 +154,7 @@ properties: or applicable for the respective data port. More info in MIPI Alliance SoundWire 1.0 Specifications. minItems: 3 - maxItems: 5 + maxItems: 8 items: oneOf: - minimum: 0 @@ -171,7 +171,7 @@ properties: or applicable for the respective data port. More info in MIPI Alliance SoundWire 1.0 Specifications. minItems: 3 - maxItems: 5 + maxItems: 8 items: oneOf: - minimum: 0 @@ -187,7 +187,7 @@ properties: or applicable for the respective data port. More info in MIPI Alliance SoundWire 1.0 Specifications. minItems: 3 - maxItems: 5 + maxItems: 8 items: oneOf: - minimum: 0 -- cgit From 9ffa13ff78a0a55df968a72d6f0ebffccee5c9f4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 4 Jan 2023 01:34:02 +0000 Subject: io_uring: pin context while queueing deferred tw Unlike normal tw, nothing prevents deferred tw to be executed right after an tw item added to ->work_llist in io_req_local_work_add(). For instance, the waiting task may get waken up by CQ posting or a normal tw. 
Thus we need to pin the ring for the rest of io_req_local_work_add() Cc: stable@vger.kernel.org Fixes: c0e0d6ba25f18 ("io_uring: add IORING_SETUP_DEFER_TASKRUN") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1a79362b9c10b8523ef70b061d96523650a23344.1672795998.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 58ac13b69dc8..6bed44855679 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1236,13 +1236,18 @@ static void io_req_local_work_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) + percpu_ref_get(&ctx->refs); + + if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) { + percpu_ref_put(&ctx->refs); return; + } /* need it for the following io_cqring_wake() */ smp_mb__after_atomic(); if (unlikely(atomic_read(&req->task->io_uring->in_idle))) { io_move_task_work_from_local(ctx); + percpu_ref_put(&ctx->refs); return; } @@ -1252,6 +1257,7 @@ static void io_req_local_work_add(struct io_kiocb *req) if (ctx->has_evfd) io_eventfd_signal(ctx); __io_cqring_wake(ctx); + percpu_ref_put(&ctx->refs); } void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) -- cgit From f26cc9593581bd734c846bf827401350b36dc3c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 4 Jan 2023 01:34:57 +0000 Subject: io_uring: lockdep annotate CQ locking Locking around CQE posting is complex and depends on options the ring is created with, add more thorough lockdep annotations checking all invariants. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/aa3770b4eacae3915d782cc2ab2f395a99b4b232.1672795976.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++--- io_uring/io_uring.h | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6bed44855679..472574192dd6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -731,6 +731,8 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, size_t ocq_size = sizeof(struct io_overflow_cqe); bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + lockdep_assert_held(&ctx->completion_lock); + if (is_cqe32) ocq_size += sizeof(struct io_uring_cqe); @@ -820,9 +822,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, { struct io_uring_cqe *cqe; - if (!ctx->task_complete) - lockdep_assert_held(&ctx->completion_lock); - ctx->cq_extra++; /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e9f0d41ebb99..ab4b2a1c3b7e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -79,6 +79,19 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); +#define io_lockdep_assert_cq_locked(ctx) \ + do { \ + if (ctx->flags & IORING_SETUP_IOPOLL) { \ + lockdep_assert_held(&ctx->uring_lock); \ + } else if (!ctx->task_complete) { \ + lockdep_assert_held(&ctx->completion_lock); \ + } else if (ctx->submitter_task->flags & PF_EXITING) { \ + lockdep_assert(current_work()); \ + } else { \ + lockdep_assert(current == ctx->submitter_task); \ + } \ + } while (0) + static inline void io_req_task_work_add(struct io_kiocb *req) { __io_req_task_work_add(req, true); @@ -92,6 +105,8 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx); static inline struct io_uring_cqe 
*io_get_cqe_overflow(struct io_ring_ctx *ctx, bool overflow) { + io_lockdep_assert_cq_locked(ctx); + if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { struct io_uring_cqe *cqe = ctx->cqe_cached; -- cgit From 67fcb2c598bc7643f694e8194d5c300a52af5aa9 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 29 Dec 2022 14:04:46 -0800 Subject: cifs: Fix kmap_local_page() unmapping kmap_local_page() requires kunmap_local() to unmap the mapping. In addition memcpy_page() is provided to perform this common memcpy pattern. Replace the kmap_local_page() and broken kunmap() with memcpy_page() Fixes: d406d26745ab ("cifs: skip alloc when request has no pages") Reviewed-by: Paulo Alcantara Reviewed-by: "Fabio M. De Francesco" Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Ira Weiny Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index dc160de7a6de..0d7e9bcd9f34 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4488,17 +4488,12 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, /* copy pages form the old */ for (j = 0; j < npages; j++) { - char *dst, *src; unsigned int offset, len; rqst_page_get_length(new, j, &len, &offset); - dst = kmap_local_page(new->rq_pages[j]) + offset; - src = kmap_local_page(old->rq_pages[j]) + offset; - - memcpy(dst, src, len); - kunmap(new->rq_pages[j]); - kunmap(old->rq_pages[j]); + memcpy_page(new->rq_pages[j], offset, + old->rq_pages[j], offset, len); } } -- cgit From 9e6002c8738a9d5675ba706fcdbc0a544f814974 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Thu, 29 Dec 2022 12:33:55 -0300 Subject: cifs: ignore ipc reconnect failures during dfs failover If it failed to reconnect ipc used for getting referrals, we can just ignore it as it is not required for reconnecting the share. The worst case would be not being able to detect or chase nested links as long as dfs root server is unreachable. Before patch: $ mount.cifs //root/dfs/link /mnt -o echo_interval=10,... -> target share: /fs0/share disconnect root & fs0 $ ls /mnt ls: cannot access '/mnt': Host is down connect fs0 $ ls /mnt ls: cannot access '/mnt': Resource temporarily unavailable After patch: $ mount.cifs //root/dfs/link /mnt -o echo_interval=10,... 
-> target share: /fs0/share disconnect root & fs0 $ ls /mnt ls: cannot access '/mnt': Host is down connect fs0 $ ls /mnt bar.rtf dir1 foo Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/cifs/dfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/cifs/dfs.c b/fs/cifs/dfs.c index b541e68378f6..30086f2060a1 100644 --- a/fs/cifs/dfs.c +++ b/fs/cifs/dfs.c @@ -401,8 +401,7 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t if (ipc->need_reconnect) { scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname); rc = ops->tree_connect(xid, ipc->ses, tree, ipc, cifs_sb->local_nls); - if (rc) - break; + cifs_dbg(FYI, "%s: reconnect ipc: %d\n", __func__, rc); } scnprintf(tree, MAX_TREE_SIZE, "\\%s", share); -- cgit From 775e44d6d86dca400d614cbda5dab4def4951fe7 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Thu, 29 Dec 2022 12:33:56 -0300 Subject: cifs: fix race in assemble_neg_contexts() Serialise access of TCP_Server_Info::hostname in assemble_neg_contexts() by holding the server's mutex otherwise it might end up accessing an already-freed hostname pointer from cifs_reconnect() or cifs_resolve_server(). Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index a5695748a89b..2c484d47c592 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -541,9 +541,10 @@ static void assemble_neg_contexts(struct smb2_negotiate_req *req, struct TCP_Server_Info *server, unsigned int *total_len) { - char *pneg_ctxt; - char *hostname = NULL; unsigned int ctxt_len, neg_context_count; + struct TCP_Server_Info *pserver; + char *pneg_ctxt; + char *hostname; if (*total_len > 200) { /* In case length corrupted don't want to overrun smb buffer */ @@ -574,8 +575,9 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, * secondary channels don't have the hostname field populated * use the hostname field in the primary channel instead */ - hostname = CIFS_SERVER_IS_CHAN(server) ? - server->primary_server->hostname : server->hostname; + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + cifs_server_lock(pserver); + hostname = pserver->hostname; if (hostname && (hostname[0] != 0)) { ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, hostname); @@ -584,6 +586,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, neg_context_count = 3; } else neg_context_count = 2; + cifs_server_unlock(pserver); build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); *total_len += sizeof(struct smb2_posix_neg_context); -- cgit From 558016722e9d5bc0ac79c246ccd14a8a4eb028d4 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Tue, 3 Jan 2023 14:09:41 -0800 Subject: MAINTAINERS: Update maintainers for ptp_vmw driver Vivek has decided to transfer the maintainership of the VMware virtual PTP clock driver (ptp_vmw) to Srivatsa and Deep. Update the MAINTAINERS file to reflect this change, and also add Alexey as a reviewer for the driver. Signed-off-by: Srivatsa S. Bhat (VMware) Acked-by: Vivek Thampi Acked-by: Deep Shah Acked-by: Alexey Makhalov Signed-off-by: David S. 
Miller --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7f0b7181e60a..758878c0eddf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22243,7 +22243,9 @@ F: drivers/scsi/vmw_pvscsi.c F: drivers/scsi/vmw_pvscsi.h VMWARE VIRTUAL PTP CLOCK DRIVER -M: Vivek Thampi +M: Srivatsa S. Bhat (VMware) +M: Deep Shah +R: Alexey Makhalov R: VMware PV-Drivers Reviewers L: netdev@vger.kernel.org S: Supported -- cgit From a664ec9158eeddd75121d39c9a0758016097fa96 Mon Sep 17 00:00:00 2001 From: Rodrigo Branco Date: Tue, 3 Jan 2023 14:17:51 -0600 Subject: x86/bugs: Flush IBP in ib_prctl_set() We missed the window between the TIF flag update and the next reschedule. Signed-off-by: Rodrigo Branco Reviewed-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Cc: --- arch/x86/kernel/cpu/bugs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d970ddb0cc65..bca0bd8f4846 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1981,6 +1981,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); task_update_spec_tif(task); + if (task == current) + indirect_branch_prediction_barrier(); break; default: return -ERANGE; -- cgit From 340726747336716350eb5a928b860a29db955f05 Mon Sep 17 00:00:00 2001 From: Aaron Thompson Date: Wed, 4 Jan 2023 10:07:37 +0000 Subject: memblock tests: Fix compilation error. Commit cf4694be2b2cf ("tools: Add atomic_test_and_set_bit()") changed tools/arch/x86/include/asm/atomic.h to include , which causes 'make -C tools/testing/memblock' to fail with: In file included from ../../include/asm/atomic.h:6, from ../../include/linux/atomic.h:5, from ./linux/mmzone.h:5, from ../../include/linux/mm.h:5, from ../../include/linux/pfn.h:5, from ./linux/memory_hotplug.h:6, from ./linux/init.h:7, from ./linux/memblock.h:11, from tests/common.h:8, from tests/basic_api.h:5, from main.c:2: ../../include/asm/../../arch/x86/include/asm/atomic.h:11:10: fatal error: asm/asm.h: No such file or directory 11 | #include | ^~~~~~~~~~~ Create a symlink to asm/asm.h in the same manner as the existing one to asm/cmpxchg.h. 
Signed-off-by: Aaron Thompson Link: https://lore.kernel.org/r/010101857c402765-96e2dbc6-b82b-47e2-a437-4834dbe0b96b-000000@us-west-2.amazonses.com Signed-off-by: Mike Rapoport (IBM) --- tools/testing/memblock/.gitignore | 1 + tools/testing/memblock/Makefile | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/memblock/.gitignore b/tools/testing/memblock/.gitignore index 654338e0be52..4cc7cd5aac2b 100644 --- a/tools/testing/memblock/.gitignore +++ b/tools/testing/memblock/.gitignore @@ -1,4 +1,5 @@ main memblock.c linux/memblock.h +asm/asm.h asm/cmpxchg.h diff --git a/tools/testing/memblock/Makefile b/tools/testing/memblock/Makefile index 2310ac4d080e..7a1ca694a982 100644 --- a/tools/testing/memblock/Makefile +++ b/tools/testing/memblock/Makefile @@ -29,13 +29,14 @@ include: ../../../include/linux/memblock.h ../../include/linux/*.h \ @mkdir -p linux test -L linux/memblock.h || ln -s ../../../../include/linux/memblock.h linux/memblock.h + test -L asm/asm.h || ln -s ../../../arch/x86/include/asm/asm.h asm/asm.h test -L asm/cmpxchg.h || ln -s ../../../arch/x86/include/asm/cmpxchg.h asm/cmpxchg.h memblock.c: $(EXTR_SRC) test -L memblock.c || ln -s $(EXTR_SRC) memblock.c clean: - $(RM) $(TARGETS) $(OFILES) linux/memblock.h memblock.c asm/cmpxchg.h + $(RM) $(TARGETS) $(OFILES) linux/memblock.h memblock.c asm/asm.h asm/cmpxchg.h help: @echo 'Memblock simulator' -- cgit From fa81ab49bbe4e1ce756581c970486de0ddb14309 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 16 Dec 2022 14:03:03 +0400 Subject: memblock: Fix doc for memblock_phys_free memblock_phys_free() is the counterpart to memblock_phys_alloc. Change memblock_alloc_xx() with memblock_phys_alloc_xx() to keep consistency. Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20221216100304.688209-1-linmq006@gmail.com Signed-off-by: Mike Rapoport (IBM) --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index 511d4783dcf1..d036c7861310 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -836,7 +836,7 @@ void __init_memblock memblock_free(void *ptr, size_t size) * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * - * Free boot memory block previously allocated by memblock_alloc_xx() API. + * Free boot memory block previously allocated by memblock_phys_alloc_xx() API. * The freeing memory will not be released to the buddy allocator. */ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) -- cgit From fb710ddee75fb96f50ee6d004ef777a0cf7ad5a3 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 28 Dec 2022 15:57:03 +0100 Subject: perf test record_probe_libc_inet_pton: Fix test on s/390 where 'text_to_binary_address' now appears on the backtrace perf test '84: probe libc's inet_pton & backtrace it with ping' fails on s390. Debugging revealed a changed stack trace for the ping command using probes: ping 35729 [002] 8006.365063: probe_libc:inet_pton: (3ff9603e7c0) 13e7c0 __GI___inet_pton+0x0 (/usr/lib64/libc.so.6) ---> 104371 text_to_binary_address+0xef1 (inlined) 104371 gaih_inet+0xef1 (inlined) 104371 __GI_getaddrinfo+0xef1 (inlined) 5d4b main+0x139b (/usr/bin/ping) The line "---> text_to_binary_address ..." is new. It was introduced with glibc version 2.36.7.2 released with Fedora 37 for s390. Output before # perf test inet_pton 84: probe libc's inet_pton & backtrace it with ping : FAILED! 
# Output after: # perf test inet_pton 84: probe libc's inet_pton & backtrace it with ping : Ok # Signed-off-by: Thomas Richter Cc: Heiko Carstens Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: https://lore.kernel.org/r/20221228145704.2702487-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record+probe_libc_inet_pton.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh index 216b6b64caa3..57e7a6a470c9 100755 --- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh +++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh @@ -37,6 +37,7 @@ trace_libc_inet_pton_backtrace() { case "$(uname -m)" in s390x) eventattr='call-graph=dwarf,max-stack=4' + echo "text_to_binary_address.*\+0x[[:xdigit:]]+[[:space:]]\($libc|inlined\)$" >> $expected echo "gaih_inet.*\+0x[[:xdigit:]]+[[:space:]]\($libc|inlined\)$" >> $expected echo "(__GI_)?getaddrinfo\+0x[[:xdigit:]]+[[:space:]]\($libc|inlined\)$" >> $expected echo "main\+0x[[:xdigit:]]+[[:space:]]\(.*/bin/ping.*\)$" >> $expected -- cgit From 2d656b0f81b22101db0447f890e39fdd736b745e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 3 Jan 2023 22:44:01 -0800 Subject: perf stat: Fix handling of unsupported cgroup events when using BPF counters When --for-each-cgroup option is used, it fails when any of events is not supported and exits immediately. This is not how 'perf stat' handles unsupported events. Let's ignore the failure and proceed with others so that the output is similar to when BPF counters are not used: Before: $ sudo ./perf stat -a --bpf-counters -e L1-icache-loads,L1-dcache-loads --for-each-cgroup system.slice,user.slice sleep 1 Failed to open first cgroup events $ After it shows output similat to when --bpf-counters isn't specified: $ sudo ./perf stat -a --bpf-counters -e L1-icache-loads,L1-dcache-loads --for-each-cgroup system.slice,user.slice sleep 1 Performance counter stats for 'system wide': L1-icache-loads system.slice 29,892,418 L1-dcache-loads system.slice L1-icache-loads user.slice 52,497,220 L1-dcache-loads user.slice $ Fixes: 944138f048f7d759 ("perf stat: Enable BPF counter with --for-each-cgroup") Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20230104064402.1551516-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter_cgroup.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c index 3c2df7522f6f..1c82377ed78b 100644 --- a/tools/perf/util/bpf_counter_cgroup.c +++ b/tools/perf/util/bpf_counter_cgroup.c @@ -116,27 +116,19 @@ static int bperf_load_program(struct evlist *evlist) /* open single copy of the events w/o cgroup */ err = evsel__open_per_cpu(evsel, evsel->core.cpus, -1); - if (err) { - pr_err("Failed to open first cgroup events\n"); - goto out; - } + if (err == 0) + evsel->supported = true; map_fd = bpf_map__fd(skel->maps.events); perf_cpu_map__for_each_cpu(cpu, j, evsel->core.cpus) { int fd = FD(evsel, j); __u32 idx = evsel->core.idx * total_cpus + cpu.cpu; - err = bpf_map_update_elem(map_fd, &idx, &fd, - BPF_ANY); - if (err < 0) { - pr_err("Failed to update perf_event fd\n"); - goto out; - } + bpf_map_update_elem(map_fd, &idx, &fd, BPF_ANY); } evsel->cgrp = 
leader_cgrp; } - evsel->supported = true; if (evsel->cgrp == cgrp) continue; -- cgit From 54b353a20c7e8be98414754f5aff98c8a68fcc1f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 3 Jan 2023 22:44:02 -0800 Subject: perf stat: Fix handling of --for-each-cgroup with --bpf-counters to match non BPF mode The --for-each-cgroup can have the same cgroup multiple times, but this confuses BPF counters (since they have the same cgroup id), making only the last cgroup events to be counted. Let's check the cgroup name before adding a new entry to the cgroups list. Before: $ sudo ./perf stat -a --bpf-counters --for-each-cgroup /,/ sleep 1 Performance counter stats for 'system wide': msec cpu-clock / context-switches / cpu-migrations / page-faults / cycles / instructions / branches / branch-misses / 8,016.04 msec cpu-clock / # 7.998 CPUs utilized 6,152 context-switches / # 767.461 /sec 250 cpu-migrations / # 31.187 /sec 442 page-faults / # 55.139 /sec 613,111,487 cycles / # 0.076 GHz 280,599,604 instructions / # 0.46 insn per cycle 57,692,724 branches / # 7.197 M/sec 3,385,168 branch-misses / # 5.87% of all branches 1.002220125 seconds time elapsed After it becomes similar to the non-BPF mode: $ sudo ./perf stat -a --bpf-counters --for-each-cgroup /,/ sleep 1 Performance counter stats for 'system wide': 8,013.38 msec cpu-clock / # 7.998 CPUs utilized 6,859 context-switches / # 855.944 /sec 334 cpu-migrations / # 41.680 /sec 345 page-faults / # 43.053 /sec 782,326,119 cycles / # 0.098 GHz 471,645,724 instructions / # 0.60 insn per cycle 94,963,430 branches / # 11.851 M/sec 3,685,511 branch-misses / # 3.88% of all branches 1.001864539 seconds time elapsed Committer notes: As a reminder, to test with BPF counters one has to use BUILD_BPF_SKEL=1 in the make command line and have clang/llvm installed when building perf, otherwise the --bpf-counters option will not be available: # perf stat -a --bpf-counters --for-each-cgroup /,/ sleep 1 Error: unknown option `bpf-counters' Usage: perf stat [] [] -a, --all-cpus system-wide collection from all CPUs # Fixes: bb1c15b60b981d10 ("perf stat: Support regex pattern in --for-each-cgroup") Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: bpf@vger.kernel.org Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20230104064402.1551516-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cgroup.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index e99b41f9be45..cd978c240e0d 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -224,6 +224,19 @@ static int add_cgroup_name(const char *fpath, const struct stat *sb __maybe_unus return 0; } +static int check_and_add_cgroup_name(const char *fpath) +{ + struct cgroup_name *cn; + + list_for_each_entry(cn, &cgroup_list, list) { + if (!strcmp(cn->name, fpath)) + return 0; + } + + /* pretend if it's added by ftw() */ + return add_cgroup_name(fpath, NULL, FTW_D, NULL); +} + static void release_cgroup_list(void) { struct cgroup_name *cn; @@ -242,7 +255,7 @@ static int list_cgroups(const char *str) struct cgroup_name *cn; char *s; - /* use given name as is - for testing purpose */ + /* use given name as is when no regex is given */ for (;;) { p = strchr(str, ','); e = p ? 
p : eos; @@ -253,13 +266,13 @@ static int list_cgroups(const char *str) s = strndup(str, e - str); if (!s) return -1; - /* pretend if it's added by ftw() */ - ret = add_cgroup_name(s, NULL, FTW_D, NULL); + + ret = check_and_add_cgroup_name(s); free(s); - if (ret) + if (ret < 0) return -1; } else { - if (add_cgroup_name("", NULL, FTW_D, NULL) < 0) + if (check_and_add_cgroup_name("/") < 0) return -1; } -- cgit From 191f8453fc99a537ea78b727acea739782378b0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Jan 2023 07:48:37 -0700 Subject: ARM: renumber bits related to _TIF_WORK_MASK We want to ensure that the mask related to calling do_work_pending() is within the first 16 bits. Move bits unrelated to that outside of that range, to avoid spuriously calling do_work_pending() when we don't need to. Cc: stable@vger.kernel.org Fixes: 32d59773da38 ("arm: add support for TIF_NOTIFY_SIGNAL") Reported-and-tested-by: Hui Tang Suggested-by: Russell King (Oracle) Link: https://lore.kernel.org/lkml/7ecb8f3c-2aeb-a905-0d4a-aa768b9649b5@huawei.com/ Signed-off-by: Jens Axboe --- arch/arm/include/asm/thread_info.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index aecc403b2880..7f092cb55a41 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -128,15 +128,16 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define TIF_NEED_RESCHED 1 /* rescheduling necessary */ #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_UPROBE 3 /* breakpointed or singlestepping */ -#define TIF_SYSCALL_TRACE 4 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ -#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ -#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ -#define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ +#define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */ #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ -#define TIF_RESTORE_SIGMASK 20 +#define TIF_RESTORE_SIGMASK 19 +#define TIF_SYSCALL_TRACE 20 /* syscall trace active */ +#define TIF_SYSCALL_AUDIT 21 /* syscall auditing active */ +#define TIF_SYSCALL_TRACEPOINT 22 /* syscall tracepoint instrumentation */ +#define TIF_SECCOMP 23 /* seccomp syscall filtering active */ + #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -- cgit From 39a154fc2d172a3a5865e5a9fa2a2983eb7a99ac Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Thu, 29 Dec 2022 18:43:46 -0300 Subject: cifs: protect access of TCP_Server_Info::{dstaddr,hostname} Use the appropriate locks to protect access of hostname and dstaddr fields in cifs_tree_connect() as they might get changed by other tasks. 
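The pattern the patch applies is to hold the server's lock across every use of a field that a reconnect may reallocate, instead of sampling it unlocked. Below is a minimal userspace analogue with pthreads; the struct and helpers stand in for TCP_Server_Info and cifs_server_lock()/cifs_server_unlock(), so this is an illustration, not cifs code.

#include <pthread.h>
#include <stdio.h>

/* stand-in for TCP_Server_Info: hostname may be replaced by a reconnect */
struct server {
	pthread_mutex_t lock;
	char hostname[64];
};

/* reader: keep the lock held across the whole use of the string */
static void build_tree_name(struct server *srv, char *out, size_t len)
{
	pthread_mutex_lock(&srv->lock);
	snprintf(out, len, "\\\\%s\\IPC$", srv->hostname);
	pthread_mutex_unlock(&srv->lock);
}

/* writer: e.g. a reconnect updating the hostname */
static void set_hostname(struct server *srv, const char *name)
{
	pthread_mutex_lock(&srv->lock);
	snprintf(srv->hostname, sizeof(srv->hostname), "%s", name);
	pthread_mutex_unlock(&srv->lock);
}

int main(void)
{
	struct server srv;
	char tree[128];

	pthread_mutex_init(&srv.lock, NULL);
	set_hostname(&srv, "fileserver");
	build_tree_name(&srv, tree, sizeof(tree));
	printf("%s\n", tree);
	return 0;
}

The diff below applies the same shape with cifs_server_lock()/cifs_server_unlock() around hostname and spin_lock(&server->srv_lock) around dstaddr.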
Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/cifs/dfs.c | 22 +++++++++++----------- fs/cifs/misc.c | 2 ++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/cifs/dfs.c b/fs/cifs/dfs.c index 30086f2060a1..b64d20374b9c 100644 --- a/fs/cifs/dfs.c +++ b/fs/cifs/dfs.c @@ -327,8 +327,8 @@ static int update_server_fullpath(struct TCP_Server_Info *server, struct cifs_sb return rc; } -static int target_share_matches_server(struct TCP_Server_Info *server, const char *tcp_host, - size_t tcp_host_len, char *share, bool *target_match) +static int target_share_matches_server(struct TCP_Server_Info *server, char *share, + bool *target_match) { int rc = 0; const char *dfs_host; @@ -338,13 +338,16 @@ static int target_share_matches_server(struct TCP_Server_Info *server, const cha extract_unc_hostname(share, &dfs_host, &dfs_host_len); /* Check if hostnames or addresses match */ - if (dfs_host_len != tcp_host_len || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) { - cifs_dbg(FYI, "%s: %.*s doesn't match %.*s\n", __func__, (int)dfs_host_len, - dfs_host, (int)tcp_host_len, tcp_host); + cifs_server_lock(server); + if (dfs_host_len != strlen(server->hostname) || + strncasecmp(dfs_host, server->hostname, dfs_host_len)) { + cifs_dbg(FYI, "%s: %.*s doesn't match %s\n", __func__, + (int)dfs_host_len, dfs_host, server->hostname); rc = match_target_ip(server, dfs_host, dfs_host_len, target_match); if (rc) cifs_dbg(VFS, "%s: failed to match target ip: %d\n", __func__, rc); } + cifs_server_unlock(server); return rc; } @@ -358,13 +361,9 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t struct cifs_ses *root_ses = CIFS_DFS_ROOT_SES(tcon->ses); struct cifs_tcon *ipc = root_ses->tcon_ipc; char *share = NULL, *prefix = NULL; - const char *tcp_host; - size_t tcp_host_len; struct dfs_cache_tgt_iterator *tit; bool target_match; - extract_unc_hostname(server->hostname, &tcp_host, &tcp_host_len); - tit = dfs_cache_get_tgt_iterator(tl); if (!tit) { rc = -ENOENT; @@ -387,8 +386,7 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t break; } - rc = target_share_matches_server(server, tcp_host, tcp_host_len, share, - &target_match); + rc = target_share_matches_server(server, share, &target_match); if (rc) break; if (!target_match) { @@ -497,7 +495,9 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru } if (tcon->ipc) { + cifs_server_lock(server); scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname); + cifs_server_unlock(server); rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc); goto out; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 4d3c586785a5..2a19c7987c5b 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1277,7 +1277,9 @@ int match_target_ip(struct TCP_Server_Info *server, if (rc < 0) return rc; + spin_lock(&server->srv_lock); *result = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, (struct sockaddr *)&ss); + spin_unlock(&server->srv_lock); cifs_dbg(FYI, "%s: ip addresses match: %u\n", __func__, *result); return 0; } -- cgit From 3792fc508c095abd84b10ceae12bd773e61fdc36 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 15 Nov 2022 16:15:18 +0300 Subject: drm/i915: unpin on error in intel_vgpu_shadow_mm_pin() Call intel_vgpu_unpin_mm() on this error path. 
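The underlying bug class is an acquire/release asymmetry: a reference taken earlier in the function must be dropped on every early-return path. A generic, self-contained sketch of the corrected shape follows; pin/unpin are placeholders for this illustration, not the gvt API.

#include <stdio.h>

struct mm { int pinned; int shadowed; };

static int pin(struct mm *m)    { m->pinned++; return 0; }
static void unpin(struct mm *m) { m->pinned--; }

static int prepare_workload(struct mm *m)
{
	int ret = pin(m);

	if (ret)
		return ret;

	if (!m->shadowed) {
		unpin(m);	/* the fix: undo the pin on this error path */
		return -1;	/* -EINVAL in the kernel */
	}

	/* ... use the pinned mm ... */
	unpin(m);
	return 0;
}

int main(void)
{
	struct mm m = { .pinned = 0, .shadowed = 0 };

	prepare_workload(&m);
	printf("pin count after error path: %d\n", m.pinned);	/* 0, not leaked */
	return 0;
}

The one-line fix below adds exactly that unpin before the early return.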
Fixes: 418741480809 ("drm/i915/gvt: Adding ppgtt to GVT GEM context after shadow pdps settled.") Signed-off-by: Dan Carpenter Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/Y3OQ5tgZIVxyQ/WV@kili Reviewed-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/scheduler.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index 9cd8fcbf7cad..8009239935f7 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -695,6 +695,7 @@ intel_vgpu_shadow_mm_pin(struct intel_vgpu_workload *workload) if (workload->shadow_mm->type != INTEL_GVT_MM_PPGTT || !workload->shadow_mm->ppgtt_mm.shadowed) { + intel_vgpu_unpin_mm(workload->shadow_mm); gvt_vgpu_err("workload shadow ppgtt isn't ready\n"); return -EINVAL; } -- cgit From c4b850d1f448a901fbf4f7f36dec38c84009b489 Mon Sep 17 00:00:00 2001 From: Zhenyu Wang Date: Mon, 19 Dec 2022 22:03:56 +0800 Subject: drm/i915/gvt: fix gvt debugfs destroy When gvt debug fs is destroyed, need to have a sane check if drm minor's debugfs root is still available or not, otherwise in case like device remove through unbinding, drm minor's debugfs directory has already been removed, then intel_gvt_debugfs_clean() would act upon dangling pointer like below oops. i915 0000:00:02.0: Direct firmware load for i915/gvt/vid_0x8086_did_0x1926_rid_0x0a.golden_hw_state failed with error -2 i915 0000:00:02.0: MDEV: Registered Console: switching to colour dummy device 80x25 i915 0000:00:02.0: MDEV: Unregistering BUG: kernel NULL pointer dereference, address: 00000000000000a0 PGD 0 P4D 0 Oops: 0002 [#1] PREEMPT SMP PTI CPU: 2 PID: 2486 Comm: gfx-unbind.sh Tainted: G I 6.1.0-rc8+ #15 Hardware name: Dell Inc. XPS 13 9350/0JXC1H, BIOS 1.13.0 02/10/2020 RIP: 0010:down_write+0x1f/0x90 Code: 1d ff ff 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 53 48 89 fb e8 62 c0 ff ff bf 01 00 00 00 e8 28 5e 31 ff 31 c0 ba 01 00 00 00 48 0f b1 13 75 33 65 48 8b 04 25 c0 bd 01 00 48 89 43 08 bf 01 RSP: 0018:ffff9eb3036ffcc8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 00000000000000a0 RCX: ffffff8100000000 RDX: 0000000000000001 RSI: 0000000000000064 RDI: ffffffffa48787a8 RBP: ffff9eb3036ffd30 R08: ffffeb1fc45a0608 R09: ffffeb1fc45a05c0 R10: 0000000000000002 R11: 0000000000000000 R12: 0000000000000000 R13: ffff91acc33fa328 R14: ffff91acc033f080 R15: ffff91acced533e0 FS: 00007f6947bba740(0000) GS:ffff91ae36d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000a0 CR3: 00000001133a2002 CR4: 00000000003706e0 Call Trace: simple_recursive_removal+0x9f/0x2a0 ? start_creating.part.0+0x120/0x120 ? 
_raw_spin_lock+0x13/0x40 debugfs_remove+0x40/0x60 intel_gvt_debugfs_clean+0x15/0x30 [kvmgt] intel_gvt_clean_device+0x49/0xe0 [kvmgt] intel_gvt_driver_remove+0x2f/0xb0 i915_driver_remove+0xa4/0xf0 i915_pci_remove+0x1a/0x30 pci_device_remove+0x33/0xa0 device_release_driver_internal+0x1b2/0x230 unbind_store+0xe0/0x110 kernfs_fop_write_iter+0x11b/0x1f0 vfs_write+0x203/0x3d0 ksys_write+0x63/0xe0 do_syscall_64+0x37/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f6947cb5190 Code: 40 00 48 8b 15 71 9c 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 80 3d 51 24 0e 00 00 74 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 48 89 RSP: 002b:00007ffcbac45a28 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000000000d RCX: 00007f6947cb5190 RDX: 000000000000000d RSI: 0000555e35c866a0 RDI: 0000000000000001 RBP: 0000555e35c866a0 R08: 0000000000000002 R09: 0000555e358cb97c R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000001 R13: 000000000000000d R14: 0000000000000000 R15: 0000555e358cb8e0 Modules linked in: kvmgt CR2: 00000000000000a0 ---[ end trace 0000000000000000 ]--- Cc: Wang, Zhi Cc: He, Yu Cc: stable@vger.kernel.org Reviewed-by: Zhi Wang Fixes: bc7b0be316ae ("drm/i915/gvt: Add basic debugfs infrastructure") Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20221219140357.769557-1-zhenyuw@linux.intel.com --- drivers/gpu/drm/i915/gvt/debugfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/debugfs.c b/drivers/gpu/drm/i915/gvt/debugfs.c index 9f1c209d9251..d7df27feee8c 100644 --- a/drivers/gpu/drm/i915/gvt/debugfs.c +++ b/drivers/gpu/drm/i915/gvt/debugfs.c @@ -199,6 +199,10 @@ void intel_gvt_debugfs_init(struct intel_gvt *gvt) */ void intel_gvt_debugfs_clean(struct intel_gvt *gvt) { - debugfs_remove_recursive(gvt->debugfs_root); - gvt->debugfs_root = NULL; + struct drm_minor *minor = gvt->gt->i915->drm.primary; + + if (minor->debugfs_root) { + debugfs_remove_recursive(gvt->debugfs_root); + gvt->debugfs_root = NULL; + } } -- cgit From 704f3384f322b40ba24d958473edfb1c9750c8fd Mon Sep 17 00:00:00 2001 From: Zhenyu Wang Date: Mon, 19 Dec 2022 22:03:57 +0800 Subject: drm/i915/gvt: fix vgpu debugfs clean in remove Check carefully on root debugfs available when destroying vgpu, e.g in remove case drm minor's debugfs root might already be destroyed, which led to kernel oops like below. Console: switching to colour dummy device 80x25 i915 0000:00:02.0: MDEV: Unregistering intel_vgpu_mdev b1338b2d-a709-4c23-b766-cc436c36cdf0: Removing from iommu group 14 BUG: kernel NULL pointer dereference, address: 0000000000000150 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 3 PID: 1046 Comm: driverctl Not tainted 6.1.0-rc2+ #6 Hardware name: HP HP ProDesk 600 G3 MT/829D, BIOS P02 Ver. 
02.44 09/13/2022 RIP: 0010:__lock_acquire+0x5e2/0x1f90 Code: 87 ad 09 00 00 39 05 e1 1e cc 02 0f 82 f1 09 00 00 ba 01 00 00 00 48 83 c4 48 89 d0 5b 5d 41 5c 41 5d 41 5e 41 5f c3 45 31 ff <48> 81 3f 60 9e c2 b6 45 0f 45 f8 83 fe 01 0f 87 55 fa ff ff 89 f0 RSP: 0018:ffff9f770274f948 EFLAGS: 00010046 RAX: 0000000000000003 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000150 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 R10: ffff8895d1173300 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000150 R14: 0000000000000000 R15: 0000000000000000 FS: 00007fc9b2ba0740(0000) GS:ffff889cdfcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000150 CR3: 000000010fd93005 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: lock_acquire+0xbf/0x2b0 ? simple_recursive_removal+0xa5/0x2b0 ? lock_release+0x13d/0x2d0 down_write+0x2a/0xd0 ? simple_recursive_removal+0xa5/0x2b0 simple_recursive_removal+0xa5/0x2b0 ? start_creating.part.0+0x110/0x110 ? _raw_spin_unlock+0x29/0x40 debugfs_remove+0x40/0x60 intel_gvt_debugfs_remove_vgpu+0x15/0x30 [kvmgt] intel_gvt_destroy_vgpu+0x60/0x100 [kvmgt] intel_vgpu_release_dev+0xe/0x20 [kvmgt] device_release+0x30/0x80 kobject_put+0x79/0x1b0 device_release_driver_internal+0x1b8/0x230 bus_remove_device+0xec/0x160 device_del+0x189/0x400 ? up_write+0x9c/0x1b0 ? mdev_device_remove_common+0x60/0x60 [mdev] mdev_device_remove_common+0x22/0x60 [mdev] mdev_device_remove_cb+0x17/0x20 [mdev] device_for_each_child+0x56/0x80 mdev_unregister_parent+0x5a/0x81 [mdev] intel_gvt_clean_device+0x2d/0xe0 [kvmgt] intel_gvt_driver_remove+0x2e/0xb0 [i915] i915_driver_remove+0xac/0x100 [i915] i915_pci_remove+0x1a/0x30 [i915] pci_device_remove+0x31/0xa0 device_release_driver_internal+0x1b8/0x230 unbind_store+0xd8/0x100 kernfs_fop_write_iter+0x156/0x210 vfs_write+0x236/0x4a0 ksys_write+0x61/0xd0 do_syscall_64+0x55/0x80 ? find_held_lock+0x2b/0x80 ? lock_release+0x13d/0x2d0 ? up_read+0x17/0x20 ? lock_is_held_type+0xe3/0x140 ? asm_exc_page_fault+0x22/0x30 ? 
lockdep_hardirqs_on+0x7d/0x100 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7fc9b2c9e0c4 Code: 15 71 7d 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 80 3d 3d 05 0e 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 48 83 ec 28 48 89 54 24 18 48 RSP: 002b:00007ffec29c81c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000000000d RCX: 00007fc9b2c9e0c4 RDX: 000000000000000d RSI: 0000559f8b5f48a0 RDI: 0000000000000001 RBP: 0000559f8b5f48a0 R08: 0000559f8b5f3540 R09: 00007fc9b2d76d30 R10: 0000000000000000 R11: 0000000000000202 R12: 000000000000000d R13: 00007fc9b2d77780 R14: 000000000000000d R15: 00007fc9b2d72a00 Modules linked in: sunrpc intel_rapl_msr intel_rapl_common intel_pmc_core_pltdrv intel_pmc_core intel_tcc_cooling x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel ee1004 igbvf rapl vfat fat intel_cstate intel_uncore pktcdvd i2c_i801 pcspkr wmi_bmof i2c_smbus acpi_pad vfio_pci vfio_pci_core vfio_virqfd zram fuse dm_multipath kvmgt mdev vfio_iommu_type1 vfio kvm irqbypass i915 nvme e1000e igb nvme_core crct10dif_pclmul crc32_pclmul crc32c_intel polyval_clmulni polyval_generic serio_raw ghash_clmulni_intel sha512_ssse3 dca drm_buddy intel_gtt video wmi drm_display_helper ttm CR2: 0000000000000150 ---[ end trace 0000000000000000 ]--- Cc: Wang Zhi Cc: He Yu Cc: Alex Williamson Cc: stable@vger.kernel.org Reviewed-by: Zhi Wang Tested-by: Yu He Fixes: bc7b0be316ae ("drm/i915/gvt: Add basic debugfs infrastructure") Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20221219140357.769557-2-zhenyuw@linux.intel.com --- drivers/gpu/drm/i915/gvt/debugfs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/debugfs.c b/drivers/gpu/drm/i915/gvt/debugfs.c index d7df27feee8c..e08ed0e9f165 100644 --- a/drivers/gpu/drm/i915/gvt/debugfs.c +++ b/drivers/gpu/drm/i915/gvt/debugfs.c @@ -175,8 +175,13 @@ void intel_gvt_debugfs_add_vgpu(struct intel_vgpu *vgpu) */ void intel_gvt_debugfs_remove_vgpu(struct intel_vgpu *vgpu) { - debugfs_remove_recursive(vgpu->debugfs); - vgpu->debugfs = NULL; + struct intel_gvt *gvt = vgpu->gvt; + struct drm_minor *minor = gvt->gt->i915->drm.primary; + + if (minor->debugfs_root && gvt->debugfs_root) { + debugfs_remove_recursive(vgpu->debugfs); + vgpu->debugfs = NULL; + } } /** -- cgit From a06d4b9e15c0ea4e05b200cfb1f1050e785a5e87 Mon Sep 17 00:00:00 2001 From: Zhi Wang Date: Thu, 10 Nov 2022 12:20:34 +0000 Subject: drm/i915/gvt: use atomic operations to change the vGPU status Several vGPU status are used to decide the availability of GVT-g core logics when creating a vGPU. Use atomic operations on changing the vGPU status to avoid the racing. 
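For illustration only, a minimal sketch of the pattern this patch adopts (all names below are made up; the real enum, struct and helper changes are in the diff that follows). Plain bool flags read and written from several contexts are racy unless every access is serialized by one lock, whereas a single DECLARE_BITMAP() changed only through the atomic set_bit()/clear_bit()/test_bit() helpers gives each status bit well-defined set/clear/test semantics without that lock:

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/types.h>

/* Hedged sketch, not the actual GVT code. */
enum {
	EXAMPLE_STATUS_ATTACHED = 0,
	EXAMPLE_STATUS_ACTIVE,
	EXAMPLE_STATUS_NR_BITS,
};

struct example_vgpu {
	/* before: separate flags, racy without a shared lock */
	bool active;
	bool attached;
	/* after: one bitmap, only ever changed with atomic bitops */
	DECLARE_BITMAP(status, EXAMPLE_STATUS_NR_BITS);
};

static void example_activate(struct example_vgpu *v)
{
	/* atomic read-modify-write on the bitmap word */
	set_bit(EXAMPLE_STATUS_ACTIVE, v->status);
}

static bool example_is_active(struct example_vgpu *v)
{
	return test_bit(EXAMPLE_STATUS_ACTIVE, v->status);
}
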
Cc: Zhenyu Wang Cc: Kevin Tian Cc: Jason Gunthorpe Cc: intel-gvt-dev@lists.freedesktop.org Suggested-by: Alex Williamson Signed-off-by: Zhi Wang Reviewed-by: Zhenyu Wang Reviewed-by: Kevin Tian Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20221110122034.3382-2-zhi.a.wang@intel.com --- drivers/gpu/drm/i915/gvt/debugfs.c | 19 ++++++++++++++++++- drivers/gpu/drm/i915/gvt/dmabuf.c | 3 ++- drivers/gpu/drm/i915/gvt/gtt.c | 4 ++-- drivers/gpu/drm/i915/gvt/gvt.h | 15 ++++++++++----- drivers/gpu/drm/i915/gvt/interrupt.c | 2 +- drivers/gpu/drm/i915/gvt/kvmgt.c | 35 +++++++++++++---------------------- drivers/gpu/drm/i915/gvt/scheduler.c | 3 ++- drivers/gpu/drm/i915/gvt/vgpu.c | 12 +++++------- 8 files changed, 53 insertions(+), 40 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/debugfs.c b/drivers/gpu/drm/i915/gvt/debugfs.c index e08ed0e9f165..0616b73175f3 100644 --- a/drivers/gpu/drm/i915/gvt/debugfs.c +++ b/drivers/gpu/drm/i915/gvt/debugfs.c @@ -151,6 +151,22 @@ DEFINE_SIMPLE_ATTRIBUTE(vgpu_scan_nonprivbb_fops, vgpu_scan_nonprivbb_get, vgpu_scan_nonprivbb_set, "0x%llx\n"); +static int vgpu_status_get(void *data, u64 *val) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *)data; + + *val = 0; + + if (test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) + *val |= (1 << INTEL_VGPU_STATUS_ATTACHED); + if (test_bit(INTEL_VGPU_STATUS_ACTIVE, vgpu->status)) + *val |= (1 << INTEL_VGPU_STATUS_ACTIVE); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vgpu_status_fops, vgpu_status_get, NULL, "0x%llx\n"); + /** * intel_gvt_debugfs_add_vgpu - register debugfs entries for a vGPU * @vgpu: a vGPU @@ -162,11 +178,12 @@ void intel_gvt_debugfs_add_vgpu(struct intel_vgpu *vgpu) snprintf(name, 16, "vgpu%d", vgpu->id); vgpu->debugfs = debugfs_create_dir(name, vgpu->gvt->debugfs_root); - debugfs_create_bool("active", 0444, vgpu->debugfs, &vgpu->active); debugfs_create_file("mmio_diff", 0444, vgpu->debugfs, vgpu, &vgpu_mmio_diff_fops); debugfs_create_file("scan_nonprivbb", 0644, vgpu->debugfs, vgpu, &vgpu_scan_nonprivbb_fops); + debugfs_create_file("status", 0644, vgpu->debugfs, vgpu, + &vgpu_status_fops); } /** diff --git a/drivers/gpu/drm/i915/gvt/dmabuf.c b/drivers/gpu/drm/i915/gvt/dmabuf.c index 355f1c0e8664..ffe41e9be04f 100644 --- a/drivers/gpu/drm/i915/gvt/dmabuf.c +++ b/drivers/gpu/drm/i915/gvt/dmabuf.c @@ -134,7 +134,8 @@ static void dmabuf_gem_object_free(struct kref *kref) struct list_head *pos; struct intel_vgpu_dmabuf_obj *dmabuf_obj; - if (vgpu && vgpu->active && !list_empty(&vgpu->dmabuf_obj_list_head)) { + if (vgpu && test_bit(INTEL_VGPU_STATUS_ACTIVE, vgpu->status) && + !list_empty(&vgpu->dmabuf_obj_list_head)) { list_for_each(pos, &vgpu->dmabuf_obj_list_head) { dmabuf_obj = list_entry(pos, struct intel_vgpu_dmabuf_obj, list); if (dmabuf_obj == obj) { diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index 51e5e8fb505b..6b4039010cae 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -55,7 +55,7 @@ static bool intel_gvt_is_valid_gfn(struct intel_vgpu *vgpu, unsigned long gfn) int idx; bool ret; - if (!vgpu->attached) + if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) return false; idx = srcu_read_lock(&kvm->srcu); @@ -1178,7 +1178,7 @@ static int is_2MB_gtt_possible(struct intel_vgpu *vgpu, if (!HAS_PAGE_SIZES(vgpu->gvt->gt->i915, I915_GTT_PAGE_SIZE_2M)) return 0; - if (!vgpu->attached) + if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) return -EINVAL; pfn = gfn_to_pfn(vgpu->vfio_device.kvm, 
ops->get_pfn(entry)); if (is_error_noslot_pfn(pfn)) diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 62823c0e13ab..2d65800d8e93 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -172,13 +172,18 @@ struct intel_vgpu_submission { #define KVMGT_DEBUGFS_FILENAME "kvmgt_nr_cache_entries" +enum { + INTEL_VGPU_STATUS_ATTACHED = 0, + INTEL_VGPU_STATUS_ACTIVE, + INTEL_VGPU_STATUS_NR_BITS, +}; + struct intel_vgpu { struct vfio_device vfio_device; struct intel_gvt *gvt; struct mutex vgpu_lock; int id; - bool active; - bool attached; + DECLARE_BITMAP(status, INTEL_VGPU_STATUS_NR_BITS); bool pv_notified; bool failsafe; unsigned int resetting_eng; @@ -467,7 +472,7 @@ void intel_vgpu_write_fence(struct intel_vgpu *vgpu, #define for_each_active_vgpu(gvt, vgpu, id) \ idr_for_each_entry((&(gvt)->vgpu_idr), (vgpu), (id)) \ - for_each_if(vgpu->active) + for_each_if(test_bit(INTEL_VGPU_STATUS_ACTIVE, vgpu->status)) static inline void intel_vgpu_write_pci_bar(struct intel_vgpu *vgpu, u32 offset, u32 val, bool low) @@ -725,7 +730,7 @@ static inline bool intel_gvt_mmio_is_cmd_write_patch( static inline int intel_gvt_read_gpa(struct intel_vgpu *vgpu, unsigned long gpa, void *buf, unsigned long len) { - if (!vgpu->attached) + if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) return -ESRCH; return vfio_dma_rw(&vgpu->vfio_device, gpa, buf, len, false); } @@ -743,7 +748,7 @@ static inline int intel_gvt_read_gpa(struct intel_vgpu *vgpu, unsigned long gpa, static inline int intel_gvt_write_gpa(struct intel_vgpu *vgpu, unsigned long gpa, void *buf, unsigned long len) { - if (!vgpu->attached) + if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) return -ESRCH; return vfio_dma_rw(&vgpu->vfio_device, gpa, buf, len, true); } diff --git a/drivers/gpu/drm/i915/gvt/interrupt.c b/drivers/gpu/drm/i915/gvt/interrupt.c index a6b2021b665f..68eca023bbc6 100644 --- a/drivers/gpu/drm/i915/gvt/interrupt.c +++ b/drivers/gpu/drm/i915/gvt/interrupt.c @@ -433,7 +433,7 @@ static int inject_virtual_interrupt(struct intel_vgpu *vgpu) * enabled by guest. so if msi_trigger is null, success is still * returned and don't inject interrupt into guest. 
*/ - if (!vgpu->attached) + if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) return -ESRCH; if (vgpu->msi_trigger && eventfd_signal(vgpu->msi_trigger, 1) != 1) return -EFAULT; diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index f5451adcd489..8ae7039b3683 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -638,7 +638,7 @@ static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu) mutex_lock(&vgpu->gvt->lock); for_each_active_vgpu(vgpu->gvt, itr, id) { - if (!itr->attached) + if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, itr->status)) continue; if (vgpu->vfio_device.kvm == itr->vfio_device.kvm) { @@ -655,9 +655,6 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) { struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); - if (vgpu->attached) - return -EEXIST; - if (!vgpu->vfio_device.kvm || vgpu->vfio_device.kvm->mm != current->mm) { gvt_vgpu_err("KVM is required to use Intel vGPU\n"); @@ -667,14 +664,14 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) if (__kvmgt_vgpu_exist(vgpu)) return -EEXIST; - vgpu->attached = true; - vgpu->track_node.track_write = kvmgt_page_track_write; vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot; kvm_get_kvm(vgpu->vfio_device.kvm); kvm_page_track_register_notifier(vgpu->vfio_device.kvm, &vgpu->track_node); + set_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status); + debugfs_create_ulong(KVMGT_DEBUGFS_FILENAME, 0444, vgpu->debugfs, &vgpu->nr_cache_entries); @@ -698,11 +695,10 @@ static void intel_vgpu_close_device(struct vfio_device *vfio_dev) { struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); - if (!vgpu->attached) - return; - intel_gvt_release_vgpu(vgpu); + clear_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status); + debugfs_remove(debugfs_lookup(KVMGT_DEBUGFS_FILENAME, vgpu->debugfs)); kvm_page_track_unregister_notifier(vgpu->vfio_device.kvm, @@ -718,8 +714,6 @@ static void intel_vgpu_close_device(struct vfio_device *vfio_dev) vgpu->dma_addr_cache = RB_ROOT; intel_vgpu_release_msi_eventfd_ctx(vgpu); - - vgpu->attached = false; } static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar) @@ -1512,9 +1506,6 @@ static void intel_vgpu_remove(struct mdev_device *mdev) { struct intel_vgpu *vgpu = dev_get_drvdata(&mdev->dev); - if (WARN_ON_ONCE(vgpu->attached)) - return; - vfio_unregister_group_dev(&vgpu->vfio_device); vfio_put_device(&vgpu->vfio_device); } @@ -1559,7 +1550,7 @@ int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn) struct kvm_memory_slot *slot; int idx; - if (!info->attached) + if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, info->status)) return -ESRCH; idx = srcu_read_lock(&kvm->srcu); @@ -1589,8 +1580,8 @@ int intel_gvt_page_track_remove(struct intel_vgpu *info, u64 gfn) struct kvm_memory_slot *slot; int idx; - if (!info->attached) - return 0; + if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, info->status)) + return -ESRCH; idx = srcu_read_lock(&kvm->srcu); slot = gfn_to_memslot(kvm, gfn); @@ -1668,7 +1659,7 @@ int intel_gvt_dma_map_guest_page(struct intel_vgpu *vgpu, unsigned long gfn, struct gvt_dma *entry; int ret; - if (!vgpu->attached) + if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) return -EINVAL; mutex_lock(&vgpu->cache_lock); @@ -1714,8 +1705,8 @@ int intel_gvt_dma_pin_guest_page(struct intel_vgpu *vgpu, dma_addr_t dma_addr) struct gvt_dma *entry; int ret = 0; - if (!vgpu->attached) - return -ENODEV; + if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) + return -EINVAL; mutex_lock(&vgpu->cache_lock); 
entry = __gvt_cache_find_dma_addr(vgpu, dma_addr); @@ -1742,7 +1733,7 @@ void intel_gvt_dma_unmap_guest_page(struct intel_vgpu *vgpu, { struct gvt_dma *entry; - if (!vgpu->attached) + if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) return; mutex_lock(&vgpu->cache_lock); @@ -1778,7 +1769,7 @@ static void intel_gvt_test_and_emulate_vblank(struct intel_gvt *gvt) idr_for_each_entry((&(gvt)->vgpu_idr), (vgpu), (id)) { if (test_and_clear_bit(INTEL_GVT_REQUEST_EMULATE_VBLANK + id, (void *)&gvt->service_request)) { - if (vgpu->active) + if (test_bit(INTEL_VGPU_STATUS_ACTIVE, vgpu->status)) intel_vgpu_emulate_vblank(vgpu); } } diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index 8009239935f7..f4055804aad1 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -866,7 +866,8 @@ pick_next_workload(struct intel_gvt *gvt, struct intel_engine_cs *engine) goto out; } - if (!scheduler->current_vgpu->active || + if (!test_bit(INTEL_VGPU_STATUS_ACTIVE, + scheduler->current_vgpu->status) || list_empty(workload_q_head(scheduler->current_vgpu, engine))) goto out; diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 3c529c2705dd..a5497440484f 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -166,9 +166,7 @@ void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt) */ void intel_gvt_activate_vgpu(struct intel_vgpu *vgpu) { - mutex_lock(&vgpu->vgpu_lock); - vgpu->active = true; - mutex_unlock(&vgpu->vgpu_lock); + set_bit(INTEL_VGPU_STATUS_ACTIVE, vgpu->status); } /** @@ -183,7 +181,7 @@ void intel_gvt_deactivate_vgpu(struct intel_vgpu *vgpu) { mutex_lock(&vgpu->vgpu_lock); - vgpu->active = false; + clear_bit(INTEL_VGPU_STATUS_ACTIVE, vgpu->status); if (atomic_read(&vgpu->submission.running_workload_num)) { mutex_unlock(&vgpu->vgpu_lock); @@ -228,7 +226,8 @@ void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu) struct intel_gvt *gvt = vgpu->gvt; struct drm_i915_private *i915 = gvt->gt->i915; - drm_WARN(&i915->drm, vgpu->active, "vGPU is still active!\n"); + drm_WARN(&i915->drm, test_bit(INTEL_VGPU_STATUS_ACTIVE, vgpu->status), + "vGPU is still active!\n"); /* * remove idr first so later clean can judge if need to stop @@ -285,8 +284,7 @@ struct intel_vgpu *intel_gvt_create_idle_vgpu(struct intel_gvt *gvt) if (ret) goto out_free_vgpu; - vgpu->active = false; - + clear_bit(INTEL_VGPU_STATUS_ACTIVE, vgpu->status); return vgpu; out_free_vgpu: -- cgit From 4a61648af68f5ba4884f0e3b494ee1cabc4b6620 Mon Sep 17 00:00:00 2001 From: Zheng Wang Date: Fri, 30 Dec 2022 00:56:41 +0800 Subject: drm/i915/gvt: fix double free bug in split_2MB_gtt_entry If intel_gvt_dma_map_guest_page failed, it will call ppgtt_invalidate_spt, which will finally free the spt. But the caller function ppgtt_populate_spt_by_guest_entry does not notice that, it will free spt again in its error path. Fix this by canceling the mapping of DMA address and freeing sub_spt. Besides, leave the handle of spt destroy to caller function instead of callee function when error occurs. 
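As a generic illustration of this bug class (plain C, not the GVT code; every name here is invented): when a callee frees an object on failure while the caller's error path frees it again, the result is a double free. Giving the error-path cleanup a single owner, here the caller, removes the second free:

#include <stdio.h>
#include <stdlib.h>

struct res { int payload; };

/* Buggy shape: the callee frees 'r' on failure... */
static int callee_buggy(struct res *r, int fail)
{
	if (fail) {
		free(r);
		return -1;
	}
	return 0;
}

/* ...and the caller frees it again in its error path. */
static void caller_buggy(int fail)
{
	struct res *r = malloc(sizeof(*r));

	if (!r)
		return;
	if (callee_buggy(r, fail) < 0) {
		free(r);	/* double free whenever fail != 0 */
		return;
	}
	free(r);
}

/* Fixed shape: the callee only undoes its own partial work and leaves
 * destruction of 'r' to the caller, so there is exactly one free. */
static int callee_fixed(struct res *r, int fail)
{
	(void)r;
	return fail ? -1 : 0;
}

static void caller_fixed(int fail)
{
	struct res *r = malloc(sizeof(*r));

	if (!r)
		return;
	if (callee_fixed(r, fail) < 0) {
		free(r);	/* single owner of the error-path free */
		return;
	}
	free(r);
}

int main(void)
{
	caller_fixed(1);	/* safe */
	caller_buggy(0);	/* safe only because fail == 0 */
	printf("done\n");
	return 0;
}
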
Fixes: b901b252b6cf ("drm/i915/gvt: Add 2M huge gtt support") Signed-off-by: Zheng Wang Reviewed-by: Zhenyu Wang Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20221229165641.1192455-1-zyytlz.wz@163.com --- drivers/gpu/drm/i915/gvt/gtt.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index 6b4039010cae..4ec85308379a 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -1209,10 +1209,8 @@ static int split_2MB_gtt_entry(struct intel_vgpu *vgpu, for_each_shadow_entry(sub_spt, &sub_se, sub_index) { ret = intel_gvt_dma_map_guest_page(vgpu, start_gfn + sub_index, PAGE_SIZE, &dma_addr); - if (ret) { - ppgtt_invalidate_spt(spt); - return ret; - } + if (ret) + goto err; sub_se.val64 = se->val64; /* Copy the PAT field from PDE. */ @@ -1231,6 +1229,17 @@ static int split_2MB_gtt_entry(struct intel_vgpu *vgpu, ops->set_pfn(se, sub_spt->shadow_page.mfn); ppgtt_set_shadow_entry(spt, se, index); return 0; +err: + /* Cancel the existing addess mappings of DMA addr. */ + for_each_present_shadow_entry(sub_spt, &sub_se, sub_index) { + gvt_vdbg_mm("invalidate 4K entry\n"); + ppgtt_invalidate_pte(sub_spt, &sub_se); + } + /* Release the new allocated spt. */ + trace_spt_change(sub_spt->vgpu->id, "release", sub_spt, + sub_spt->guest_page.gfn, sub_spt->shadow_page.type); + ppgtt_free_spt(sub_spt); + return ret; } static int split_64KB_gtt_entry(struct intel_vgpu *vgpu, -- cgit From 613b14884b8595e20b9fac4126bf627313827fbe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Jan 2023 08:51:19 -0700 Subject: block: handle bio_split_to_limits() NULL return This can't happen right now, but in preparation for allowing bio_split_to_limits() returning NULL if it ended the bio, check for it in all the callers. 
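Summarizing the caller-side contract as a hedged sketch (the submit function below is hypothetical; the real call sites are in the diff): once bio_split_to_limits() may end the bio itself and return NULL, every caller has to bail out instead of touching the returned pointer.

	/* Illustrative only; mirrors the pattern applied to each caller. */
	static void example_submit_bio(struct bio *bio)
	{
		bio = bio_split_to_limits(bio);
		if (!bio)	/* the bio was already ended during the split */
			return;

		/* ... continue submitting the (possibly split) bio ... */
	}
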
Signed-off-by: Jens Axboe --- block/blk-merge.c | 4 +++- block/blk-mq.c | 5 ++++- drivers/block/drbd/drbd_req.c | 2 ++ drivers/block/ps3vram.c | 2 ++ drivers/md/dm.c | 2 ++ drivers/md/md.c | 2 ++ drivers/nvme/host/multipath.c | 2 ++ drivers/s390/block/dcssblk.c | 2 ++ 8 files changed, 19 insertions(+), 2 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 35a8f75cc45d..071c5f8cf0cf 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -358,11 +358,13 @@ struct bio *__bio_split_to_limits(struct bio *bio, default: split = bio_split_rw(bio, lim, nr_segs, bs, get_max_io_size(bio, lim) << SECTOR_SHIFT); + if (IS_ERR(split)) + return NULL; break; } if (split) { - /* there isn't chance to merge the splitted bio */ + /* there isn't chance to merge the split bio */ split->bi_opf |= REQ_NOMERGE; blkcg_bio_issue_init(split); diff --git a/block/blk-mq.c b/block/blk-mq.c index c5cf0dbca1db..2c49b4151da1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2951,8 +2951,11 @@ void blk_mq_submit_bio(struct bio *bio) blk_status_t ret; bio = blk_queue_bounce(bio, q); - if (bio_may_exceed_limits(bio, &q->limits)) + if (bio_may_exceed_limits(bio, &q->limits)) { bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + return; + } if (!bio_integrity_prep(bio)) return; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index eb14ec8ec04c..e36216d50753 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1607,6 +1607,8 @@ void drbd_submit_bio(struct bio *bio) struct drbd_device *device = bio->bi_bdev->bd_disk->private_data; bio = bio_split_to_limits(bio); + if (!bio) + return; /* * what we "blindly" assume: diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index c76e0148eada..574e470b220b 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -587,6 +587,8 @@ static void ps3vram_submit_bio(struct bio *bio) dev_dbg(&dev->core, "%s\n", __func__); bio = bio_split_to_limits(bio); + if (!bio) + return; spin_lock_irq(&priv->lock); busy = !bio_list_empty(&priv->list); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e1ea3a7bd9d9..b424a6ee27ba 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1742,6 +1742,8 @@ static void dm_split_and_process_bio(struct mapped_device *md, * otherwise associated queue_limits won't be imposed. */ bio = bio_split_to_limits(bio); + if (!bio) + return; } init_clone_info(&ci, md, map, bio, is_abnormal); diff --git a/drivers/md/md.c b/drivers/md/md.c index 775f1dde190a..8af639296b3c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -455,6 +455,8 @@ static void md_submit_bio(struct bio *bio) } bio = bio_split_to_limits(bio); + if (!bio) + return; if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index c03093b6813c..fc39d01e7b63 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -376,6 +376,8 @@ static void nvme_ns_head_submit_bio(struct bio *bio) * pool from the original queue to allocate the bvecs from. 
*/ bio = bio_split_to_limits(bio); + if (!bio) + return; srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index b392b9f5482e..c0f85ffb2b62 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -865,6 +865,8 @@ dcssblk_submit_bio(struct bio *bio) unsigned long bytes_done; bio = bio_split_to_limits(bio); + if (!bio) + return; bytes_done = 0; dev_info = bio->bi_bdev->bd_disk->private_data; -- cgit From 481028dbf1daa2808e1be06f6a865b5fe5939efc Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Wed, 4 Jan 2023 11:34:14 -0800 Subject: perf tools: Fix build on uClibc systems by adding missing sys/types.h include Not all libc implementations define ssize_t as part of stdio.h like glibc does since the standard only requires this type to be defined by unistd.h and sys/types.h. For this reason the perf build is currently broken for toolchains based on uClibc, for instance. Include sys/types.h explicitly to fix that. Committer notes: In addition, in the past this worked in uClibc test systems as there was another way to get to sys/types.h that got removed in that cset: tools/perf/util/trace-event.h /usr/include/traceevent/event_parse.h # This got removed from util/trace-event.h in 378ef0f5d9d7f465 /usr/include/regex.h /usr/include/sys/types.h typedef __ssize_t ssize_t; So the size_t that is used in tools/perf/util/trace-event.h was being obtained indirectly, by chance. Fixes: 378ef0f5d9d7f465 ("perf build: Use libtraceevent from the system") Signed-off-by: Jesus Sanchez-Palencia Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Link: http://lore.kernel.org/lkml/20230104193414.606905-1-jesussanp@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/trace-event.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index add6c5d9531c..9b3cd79cca12 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -4,6 +4,7 @@ #include #include +#include #include struct evlist; -- cgit From f52853a668bfeddd79f319d536a506f68cc2b478 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 4 Jan 2023 22:58:30 +0800 Subject: perf/x86/rapl: Add support for Intel Meteor Lake Meteor Lake RAPL support is the same as previous Sky Lake. Add Meteor Lake model for RAPL. Signed-off-by: Zhang Rui Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230104145831.25498-1-rui.zhang@intel.com --- arch/x86/events/rapl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index ae5779ea4417..589c6885560d 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -809,6 +809,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); -- cgit From 57512b57dcfaf63c52d8ad2fb35321328cde31b0 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 4 Jan 2023 22:58:31 +0800 Subject: perf/x86/rapl: Add support for Intel Emerald Rapids Emerald Rapids RAPL support is the same as previous Sapphire Rapids. Add Emerald Rapids model for RAPL. 
Signed-off-by: Zhang Rui Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230104145831.25498-2-rui.zhang@intel.com --- arch/x86/events/rapl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 589c6885560d..52e6e7ed4f78 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -806,6 +806,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr), + X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &model_spr), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl), -- cgit From 9cea62b2cbabff8ed46f2df17778b624ad9dd25a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Jan 2023 08:52:06 -0700 Subject: block: don't allow splitting of a REQ_NOWAIT bio If we split a bio marked with REQ_NOWAIT, then we can trigger spurious EAGAIN if constituent parts of that split bio end up failing request allocations. Parts will complete just fine, but just a single failure in one of the chained bios will yield an EAGAIN final result for the parent bio. Return EAGAIN early if we end up needing to split such a bio, which allows for saner recovery handling. Cc: stable@vger.kernel.org # 5.15+ Link: https://github.com/axboe/liburing/issues/766 Reported-by: Michael Kelley Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-merge.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 071c5f8cf0cf..b7c193d67185 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -309,6 +309,16 @@ static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, *segs = nsegs; return NULL; split: + /* + * We can't sanely support splitting for a REQ_NOWAIT bio. End it + * with EAGAIN if splitting is required and return an error pointer. + */ + if (bio->bi_opf & REQ_NOWAIT) { + bio->bi_status = BLK_STS_AGAIN; + bio_endio(bio); + return ERR_PTR(-EAGAIN); + } + *segs = nsegs; /* -- cgit From fa8e442e832a3647cdd90f3e606c473a51bc1b26 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 4 Jan 2023 21:32:35 +0800 Subject: ublk: honor IO_URING_F_NONBLOCK for handling control command Most of control command handlers may sleep, so return -EAGAIN in case of IO_URING_F_NONBLOCK to defer the handling into io wq context. 
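For illustration, a hedged sketch of the convention this fix relies on (the handler name is hypothetical): a ->uring_cmd() handler that might sleep returns -EAGAIN when invoked with IO_URING_F_NONBLOCK, and io_uring then reissues the command from io-wq worker context, where blocking is allowed.

	/* Sketch only, not the ublk code. */
	static int example_uring_cmd(struct io_uring_cmd *cmd,
				     unsigned int issue_flags)
	{
		if (issue_flags & IO_URING_F_NONBLOCK)
			return -EAGAIN;	/* punted to io-wq, retried without NONBLOCK */

		/* From here on it is fine to take mutexes, allocate with
		 * GFP_KERNEL, wait for completions, and so on. */
		return 0;
	}
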
Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Reported-by: Jens Axboe Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230104133235.836536-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index e9de9d846b73..17b677b5d3b2 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1992,6 +1992,9 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; int ret = -EINVAL; + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + ublk_ctrl_cmd_dump(cmd); if (!(issue_flags & IO_URING_F_SQE128)) -- cgit From 59b745bb4e0bd445366c45b8df6b51b69134f4f5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Jan 2023 13:49:54 -0700 Subject: io_uring: move 'poll_multi_queue' bool in io_ring_ctx The cacheline section holding this variable has two gaps, where one is caused by this bool not packing well with structs. This causes it to blow into the next cacheline. Move the variable, shrinking io_ring_ctx by a full cacheline in size. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index dcd8a563ab52..128a67a40065 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -292,6 +292,8 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; + bool poll_multi_queue; + /* * ->iopoll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. @@ -300,7 +302,6 @@ struct io_ring_ctx { */ struct io_wq_work_list iopoll_list; struct io_hash_table cancel_table; - bool poll_multi_queue; struct llist_head work_llist; -- cgit From ee4b4e2248565babfba807d82c0f3e00c392a4c0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Jan 2023 14:43:27 -0700 Subject: Revert "block: bio_copy_data_iter" This reverts commit db1c7d77976775483a8ef240b4c705f113e13ea1. We're reinstating the pktcdvd driver, which needs this API. 
Signed-off-by: Jens Axboe --- block/bio.c | 37 ++++++++++++++++++++++--------------- include/linux/bio.h | 2 ++ 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/block/bio.c b/block/bio.c index 5f96fcae3f75..ab59a491a883 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1401,6 +1401,27 @@ void __bio_advance(struct bio *bio, unsigned bytes) } EXPORT_SYMBOL(__bio_advance); +void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + struct bio *src, struct bvec_iter *src_iter) +{ + while (src_iter->bi_size && dst_iter->bi_size) { + struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); + struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); + unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); + void *src_buf = bvec_kmap_local(&src_bv); + void *dst_buf = bvec_kmap_local(&dst_bv); + + memcpy(dst_buf, src_buf, bytes); + + kunmap_local(dst_buf); + kunmap_local(src_buf); + + bio_advance_iter_single(src, src_iter, bytes); + bio_advance_iter_single(dst, dst_iter, bytes); + } +} +EXPORT_SYMBOL(bio_copy_data_iter); + /** * bio_copy_data - copy contents of data buffers from one bio to another * @src: source bio @@ -1414,21 +1435,7 @@ void bio_copy_data(struct bio *dst, struct bio *src) struct bvec_iter src_iter = src->bi_iter; struct bvec_iter dst_iter = dst->bi_iter; - while (src_iter.bi_size && dst_iter.bi_size) { - struct bio_vec src_bv = bio_iter_iovec(src, src_iter); - struct bio_vec dst_bv = bio_iter_iovec(dst, dst_iter); - unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); - void *src_buf = bvec_kmap_local(&src_bv); - void *dst_buf = bvec_kmap_local(&dst_bv); - - memcpy(dst_buf, src_buf, bytes); - - kunmap_local(dst_buf); - kunmap_local(src_buf); - - bio_advance_iter_single(src, &src_iter, bytes); - bio_advance_iter_single(dst, &dst_iter, bytes); - } + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } EXPORT_SYMBOL(bio_copy_data); diff --git a/include/linux/bio.h b/include/linux/bio.h index 22078a28d7cb..c1da63f6c808 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -475,6 +475,8 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); +extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); -- cgit From 050a4f341f35bf51db321c7f68700f9e0b1a7552 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Jan 2023 14:44:02 -0700 Subject: Revert "block: remove devnode callback from struct block_device_operations" This reverts commit 85d6ce58e493ac8b7122e2fbe3f41b94d6ebdc11. We're reinstating the pktcdvd driver, which needs this API. 
Signed-off-by: Jens Axboe --- block/genhd.c | 11 +++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 12 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index 08f76135a637..14329dc278b2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1201,10 +1201,21 @@ struct class block_class = { .dev_uevent = block_uevent, }; +static char *block_devnode(struct device *dev, umode_t *mode, + kuid_t *uid, kgid_t *gid) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (disk->fops->devnode) + return disk->fops->devnode(disk, mode); + return NULL; +} + const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, + .devnode = block_devnode, }; #ifdef CONFIG_PROC_FS diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 301cf1cf4f2f..43d4e073b111 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1395,6 +1395,7 @@ struct block_device_operations { void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); + char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], enum blk_unique_id id_type); -- cgit From 4b83e99ee7092df37a5cf292fde976ebc475ea63 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Jan 2023 14:44:13 -0700 Subject: Revert "pktcdvd: remove driver." MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f40eb99897af665f11858dd7b56edcb62c3f3c67. There are apparently still users out there of this driver. While we'd love to remove it to ease the maintenance burden, let's reinstate it for now until better (userspace) solutions can be developed. Link: https://lore.kernel.org/lkml/20230104190115.ceglfefco475ev6c@pali/ Reported-by: Pali Rohár Signed-off-by: Jens Axboe --- Documentation/ABI/testing/debugfs-pktcdvd | 18 + Documentation/ABI/testing/sysfs-class-pktcdvd | 97 + MAINTAINERS | 7 + drivers/block/Kconfig | 43 + drivers/block/Makefile | 1 + drivers/block/pktcdvd.c | 2944 +++++++++++++++++++++++++ include/linux/pktcdvd.h | 197 ++ include/uapi/linux/pktcdvd.h | 112 + 8 files changed, 3419 insertions(+) create mode 100644 Documentation/ABI/testing/debugfs-pktcdvd create mode 100644 Documentation/ABI/testing/sysfs-class-pktcdvd create mode 100644 drivers/block/pktcdvd.c create mode 100644 include/linux/pktcdvd.h create mode 100644 include/uapi/linux/pktcdvd.h diff --git a/Documentation/ABI/testing/debugfs-pktcdvd b/Documentation/ABI/testing/debugfs-pktcdvd new file mode 100644 index 000000000000..f6f65a4faea0 --- /dev/null +++ b/Documentation/ABI/testing/debugfs-pktcdvd @@ -0,0 +1,18 @@ +What: /sys/kernel/debug/pktcdvd/pktcdvd[0-7] +Date: Oct. 2006 +KernelVersion: 2.6.20 +Contact: Thomas Maier +Description: + +The pktcdvd module (packet writing driver) creates +these files in debugfs: + +/sys/kernel/debug/pktcdvd/pktcdvd[0-7]/ + + ==== ====== ==================================== + info 0444 Lots of driver statistics and infos. 
+ ==== ====== ==================================== + +Example:: + + cat /sys/kernel/debug/pktcdvd/pktcdvd0/info diff --git a/Documentation/ABI/testing/sysfs-class-pktcdvd b/Documentation/ABI/testing/sysfs-class-pktcdvd new file mode 100644 index 000000000000..ba1ce626591d --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-pktcdvd @@ -0,0 +1,97 @@ +sysfs interface +--------------- +The pktcdvd module (packet writing driver) creates the following files in the +sysfs: ( is in the format major:minor) + +What: /sys/class/pktcdvd/add +What: /sys/class/pktcdvd/remove +What: /sys/class/pktcdvd/device_map +Date: Oct. 2006 +KernelVersion: 2.6.20 +Contact: Thomas Maier +Description: + + ========== ============================================== + add (WO) Write a block device id (major:minor) to + create a new pktcdvd device and map it to the + block device. + + remove (WO) Write the pktcdvd device id (major:minor) + to remove the pktcdvd device. + + device_map (RO) Shows the device mapping in format: + pktcdvd[0-7] + ========== ============================================== + + +What: /sys/class/pktcdvd/pktcdvd[0-7]/dev +What: /sys/class/pktcdvd/pktcdvd[0-7]/uevent +Date: Oct. 2006 +KernelVersion: 2.6.20 +Contact: Thomas Maier +Description: + dev: (RO) Device id + + uevent: (WO) To send a uevent + + +What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_started +What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_finished +What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_written +What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read +What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read_gather +What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/reset +Date: Oct. 2006 +KernelVersion: 2.6.20 +Contact: Thomas Maier +Description: + packets_started: (RO) Number of started packets. + + packets_finished: (RO) Number of finished packets. + + kb_written: (RO) kBytes written. + + kb_read: (RO) kBytes read. + + kb_read_gather: (RO) kBytes read to fill write packets. + + reset: (WO) Write any value to it to reset + pktcdvd device statistic values, like + bytes read/written. + + +What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/size +What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_off +What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_on +Date: Oct. 2006 +KernelVersion: 2.6.20 +Contact: Thomas Maier +Description: + ============== ================================================ + size (RO) Contains the size of the bio write queue. + + congestion_off (RW) If bio write queue size is below this mark, + accept new bio requests from the block layer. + + congestion_on (RW) If bio write queue size is higher as this + mark, do no longer accept bio write requests + from the block layer and wait till the pktcdvd + device has processed enough bio's so that bio + write queue size is below congestion off mark. + A value of <= 0 disables congestion control. 
+ ============== ================================================ + + +Example: +-------- +To use the pktcdvd sysfs interface directly, you can do:: + + # create a new pktcdvd device mapped to /dev/hdc + echo "22:0" >/sys/class/pktcdvd/add + cat /sys/class/pktcdvd/device_map + # assuming device pktcdvd0 was created, look at stat's + cat /sys/class/pktcdvd/pktcdvd0/stat/kb_written + # print the device id of the mapped block device + fgrep pktcdvd0 /sys/class/pktcdvd/device_map + # remove device, using pktcdvd0 device id 253:0 + echo "253:0" >/sys/class/pktcdvd/remove diff --git a/MAINTAINERS b/MAINTAINERS index d53b3a6cdc67..3ef137fea4f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16520,6 +16520,13 @@ S: Supported F: Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml F: drivers/input/keyboard/pinephone-keyboard.c +PKTCDVD DRIVER +M: linux-block@vger.kernel.org +S: Orphan +F: drivers/block/pktcdvd.c +F: include/linux/pktcdvd.h +F: include/uapi/linux/pktcdvd.h + PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER M: Tomasz Duszynski S: Maintained diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index a2184b428493..a41145d52de9 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -285,6 +285,49 @@ config BLK_DEV_RAM_SIZE The default value is 4096 kilobytes. Only change this if you know what you are doing. +config CDROM_PKTCDVD + tristate "Packet writing on CD/DVD media (DEPRECATED)" + depends on !UML + depends on SCSI + select CDROM + help + Note: This driver is deprecated and will be removed from the + kernel in the near future! + + If you have a CDROM/DVD drive that supports packet writing, say + Y to include support. It should work with any MMC/Mt Fuji + compliant ATAPI or SCSI drive, which is just about any newer + DVD/CD writer. + + Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs + is possible. + DVD-RW disks must be in restricted overwrite mode. + + See the file + for further information on the use of this driver. + + To compile this driver as a module, choose M here: the + module will be called pktcdvd. + +config CDROM_PKTCDVD_BUFFERS + int "Free buffers for data gathering" + depends on CDROM_PKTCDVD + default "8" + help + This controls the maximum number of active concurrent packets. More + concurrent packets can increase write performance, but also require + more memory. Each concurrent packet will require approximately 64Kb + of non-swappable kernel memory, memory which will be allocated when + a disc is opened for writing. + +config CDROM_PKTCDVD_WCACHE + bool "Enable write caching" + depends on CDROM_PKTCDVD + help + If enabled, write caching will be set for the CD-R/W device. For now + this option is dangerous unless the CD-RW media is known good, as we + don't do deferred write error handling yet. 
+ config ATA_OVER_ETH tristate "ATA over Ethernet support" depends on NET diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 962ee65d8ca3..101612cba303 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_N64CART) += n64cart.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o +obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c new file mode 100644 index 000000000000..4cea3b08087e --- /dev/null +++ b/drivers/block/pktcdvd.c @@ -0,0 +1,2944 @@ +/* + * Copyright (C) 2000 Jens Axboe + * Copyright (C) 2001-2004 Peter Osterlund + * Copyright (C) 2006 Thomas Maier + * + * May be copied or modified under the terms of the GNU General Public + * License. See linux/COPYING for more information. + * + * Packet writing layer for ATAPI and SCSI CD-RW, DVD+RW, DVD-RW and + * DVD-RAM devices. + * + * Theory of operation: + * + * At the lowest level, there is the standard driver for the CD/DVD device, + * such as drivers/scsi/sr.c. This driver can handle read and write requests, + * but it doesn't know anything about the special restrictions that apply to + * packet writing. One restriction is that write requests must be aligned to + * packet boundaries on the physical media, and the size of a write request + * must be equal to the packet size. Another restriction is that a + * GPCMD_FLUSH_CACHE command has to be issued to the drive before a read + * command, if the previous command was a write. + * + * The purpose of the packet writing driver is to hide these restrictions from + * higher layers, such as file systems, and present a block device that can be + * randomly read and written using 2kB-sized blocks. + * + * The lowest layer in the packet writing driver is the packet I/O scheduler. + * Its data is defined by the struct packet_iosched and includes two bio + * queues with pending read and write requests. These queues are processed + * by the pkt_iosched_process_queue() function. The write requests in this + * queue are already properly aligned and sized. This layer is responsible for + * issuing the flush cache commands and scheduling the I/O in a good order. + * + * The next layer transforms unaligned write requests to aligned writes. This + * transformation requires reading missing pieces of data from the underlying + * block device, assembling the pieces to full packets and queuing them to the + * packet I/O scheduler. + * + * At the top layer there is a custom ->submit_bio function that forwards + * read requests directly to the iosched queue and puts write requests in the + * unaligned write queue. A kernel thread performs the necessary read + * gathering to convert the unaligned writes to aligned writes and then feeds + * them to the packet I/O scheduler. + * + *************************************************************************/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_NAME "pktcdvd" + +#define pkt_err(pd, fmt, ...) \ + pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) +#define pkt_notice(pd, fmt, ...) \ + pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__) +#define pkt_info(pd, fmt, ...) 
\ + pr_info("%s: " fmt, pd->name, ##__VA_ARGS__) + +#define pkt_dbg(level, pd, fmt, ...) \ +do { \ + if (level == 2 && PACKET_DEBUG >= 2) \ + pr_notice("%s: %s():" fmt, \ + pd->name, __func__, ##__VA_ARGS__); \ + else if (level == 1 && PACKET_DEBUG >= 1) \ + pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__); \ +} while (0) + +#define MAX_SPEED 0xffff + +static DEFINE_MUTEX(pktcdvd_mutex); +static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; +static struct proc_dir_entry *pkt_proc; +static int pktdev_major; +static int write_congestion_on = PKT_WRITE_CONGESTION_ON; +static int write_congestion_off = PKT_WRITE_CONGESTION_OFF; +static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */ +static mempool_t psd_pool; +static struct bio_set pkt_bio_set; + +static struct class *class_pktcdvd = NULL; /* /sys/class/pktcdvd */ +static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */ + +/* forward declaration */ +static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev); +static int pkt_remove_dev(dev_t pkt_dev); +static int pkt_seq_show(struct seq_file *m, void *p); + +static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd) +{ + return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1); +} + +/********************************************************** + * sysfs interface for pktcdvd + * by (C) 2006 Thomas Maier + + /sys/class/pktcdvd/pktcdvd[0-7]/ + stat/reset + stat/packets_started + stat/packets_finished + stat/kb_written + stat/kb_read + stat/kb_read_gather + write_queue/size + write_queue/congestion_off + write_queue/congestion_on + **********************************************************/ + +static ssize_t packets_started_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%lu\n", pd->stats.pkt_started); +} +static DEVICE_ATTR_RO(packets_started); + +static ssize_t packets_finished_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%lu\n", pd->stats.pkt_ended); +} +static DEVICE_ATTR_RO(packets_finished); + +static ssize_t kb_written_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%lu\n", pd->stats.secs_w >> 1); +} +static DEVICE_ATTR_RO(kb_written); + +static ssize_t kb_read_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%lu\n", pd->stats.secs_r >> 1); +} +static DEVICE_ATTR_RO(kb_read); + +static ssize_t kb_read_gather_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%lu\n", pd->stats.secs_rg >> 1); +} +static DEVICE_ATTR_RO(kb_read_gather); + +static ssize_t reset_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + + if (len > 0) { + pd->stats.pkt_started = 0; + pd->stats.pkt_ended = 0; + pd->stats.secs_w = 0; + pd->stats.secs_rg = 0; + pd->stats.secs_r = 0; + } + return len; +} +static DEVICE_ATTR_WO(reset); + +static struct attribute *pkt_stat_attrs[] = { + &dev_attr_packets_finished.attr, + &dev_attr_packets_started.attr, + &dev_attr_kb_read.attr, + &dev_attr_kb_written.attr, + &dev_attr_kb_read_gather.attr, + &dev_attr_reset.attr, + NULL, +}; + 
+static const struct attribute_group pkt_stat_group = { + .name = "stat", + .attrs = pkt_stat_attrs, +}; + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int n; + + spin_lock(&pd->lock); + n = sysfs_emit(buf, "%d\n", pd->bio_queue_size); + spin_unlock(&pd->lock); + return n; +} +static DEVICE_ATTR_RO(size); + +static void init_write_congestion_marks(int* lo, int* hi) +{ + if (*hi > 0) { + *hi = max(*hi, 500); + *hi = min(*hi, 1000000); + if (*lo <= 0) + *lo = *hi - 100; + else { + *lo = min(*lo, *hi - 100); + *lo = max(*lo, 100); + } + } else { + *hi = -1; + *lo = -1; + } +} + +static ssize_t congestion_off_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int n; + + spin_lock(&pd->lock); + n = sysfs_emit(buf, "%d\n", pd->write_congestion_off); + spin_unlock(&pd->lock); + return n; +} + +static ssize_t congestion_off_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int val; + + if (sscanf(buf, "%d", &val) == 1) { + spin_lock(&pd->lock); + pd->write_congestion_off = val; + init_write_congestion_marks(&pd->write_congestion_off, + &pd->write_congestion_on); + spin_unlock(&pd->lock); + } + return len; +} +static DEVICE_ATTR_RW(congestion_off); + +static ssize_t congestion_on_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int n; + + spin_lock(&pd->lock); + n = sysfs_emit(buf, "%d\n", pd->write_congestion_on); + spin_unlock(&pd->lock); + return n; +} + +static ssize_t congestion_on_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct pktcdvd_device *pd = dev_get_drvdata(dev); + int val; + + if (sscanf(buf, "%d", &val) == 1) { + spin_lock(&pd->lock); + pd->write_congestion_on = val; + init_write_congestion_marks(&pd->write_congestion_off, + &pd->write_congestion_on); + spin_unlock(&pd->lock); + } + return len; +} +static DEVICE_ATTR_RW(congestion_on); + +static struct attribute *pkt_wq_attrs[] = { + &dev_attr_congestion_on.attr, + &dev_attr_congestion_off.attr, + &dev_attr_size.attr, + NULL, +}; + +static const struct attribute_group pkt_wq_group = { + .name = "write_queue", + .attrs = pkt_wq_attrs, +}; + +static const struct attribute_group *pkt_groups[] = { + &pkt_stat_group, + &pkt_wq_group, + NULL, +}; + +static void pkt_sysfs_dev_new(struct pktcdvd_device *pd) +{ + if (class_pktcdvd) { + pd->dev = device_create_with_groups(class_pktcdvd, NULL, + MKDEV(0, 0), pd, pkt_groups, + "%s", pd->name); + if (IS_ERR(pd->dev)) + pd->dev = NULL; + } +} + +static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd) +{ + if (class_pktcdvd) + device_unregister(pd->dev); +} + + +/******************************************************************** + /sys/class/pktcdvd/ + add map block device + remove unmap packet dev + device_map show mappings + *******************************************************************/ + +static void class_pktcdvd_release(struct class *cls) +{ + kfree(cls); +} + +static ssize_t device_map_show(struct class *c, struct class_attribute *attr, + char *data) +{ + int n = 0; + int idx; + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + for (idx = 0; idx < MAX_WRITERS; idx++) { + struct pktcdvd_device *pd = pkt_devs[idx]; + if (!pd) + continue; + n += sprintf(data+n, "%s %u:%u %u:%u\n", 
+ pd->name, + MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev), + MAJOR(pd->bdev->bd_dev), + MINOR(pd->bdev->bd_dev)); + } + mutex_unlock(&ctl_mutex); + return n; +} +static CLASS_ATTR_RO(device_map); + +static ssize_t add_store(struct class *c, struct class_attribute *attr, + const char *buf, size_t count) +{ + unsigned int major, minor; + + if (sscanf(buf, "%u:%u", &major, &minor) == 2) { + /* pkt_setup_dev() expects caller to hold reference to self */ + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + pkt_setup_dev(MKDEV(major, minor), NULL); + + module_put(THIS_MODULE); + + return count; + } + + return -EINVAL; +} +static CLASS_ATTR_WO(add); + +static ssize_t remove_store(struct class *c, struct class_attribute *attr, + const char *buf, size_t count) +{ + unsigned int major, minor; + if (sscanf(buf, "%u:%u", &major, &minor) == 2) { + pkt_remove_dev(MKDEV(major, minor)); + return count; + } + return -EINVAL; +} +static CLASS_ATTR_WO(remove); + +static struct attribute *class_pktcdvd_attrs[] = { + &class_attr_add.attr, + &class_attr_remove.attr, + &class_attr_device_map.attr, + NULL, +}; +ATTRIBUTE_GROUPS(class_pktcdvd); + +static int pkt_sysfs_init(void) +{ + int ret = 0; + + /* + * create control files in sysfs + * /sys/class/pktcdvd/... + */ + class_pktcdvd = kzalloc(sizeof(*class_pktcdvd), GFP_KERNEL); + if (!class_pktcdvd) + return -ENOMEM; + class_pktcdvd->name = DRIVER_NAME; + class_pktcdvd->owner = THIS_MODULE; + class_pktcdvd->class_release = class_pktcdvd_release; + class_pktcdvd->class_groups = class_pktcdvd_groups; + ret = class_register(class_pktcdvd); + if (ret) { + kfree(class_pktcdvd); + class_pktcdvd = NULL; + pr_err("failed to create class pktcdvd\n"); + return ret; + } + return 0; +} + +static void pkt_sysfs_cleanup(void) +{ + if (class_pktcdvd) + class_destroy(class_pktcdvd); + class_pktcdvd = NULL; +} + +/******************************************************************** + entries in debugfs + + /sys/kernel/debug/pktcdvd[0-7]/ + info + + *******************************************************************/ + +static int pkt_debugfs_seq_show(struct seq_file *m, void *p) +{ + return pkt_seq_show(m, p); +} + +static int pkt_debugfs_fops_open(struct inode *inode, struct file *file) +{ + return single_open(file, pkt_debugfs_seq_show, inode->i_private); +} + +static const struct file_operations debug_fops = { + .open = pkt_debugfs_fops_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + +static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) +{ + if (!pkt_debugfs_root) + return; + pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root); + if (!pd->dfs_d_root) + return; + + pd->dfs_f_info = debugfs_create_file("info", 0444, + pd->dfs_d_root, pd, &debug_fops); +} + +static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd) +{ + if (!pkt_debugfs_root) + return; + debugfs_remove(pd->dfs_f_info); + debugfs_remove(pd->dfs_d_root); + pd->dfs_f_info = NULL; + pd->dfs_d_root = NULL; +} + +static void pkt_debugfs_init(void) +{ + pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL); +} + +static void pkt_debugfs_cleanup(void) +{ + debugfs_remove(pkt_debugfs_root); + pkt_debugfs_root = NULL; +} + +/* ----------------------------------------------------------*/ + + +static void pkt_bio_finished(struct pktcdvd_device *pd) +{ + BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); + if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { + pkt_dbg(2, pd, "queue empty\n"); + atomic_set(&pd->iosched.attention, 1); + 
wake_up(&pd->wqueue); + } +} + +/* + * Allocate a packet_data struct + */ +static struct packet_data *pkt_alloc_packet_data(int frames) +{ + int i; + struct packet_data *pkt; + + pkt = kzalloc(sizeof(struct packet_data), GFP_KERNEL); + if (!pkt) + goto no_pkt; + + pkt->frames = frames; + pkt->w_bio = bio_kmalloc(frames, GFP_KERNEL); + if (!pkt->w_bio) + goto no_bio; + + for (i = 0; i < frames / FRAMES_PER_PAGE; i++) { + pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!pkt->pages[i]) + goto no_page; + } + + spin_lock_init(&pkt->lock); + bio_list_init(&pkt->orig_bios); + + for (i = 0; i < frames; i++) { + pkt->r_bios[i] = bio_kmalloc(1, GFP_KERNEL); + if (!pkt->r_bios[i]) + goto no_rd_bio; + } + + return pkt; + +no_rd_bio: + for (i = 0; i < frames; i++) + kfree(pkt->r_bios[i]); +no_page: + for (i = 0; i < frames / FRAMES_PER_PAGE; i++) + if (pkt->pages[i]) + __free_page(pkt->pages[i]); + kfree(pkt->w_bio); +no_bio: + kfree(pkt); +no_pkt: + return NULL; +} + +/* + * Free a packet_data struct + */ +static void pkt_free_packet_data(struct packet_data *pkt) +{ + int i; + + for (i = 0; i < pkt->frames; i++) + kfree(pkt->r_bios[i]); + for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++) + __free_page(pkt->pages[i]); + kfree(pkt->w_bio); + kfree(pkt); +} + +static void pkt_shrink_pktlist(struct pktcdvd_device *pd) +{ + struct packet_data *pkt, *next; + + BUG_ON(!list_empty(&pd->cdrw.pkt_active_list)); + + list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) { + pkt_free_packet_data(pkt); + } + INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); +} + +static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets) +{ + struct packet_data *pkt; + + BUG_ON(!list_empty(&pd->cdrw.pkt_free_list)); + + while (nr_packets > 0) { + pkt = pkt_alloc_packet_data(pd->settings.size >> 2); + if (!pkt) { + pkt_shrink_pktlist(pd); + return 0; + } + pkt->id = nr_packets; + pkt->pd = pd; + list_add(&pkt->list, &pd->cdrw.pkt_free_list); + nr_packets--; + } + return 1; +} + +static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node) +{ + struct rb_node *n = rb_next(&node->rb_node); + if (!n) + return NULL; + return rb_entry(n, struct pkt_rb_node, rb_node); +} + +static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node) +{ + rb_erase(&node->rb_node, &pd->bio_queue); + mempool_free(node, &pd->rb_pool); + pd->bio_queue_size--; + BUG_ON(pd->bio_queue_size < 0); +} + +/* + * Find the first node in the pd->bio_queue rb tree with a starting sector >= s. + */ +static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s) +{ + struct rb_node *n = pd->bio_queue.rb_node; + struct rb_node *next; + struct pkt_rb_node *tmp; + + if (!n) { + BUG_ON(pd->bio_queue_size > 0); + return NULL; + } + + for (;;) { + tmp = rb_entry(n, struct pkt_rb_node, rb_node); + if (s <= tmp->bio->bi_iter.bi_sector) + next = n->rb_left; + else + next = n->rb_right; + if (!next) + break; + n = next; + } + + if (s > tmp->bio->bi_iter.bi_sector) { + tmp = pkt_rbtree_next(tmp); + if (!tmp) + return NULL; + } + BUG_ON(s > tmp->bio->bi_iter.bi_sector); + return tmp; +} + +/* + * Insert a node into the pd->bio_queue rb tree. 
+ */ +static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node) +{ + struct rb_node **p = &pd->bio_queue.rb_node; + struct rb_node *parent = NULL; + sector_t s = node->bio->bi_iter.bi_sector; + struct pkt_rb_node *tmp; + + while (*p) { + parent = *p; + tmp = rb_entry(parent, struct pkt_rb_node, rb_node); + if (s < tmp->bio->bi_iter.bi_sector) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&node->rb_node, parent, p); + rb_insert_color(&node->rb_node, &pd->bio_queue); + pd->bio_queue_size++; +} + +/* + * Send a packet_command to the underlying block device and + * wait for completion. + */ +static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc) +{ + struct request_queue *q = bdev_get_queue(pd->bdev); + struct scsi_cmnd *scmd; + struct request *rq; + int ret = 0; + + rq = scsi_alloc_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? + REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); + if (IS_ERR(rq)) + return PTR_ERR(rq); + scmd = blk_mq_rq_to_pdu(rq); + + if (cgc->buflen) { + ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, + GFP_NOIO); + if (ret) + goto out; + } + + scmd->cmd_len = COMMAND_SIZE(cgc->cmd[0]); + memcpy(scmd->cmnd, cgc->cmd, CDROM_PACKET_SIZE); + + rq->timeout = 60*HZ; + if (cgc->quiet) + rq->rq_flags |= RQF_QUIET; + + blk_execute_rq(rq, false); + if (scmd->result) + ret = -EIO; +out: + blk_mq_free_request(rq); + return ret; +} + +static const char *sense_key_string(__u8 index) +{ + static const char * const info[] = { + "No sense", "Recovered error", "Not ready", + "Medium error", "Hardware error", "Illegal request", + "Unit attention", "Data protect", "Blank check", + }; + + return index < ARRAY_SIZE(info) ? info[index] : "INVALID"; +} + +/* + * A generic sense dump / resolve mechanism should be implemented across + * all ATAPI + SCSI devices. + */ +static void pkt_dump_sense(struct pktcdvd_device *pd, + struct packet_command *cgc) +{ + struct scsi_sense_hdr *sshdr = cgc->sshdr; + + if (sshdr) + pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", + CDROM_PACKET_SIZE, cgc->cmd, + sshdr->sense_key, sshdr->asc, sshdr->ascq, + sense_key_string(sshdr->sense_key)); + else + pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); +} + +/* + * flush the drive cache to media + */ +static int pkt_flush_cache(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.cmd[0] = GPCMD_FLUSH_CACHE; + cgc.quiet = 1; + + /* + * the IMMED bit -- we default to not setting it, although that + * would allow a much faster close, this is safer + */ +#if 0 + cgc.cmd[1] = 1 << 1; +#endif + return pkt_generic_packet(pd, &cgc); +} + +/* + * speed is given as the normal factor, e.g. 4 for 4x + */ +static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, + unsigned write_speed, unsigned read_speed) +{ + struct packet_command cgc; + struct scsi_sense_hdr sshdr; + int ret; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.sshdr = &sshdr; + cgc.cmd[0] = GPCMD_SET_SPEED; + cgc.cmd[2] = (read_speed >> 8) & 0xff; + cgc.cmd[3] = read_speed & 0xff; + cgc.cmd[4] = (write_speed >> 8) & 0xff; + cgc.cmd[5] = write_speed & 0xff; + + ret = pkt_generic_packet(pd, &cgc); + if (ret) + pkt_dump_sense(pd, &cgc); + + return ret; +} + +/* + * Queue a bio for processing by the low-level CD device. Must be called + * from process context. 
+ */ +static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) +{ + spin_lock(&pd->iosched.lock); + if (bio_data_dir(bio) == READ) + bio_list_add(&pd->iosched.read_queue, bio); + else + bio_list_add(&pd->iosched.write_queue, bio); + spin_unlock(&pd->iosched.lock); + + atomic_set(&pd->iosched.attention, 1); + wake_up(&pd->wqueue); +} + +/* + * Process the queued read/write requests. This function handles special + * requirements for CDRW drives: + * - A cache flush command must be inserted before a read request if the + * previous request was a write. + * - Switching between reading and writing is slow, so don't do it more often + * than necessary. + * - Optimize for throughput at the expense of latency. This means that streaming + * writes will never be interrupted by a read, but if the drive has to seek + * before the next write, switch to reading instead if there are any pending + * read requests. + * - Set the read speed according to current usage pattern. When only reading + * from the device, it's best to use the highest possible read speed, but + * when switching often between reading and writing, it's better to have the + * same read and write speeds. + */ +static void pkt_iosched_process_queue(struct pktcdvd_device *pd) +{ + + if (atomic_read(&pd->iosched.attention) == 0) + return; + atomic_set(&pd->iosched.attention, 0); + + for (;;) { + struct bio *bio; + int reads_queued, writes_queued; + + spin_lock(&pd->iosched.lock); + reads_queued = !bio_list_empty(&pd->iosched.read_queue); + writes_queued = !bio_list_empty(&pd->iosched.write_queue); + spin_unlock(&pd->iosched.lock); + + if (!reads_queued && !writes_queued) + break; + + if (pd->iosched.writing) { + int need_write_seek = 1; + spin_lock(&pd->iosched.lock); + bio = bio_list_peek(&pd->iosched.write_queue); + spin_unlock(&pd->iosched.lock); + if (bio && (bio->bi_iter.bi_sector == + pd->iosched.last_write)) + need_write_seek = 0; + if (need_write_seek && reads_queued) { + if (atomic_read(&pd->cdrw.pending_bios) > 0) { + pkt_dbg(2, pd, "write, waiting\n"); + break; + } + pkt_flush_cache(pd); + pd->iosched.writing = 0; + } + } else { + if (!reads_queued && writes_queued) { + if (atomic_read(&pd->cdrw.pending_bios) > 0) { + pkt_dbg(2, pd, "read, waiting\n"); + break; + } + pd->iosched.writing = 1; + } + } + + spin_lock(&pd->iosched.lock); + if (pd->iosched.writing) + bio = bio_list_pop(&pd->iosched.write_queue); + else + bio = bio_list_pop(&pd->iosched.read_queue); + spin_unlock(&pd->iosched.lock); + + if (!bio) + continue; + + if (bio_data_dir(bio) == READ) + pd->iosched.successive_reads += + bio->bi_iter.bi_size >> 10; + else { + pd->iosched.successive_reads = 0; + pd->iosched.last_write = bio_end_sector(bio); + } + if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { + if (pd->read_speed == pd->write_speed) { + pd->read_speed = MAX_SPEED; + pkt_set_speed(pd, pd->write_speed, pd->read_speed); + } + } else { + if (pd->read_speed != pd->write_speed) { + pd->read_speed = pd->write_speed; + pkt_set_speed(pd, pd->write_speed, pd->read_speed); + } + } + + atomic_inc(&pd->cdrw.pending_bios); + submit_bio_noacct(bio); + } +} + +/* + * Special care is needed if the underlying block device has a small + * max_phys_segments value. 
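Before the segment-merging helper that follows, a rough userspace illustration of the decision it makes: one bio segment per 2 KiB frame if the queue allows it, otherwise one segment per page with PACKET_MERGE_SEGS set, otherwise give up. The 128-sector packet, 4 KiB page size and max_segments inputs are made-up example values.

#include <stdio.h>

static const char *merge_decision(unsigned packet_sectors, unsigned max_segments)
{
	unsigned bytes = packet_sectors << 9;

	if (bytes / 2048 <= max_segments)   /* one segment per 2 KiB frame */
		return "no merging needed";
	if (bytes / 4096 <= max_segments)   /* one segment per 4 KiB page */
		return "set PACKET_MERGE_SEGS";
	return "queue cannot take a whole packet (-EIO)";
}

int main(void)
{
	printf("max_segments=64: %s\n", merge_decision(128, 64));
	printf("max_segments=20: %s\n", merge_decision(128, 20));
	printf("max_segments=8:  %s\n", merge_decision(128, 8));
	return 0;
}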
+ */ +static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) +{ + if ((pd->settings.size << 9) / CD_FRAMESIZE + <= queue_max_segments(q)) { + /* + * The cdrom device can handle one segment/frame + */ + clear_bit(PACKET_MERGE_SEGS, &pd->flags); + return 0; + } else if ((pd->settings.size << 9) / PAGE_SIZE + <= queue_max_segments(q)) { + /* + * We can handle this case at the expense of some extra memory + * copies during write operations + */ + set_bit(PACKET_MERGE_SEGS, &pd->flags); + return 0; + } else { + pkt_err(pd, "cdrom max_phys_segments too small\n"); + return -EIO; + } +} + +static void pkt_end_io_read(struct bio *bio) +{ + struct packet_data *pkt = bio->bi_private; + struct pktcdvd_device *pd = pkt->pd; + BUG_ON(!pd); + + pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", + bio, (unsigned long long)pkt->sector, + (unsigned long long)bio->bi_iter.bi_sector, bio->bi_status); + + if (bio->bi_status) + atomic_inc(&pkt->io_errors); + bio_uninit(bio); + if (atomic_dec_and_test(&pkt->io_wait)) { + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); + } + pkt_bio_finished(pd); +} + +static void pkt_end_io_packet_write(struct bio *bio) +{ + struct packet_data *pkt = bio->bi_private; + struct pktcdvd_device *pd = pkt->pd; + BUG_ON(!pd); + + pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_status); + + pd->stats.pkt_ended++; + + bio_uninit(bio); + pkt_bio_finished(pd); + atomic_dec(&pkt->io_wait); + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); +} + +/* + * Schedule reads for the holes in a packet + */ +static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + int frames_read = 0; + struct bio *bio; + int f; + char written[PACKET_MAX_SIZE]; + + BUG_ON(bio_list_empty(&pkt->orig_bios)); + + atomic_set(&pkt->io_wait, 0); + atomic_set(&pkt->io_errors, 0); + + /* + * Figure out which frames we need to read before we can write. + */ + memset(written, 0, sizeof(written)); + spin_lock(&pkt->lock); + bio_list_for_each(bio, &pkt->orig_bios) { + int first_frame = (bio->bi_iter.bi_sector - pkt->sector) / + (CD_FRAMESIZE >> 9); + int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE; + pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9); + BUG_ON(first_frame < 0); + BUG_ON(first_frame + num_frames > pkt->frames); + for (f = first_frame; f < first_frame + num_frames; f++) + written[f] = 1; + } + spin_unlock(&pkt->lock); + + if (pkt->cache_valid) { + pkt_dbg(2, pd, "zone %llx cached\n", + (unsigned long long)pkt->sector); + goto out_account; + } + + /* + * Schedule reads for missing parts of the packet. + */ + for (f = 0; f < pkt->frames; f++) { + int p, offset; + + if (written[f]) + continue; + + bio = pkt->r_bios[f]; + bio_init(bio, pd->bdev, bio->bi_inline_vecs, 1, REQ_OP_READ); + bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); + bio->bi_end_io = pkt_end_io_read; + bio->bi_private = pkt; + + p = (f * CD_FRAMESIZE) / PAGE_SIZE; + offset = (f * CD_FRAMESIZE) % PAGE_SIZE; + pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n", + f, pkt->pages[p], offset); + if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) + BUG(); + + atomic_inc(&pkt->io_wait); + pkt_queue_bio(pd, bio); + frames_read++; + } + +out_account: + pkt_dbg(2, pd, "need %d frames for zone %llx\n", + frames_read, (unsigned long long)pkt->sector); + pd->stats.pkt_started++; + pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); +} + +/* + * Find a packet matching zone, or the least recently used packet if + * there is no match. 
+ */ +static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone) +{ + struct packet_data *pkt; + + list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) { + if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) { + list_del_init(&pkt->list); + if (pkt->sector != zone) + pkt->cache_valid = 0; + return pkt; + } + } + BUG(); + return NULL; +} + +static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + if (pkt->cache_valid) { + list_add(&pkt->list, &pd->cdrw.pkt_free_list); + } else { + list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list); + } +} + +static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state) +{ +#if PACKET_DEBUG > 1 + static const char *state_name[] = { + "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" + }; + enum packet_data_state old_state = pkt->state; + pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n", + pkt->id, (unsigned long long)pkt->sector, + state_name[old_state], state_name[state]); +#endif + pkt->state = state; +} + +/* + * Scan the work queue to see if we can start a new packet. + * returns non-zero if any work was done. + */ +static int pkt_handle_queue(struct pktcdvd_device *pd) +{ + struct packet_data *pkt, *p; + struct bio *bio = NULL; + sector_t zone = 0; /* Suppress gcc warning */ + struct pkt_rb_node *node, *first_node; + struct rb_node *n; + + atomic_set(&pd->scan_queue, 0); + + if (list_empty(&pd->cdrw.pkt_free_list)) { + pkt_dbg(2, pd, "no pkt\n"); + return 0; + } + + /* + * Try to find a zone we are not already working on. + */ + spin_lock(&pd->lock); + first_node = pkt_rbtree_find(pd, pd->current_sector); + if (!first_node) { + n = rb_first(&pd->bio_queue); + if (n) + first_node = rb_entry(n, struct pkt_rb_node, rb_node); + } + node = first_node; + while (node) { + bio = node->bio; + zone = get_zone(bio->bi_iter.bi_sector, pd); + list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { + if (p->sector == zone) { + bio = NULL; + goto try_next_bio; + } + } + break; +try_next_bio: + node = pkt_rbtree_next(node); + if (!node) { + n = rb_first(&pd->bio_queue); + if (n) + node = rb_entry(n, struct pkt_rb_node, rb_node); + } + if (node == first_node) + node = NULL; + } + spin_unlock(&pd->lock); + if (!bio) { + pkt_dbg(2, pd, "no bio\n"); + return 0; + } + + pkt = pkt_get_packet_data(pd, zone); + + pd->current_sector = zone + pd->settings.size; + pkt->sector = zone; + BUG_ON(pkt->frames != pd->settings.size >> 2); + pkt->write_size = 0; + + /* + * Scan work queue for bios in the same zone and link them + * to this packet. 
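The zone of a bio is computed by get_zone(), which is defined earlier in this file and not visible in this hunk; conceptually it rounds a sector down to the start of its fixed packet. A minimal sketch of that idea, assuming a power-of-two packet size in 512-byte sectors and ignoring the per-disc write offset the real helper may also fold in:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Assumed behaviour: round a sector down to its packet ("zone") boundary. */
static sector_t zone_of(sector_t sector, sector_t packet_sectors)
{
	return sector & ~(packet_sectors - 1);	/* packet_sectors is a power of two */
}

int main(void)
{
	/* 128-sector (64 KiB) packets: sectors 0..127 share zone 0,
	 * 128..255 share zone 128, and so on. */
	printf("%llu %llu %llu\n",
	       zone_of(5, 128), zone_of(130, 128), zone_of(300, 128));
	return 0;
}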
+ */ + spin_lock(&pd->lock); + pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone); + while ((node = pkt_rbtree_find(pd, zone)) != NULL) { + bio = node->bio; + pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long) + get_zone(bio->bi_iter.bi_sector, pd)); + if (get_zone(bio->bi_iter.bi_sector, pd) != zone) + break; + pkt_rbtree_erase(pd, node); + spin_lock(&pkt->lock); + bio_list_add(&pkt->orig_bios, bio); + pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE; + spin_unlock(&pkt->lock); + } + /* check write congestion marks, and if bio_queue_size is + * below, wake up any waiters + */ + if (pd->congested && + pd->bio_queue_size <= pd->write_congestion_off) { + pd->congested = false; + wake_up_var(&pd->congested); + } + spin_unlock(&pd->lock); + + pkt->sleep_time = max(PACKET_WAIT_TIME, 1); + pkt_set_state(pkt, PACKET_WAITING_STATE); + atomic_set(&pkt->run_sm, 1); + + spin_lock(&pd->cdrw.active_list_lock); + list_add(&pkt->list, &pd->cdrw.pkt_active_list); + spin_unlock(&pd->cdrw.active_list_lock); + + return 1; +} + +/** + * bio_list_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * Stops when it reaches the end of either the @src list or @dst list - that is, + * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of + * bios). + */ +static void bio_list_copy_data(struct bio *dst, struct bio *src) +{ + struct bvec_iter src_iter = src->bi_iter; + struct bvec_iter dst_iter = dst->bi_iter; + + while (1) { + if (!src_iter.bi_size) { + src = src->bi_next; + if (!src) + break; + + src_iter = src->bi_iter; + } + + if (!dst_iter.bi_size) { + dst = dst->bi_next; + if (!dst) + break; + + dst_iter = dst->bi_iter; + } + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } +} + +/* + * Assemble a bio to write one packet and queue the bio for processing + * by the underlying block device. + */ +static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + int f; + + bio_init(pkt->w_bio, pd->bdev, pkt->w_bio->bi_inline_vecs, pkt->frames, + REQ_OP_WRITE); + pkt->w_bio->bi_iter.bi_sector = pkt->sector; + pkt->w_bio->bi_end_io = pkt_end_io_packet_write; + pkt->w_bio->bi_private = pkt; + + /* XXX: locking? */ + for (f = 0; f < pkt->frames; f++) { + struct page *page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE]; + unsigned offset = (f * CD_FRAMESIZE) % PAGE_SIZE; + + if (!bio_add_page(pkt->w_bio, page, CD_FRAMESIZE, offset)) + BUG(); + } + pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt); + + /* + * Fill-in bvec with data from orig_bios. 
+ */ + spin_lock(&pkt->lock); + bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head); + + pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); + spin_unlock(&pkt->lock); + + pkt_dbg(2, pd, "Writing %d frames for zone %llx\n", + pkt->write_size, (unsigned long long)pkt->sector); + + if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) + pkt->cache_valid = 1; + else + pkt->cache_valid = 0; + + /* Start the write request */ + atomic_set(&pkt->io_wait, 1); + pkt_queue_bio(pd, pkt->w_bio); +} + +static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status) +{ + struct bio *bio; + + if (status) + pkt->cache_valid = 0; + + /* Finish all bios corresponding to this packet */ + while ((bio = bio_list_pop(&pkt->orig_bios))) { + bio->bi_status = status; + bio_endio(bio); + } +} + +static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + pkt_dbg(2, pd, "pkt %d\n", pkt->id); + + for (;;) { + switch (pkt->state) { + case PACKET_WAITING_STATE: + if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0)) + return; + + pkt->sleep_time = 0; + pkt_gather_data(pd, pkt); + pkt_set_state(pkt, PACKET_READ_WAIT_STATE); + break; + + case PACKET_READ_WAIT_STATE: + if (atomic_read(&pkt->io_wait) > 0) + return; + + if (atomic_read(&pkt->io_errors) > 0) { + pkt_set_state(pkt, PACKET_RECOVERY_STATE); + } else { + pkt_start_write(pd, pkt); + } + break; + + case PACKET_WRITE_WAIT_STATE: + if (atomic_read(&pkt->io_wait) > 0) + return; + + if (!pkt->w_bio->bi_status) { + pkt_set_state(pkt, PACKET_FINISHED_STATE); + } else { + pkt_set_state(pkt, PACKET_RECOVERY_STATE); + } + break; + + case PACKET_RECOVERY_STATE: + pkt_dbg(2, pd, "No recovery possible\n"); + pkt_set_state(pkt, PACKET_FINISHED_STATE); + break; + + case PACKET_FINISHED_STATE: + pkt_finish_packet(pkt, pkt->w_bio->bi_status); + return; + + default: + BUG(); + break; + } + } +} + +static void pkt_handle_packets(struct pktcdvd_device *pd) +{ + struct packet_data *pkt, *next; + + /* + * Run state machine for active packets + */ + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (atomic_read(&pkt->run_sm) > 0) { + atomic_set(&pkt->run_sm, 0); + pkt_run_state_machine(pd, pkt); + } + } + + /* + * Move no longer active packets to the free list + */ + spin_lock(&pd->cdrw.active_list_lock); + list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) { + if (pkt->state == PACKET_FINISHED_STATE) { + list_del(&pkt->list); + pkt_put_packet_data(pd, pkt); + pkt_set_state(pkt, PACKET_IDLE_STATE); + atomic_set(&pd->scan_queue, 1); + } + } + spin_unlock(&pd->cdrw.active_list_lock); +} + +static void pkt_count_states(struct pktcdvd_device *pd, int *states) +{ + struct packet_data *pkt; + int i; + + for (i = 0; i < PACKET_NUM_STATES; i++) + states[i] = 0; + + spin_lock(&pd->cdrw.active_list_lock); + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + states[pkt->state]++; + } + spin_unlock(&pd->cdrw.active_list_lock); +} + +/* + * kcdrwd is woken up when writes have been queued for one of our + * registered devices + */ +static int kcdrwd(void *foobar) +{ + struct pktcdvd_device *pd = foobar; + struct packet_data *pkt; + long min_sleep_time, residue; + + set_user_nice(current, MIN_NICE); + set_freezable(); + + for (;;) { + DECLARE_WAITQUEUE(wait, current); + + /* + * Wait until there is something to do + */ + add_wait_queue(&pd->wqueue, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + /* Check if we need to run pkt_handle_queue */ + if 
(atomic_read(&pd->scan_queue) > 0) + goto work_to_do; + + /* Check if we need to run the state machine for some packet */ + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (atomic_read(&pkt->run_sm) > 0) + goto work_to_do; + } + + /* Check if we need to process the iosched queues */ + if (atomic_read(&pd->iosched.attention) != 0) + goto work_to_do; + + /* Otherwise, go to sleep */ + if (PACKET_DEBUG > 1) { + int states[PACKET_NUM_STATES]; + pkt_count_states(pd, states); + pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], + states[3], states[4], states[5]); + } + + min_sleep_time = MAX_SCHEDULE_TIMEOUT; + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (pkt->sleep_time && pkt->sleep_time < min_sleep_time) + min_sleep_time = pkt->sleep_time; + } + + pkt_dbg(2, pd, "sleeping\n"); + residue = schedule_timeout(min_sleep_time); + pkt_dbg(2, pd, "wake up\n"); + + /* make swsusp happy with our thread */ + try_to_freeze(); + + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (!pkt->sleep_time) + continue; + pkt->sleep_time -= min_sleep_time - residue; + if (pkt->sleep_time <= 0) { + pkt->sleep_time = 0; + atomic_inc(&pkt->run_sm); + } + } + + if (kthread_should_stop()) + break; + } +work_to_do: + set_current_state(TASK_RUNNING); + remove_wait_queue(&pd->wqueue, &wait); + + if (kthread_should_stop()) + break; + + /* + * if pkt_handle_queue returns true, we can queue + * another request. + */ + while (pkt_handle_queue(pd)) + ; + + /* + * Handle packet state machine + */ + pkt_handle_packets(pd); + + /* + * Handle iosched queues + */ + pkt_iosched_process_queue(pd); + } + + return 0; +} + +static void pkt_print_settings(struct pktcdvd_device *pd) +{ + pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n", + pd->settings.fp ? "Fixed" : "Variable", + pd->settings.size >> 2, + pd->settings.block_mode == 8 ? 
'1' : '2'); +} + +static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control) +{ + memset(cgc->cmd, 0, sizeof(cgc->cmd)); + + cgc->cmd[0] = GPCMD_MODE_SENSE_10; + cgc->cmd[2] = page_code | (page_control << 6); + cgc->cmd[7] = cgc->buflen >> 8; + cgc->cmd[8] = cgc->buflen & 0xff; + cgc->data_direction = CGC_DATA_READ; + return pkt_generic_packet(pd, cgc); +} + +static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc) +{ + memset(cgc->cmd, 0, sizeof(cgc->cmd)); + memset(cgc->buffer, 0, 2); + cgc->cmd[0] = GPCMD_MODE_SELECT_10; + cgc->cmd[1] = 0x10; /* PF */ + cgc->cmd[7] = cgc->buflen >> 8; + cgc->cmd[8] = cgc->buflen & 0xff; + cgc->data_direction = CGC_DATA_WRITE; + return pkt_generic_packet(pd, cgc); +} + +static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di) +{ + struct packet_command cgc; + int ret; + + /* set up command and get the disc info */ + init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ); + cgc.cmd[0] = GPCMD_READ_DISC_INFO; + cgc.cmd[8] = cgc.buflen = 2; + cgc.quiet = 1; + + ret = pkt_generic_packet(pd, &cgc); + if (ret) + return ret; + + /* not all drives have the same disc_info length, so requeue + * packet with the length the drive tells us it can supply + */ + cgc.buflen = be16_to_cpu(di->disc_information_length) + + sizeof(di->disc_information_length); + + if (cgc.buflen > sizeof(disc_information)) + cgc.buflen = sizeof(disc_information); + + cgc.cmd[8] = cgc.buflen; + return pkt_generic_packet(pd, &cgc); +} + +static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti) +{ + struct packet_command cgc; + int ret; + + init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ); + cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO; + cgc.cmd[1] = type & 3; + cgc.cmd[4] = (track & 0xff00) >> 8; + cgc.cmd[5] = track & 0xff; + cgc.cmd[8] = 8; + cgc.quiet = 1; + + ret = pkt_generic_packet(pd, &cgc); + if (ret) + return ret; + + cgc.buflen = be16_to_cpu(ti->track_information_length) + + sizeof(ti->track_information_length); + + if (cgc.buflen > sizeof(track_information)) + cgc.buflen = sizeof(track_information); + + cgc.cmd[8] = cgc.buflen; + return pkt_generic_packet(pd, &cgc); +} + +static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, + long *last_written) +{ + disc_information di; + track_information ti; + __u32 last_track; + int ret; + + ret = pkt_get_disc_info(pd, &di); + if (ret) + return ret; + + last_track = (di.last_track_msb << 8) | di.last_track_lsb; + ret = pkt_get_track_info(pd, last_track, 1, &ti); + if (ret) + return ret; + + /* if this track is blank, try the previous. */ + if (ti.blank) { + last_track--; + ret = pkt_get_track_info(pd, last_track, 1, &ti); + if (ret) + return ret; + } + + /* if last recorded field is valid, return it. 
*/ + if (ti.lra_v) { + *last_written = be32_to_cpu(ti.last_rec_address); + } else { + /* make it up instead */ + *last_written = be32_to_cpu(ti.track_start) + + be32_to_cpu(ti.track_size); + if (ti.free_blocks) + *last_written -= (be32_to_cpu(ti.free_blocks) + 7); + } + return 0; +} + +/* + * write mode select package based on pd->settings + */ +static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + struct scsi_sense_hdr sshdr; + write_param_page *wp; + char buffer[128]; + int ret, size; + + /* doesn't apply to DVD+RW or DVD-RAM */ + if ((pd->mmc3_profile == 0x1a) || (pd->mmc3_profile == 0x12)) + return 0; + + memset(buffer, 0, sizeof(buffer)); + init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); + cgc.sshdr = &sshdr; + ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); + if (ret) { + pkt_dump_sense(pd, &cgc); + return ret; + } + + size = 2 + ((buffer[0] << 8) | (buffer[1] & 0xff)); + pd->mode_offset = (buffer[6] << 8) | (buffer[7] & 0xff); + if (size > sizeof(buffer)) + size = sizeof(buffer); + + /* + * now get it all + */ + init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); + cgc.sshdr = &sshdr; + ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); + if (ret) { + pkt_dump_sense(pd, &cgc); + return ret; + } + + /* + * write page is offset header + block descriptor length + */ + wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset]; + + wp->fp = pd->settings.fp; + wp->track_mode = pd->settings.track_mode; + wp->write_type = pd->settings.write_type; + wp->data_block_type = pd->settings.block_mode; + + wp->multi_session = 0; + +#ifdef PACKET_USE_LS + wp->link_size = 7; + wp->ls_v = 1; +#endif + + if (wp->data_block_type == PACKET_BLOCK_MODE1) { + wp->session_format = 0; + wp->subhdr2 = 0x20; + } else if (wp->data_block_type == PACKET_BLOCK_MODE2) { + wp->session_format = 0x20; + wp->subhdr2 = 8; +#if 0 + wp->mcn[0] = 0x80; + memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1); +#endif + } else { + /* + * paranoia + */ + pkt_err(pd, "write mode wrong %d\n", wp->data_block_type); + return 1; + } + wp->packet_size = cpu_to_be32(pd->settings.size >> 2); + + cgc.buflen = cgc.cmd[8] = size; + ret = pkt_mode_select(pd, &cgc); + if (ret) { + pkt_dump_sense(pd, &cgc); + return ret; + } + + pkt_print_settings(pd); + return 0; +} + +/* + * 1 -- we can write to this track, 0 -- we can't + */ +static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) +{ + switch (pd->mmc3_profile) { + case 0x1a: /* DVD+RW */ + case 0x12: /* DVD-RAM */ + /* The track is always writable on DVD+RW/DVD-RAM */ + return 1; + default: + break; + } + + if (!ti->packet || !ti->fp) + return 0; + + /* + * "good" settings as per Mt Fuji. + */ + if (ti->rt == 0 && ti->blank == 0) + return 1; + + if (ti->rt == 0 && ti->blank == 1) + return 1; + + if (ti->rt == 1 && ti->blank == 0) + return 1; + + pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); + return 0; +} + +/* + * 1 -- we can write to this disc, 0 -- we can't + */ +static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) +{ + switch (pd->mmc3_profile) { + case 0x0a: /* CD-RW */ + case 0xffff: /* MMC3 not supported */ + break; + case 0x1a: /* DVD+RW */ + case 0x13: /* DVD-RW */ + case 0x12: /* DVD-RAM */ + return 1; + default: + pkt_dbg(2, pd, "Wrong disc profile (%x)\n", + pd->mmc3_profile); + return 0; + } + + /* + * for disc type 0xff we should probably reserve a new track. 
+ * but i'm not sure, should we leave this to user apps? probably. + */ + if (di->disc_type == 0xff) { + pkt_notice(pd, "unknown disc - no track?\n"); + return 0; + } + + if (di->disc_type != 0x20 && di->disc_type != 0) { + pkt_err(pd, "wrong disc type (%x)\n", di->disc_type); + return 0; + } + + if (di->erasable == 0) { + pkt_notice(pd, "disc not erasable\n"); + return 0; + } + + if (di->border_status == PACKET_SESSION_RESERVED) { + pkt_err(pd, "can't write to last track (reserved)\n"); + return 0; + } + + return 1; +} + +static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + unsigned char buf[12]; + disc_information di; + track_information ti; + int ret, track; + + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); + cgc.cmd[0] = GPCMD_GET_CONFIGURATION; + cgc.cmd[8] = 8; + ret = pkt_generic_packet(pd, &cgc); + pd->mmc3_profile = ret ? 0xffff : buf[6] << 8 | buf[7]; + + memset(&di, 0, sizeof(disc_information)); + memset(&ti, 0, sizeof(track_information)); + + ret = pkt_get_disc_info(pd, &di); + if (ret) { + pkt_err(pd, "failed get_disc\n"); + return ret; + } + + if (!pkt_writable_disc(pd, &di)) + return -EROFS; + + pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR; + + track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ + ret = pkt_get_track_info(pd, track, 1, &ti); + if (ret) { + pkt_err(pd, "failed get_track\n"); + return ret; + } + + if (!pkt_writable_track(pd, &ti)) { + pkt_err(pd, "can't write to this track\n"); + return -EROFS; + } + + /* + * we keep packet size in 512 byte units, makes it easier to + * deal with request calculations. + */ + pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; + if (pd->settings.size == 0) { + pkt_notice(pd, "detected zero packet size!\n"); + return -ENXIO; + } + if (pd->settings.size > PACKET_MAX_SECTORS) { + pkt_err(pd, "packet size is too big\n"); + return -EROFS; + } + pd->settings.fp = ti.fp; + pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1); + + if (ti.nwa_v) { + pd->nwa = be32_to_cpu(ti.next_writable); + set_bit(PACKET_NWA_VALID, &pd->flags); + } + + /* + * in theory we could use lra on -RW media as well and just zero + * blocks that haven't been written yet, but in practice that + * is just a no-go. we'll use that for -R, naturally. 
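A quick check of the unit juggling above: the drive reports the fixed packet size in 2048-byte frames, the driver keeps pd->settings.size in 512-byte sectors (hence the << 2), and pkt_grow_pktlist() recovers the frame count with >> 2. The 32-frame value below is an assumed, typical CD-RW setting, not taken from this patch:

#include <stdio.h>

int main(void)
{
	unsigned fixed_packet_frames = 32;                 /* as reported by READ TRACK INFO */
	unsigned size_sectors = fixed_packet_frames << 2;  /* 128 sectors of 512 bytes */
	unsigned frames_again = size_sectors >> 2;         /* back to 32 frames */

	printf("%u frames = %u sectors = %u KiB\n",
	       fixed_packet_frames, size_sectors, size_sectors / 2);
	printf("frames recovered: %u\n", frames_again);
	return 0;
}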
+ */ + if (ti.lra_v) { + pd->lra = be32_to_cpu(ti.last_rec_address); + set_bit(PACKET_LRA_VALID, &pd->flags); + } else { + pd->lra = 0xffffffff; + set_bit(PACKET_LRA_VALID, &pd->flags); + } + + /* + * fine for now + */ + pd->settings.link_loss = 7; + pd->settings.write_type = 0; /* packet */ + pd->settings.track_mode = ti.track_mode; + + /* + * mode1 or mode2 disc + */ + switch (ti.data_mode) { + case PACKET_MODE1: + pd->settings.block_mode = PACKET_BLOCK_MODE1; + break; + case PACKET_MODE2: + pd->settings.block_mode = PACKET_BLOCK_MODE2; + break; + default: + pkt_err(pd, "unknown data mode\n"); + return -EROFS; + } + return 0; +} + +/* + * enable/disable write caching on drive + */ +static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, + int set) +{ + struct packet_command cgc; + struct scsi_sense_hdr sshdr; + unsigned char buf[64]; + int ret; + + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); + cgc.sshdr = &sshdr; + cgc.buflen = pd->mode_offset + 12; + + /* + * caching mode page might not be there, so quiet this command + */ + cgc.quiet = 1; + + ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0); + if (ret) + return ret; + + buf[pd->mode_offset + 10] |= (!!set << 2); + + cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); + ret = pkt_mode_select(pd, &cgc); + if (ret) { + pkt_err(pd, "write caching control failed\n"); + pkt_dump_sense(pd, &cgc); + } else if (!ret && set) + pkt_notice(pd, "enabled write caching\n"); + return ret; +} + +static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag) +{ + struct packet_command cgc; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL; + cgc.cmd[4] = lockflag ? 1 : 0; + return pkt_generic_packet(pd, &cgc); +} + +/* + * Returns drive maximum write speed + */ +static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, + unsigned *write_speed) +{ + struct packet_command cgc; + struct scsi_sense_hdr sshdr; + unsigned char buf[256+18]; + unsigned char *cap_buf; + int ret, offset; + + cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); + cgc.sshdr = &sshdr; + + ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); + if (ret) { + cgc.buflen = pd->mode_offset + cap_buf[1] + 2 + + sizeof(struct mode_page_header); + ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); + if (ret) { + pkt_dump_sense(pd, &cgc); + return ret; + } + } + + offset = 20; /* Obsoleted field, used by older drives */ + if (cap_buf[1] >= 28) + offset = 28; /* Current write speed selected */ + if (cap_buf[1] >= 30) { + /* If the drive reports at least one "Logical Unit Write + * Speed Performance Descriptor Block", use the information + * in the first block. 
(contains the highest speed) + */ + int num_spdb = (cap_buf[30] << 8) + cap_buf[31]; + if (num_spdb > 0) + offset = 34; + } + + *write_speed = (cap_buf[offset] << 8) | cap_buf[offset + 1]; + return 0; +} + +/* These tables from cdrecord - I don't have orange book */ +/* standard speed CD-RW (1-4x) */ +static char clv_to_speed[16] = { + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ + 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +/* high speed CD-RW (-10x) */ +static char hs_clv_to_speed[16] = { + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ + 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +/* ultra high speed CD-RW */ +static char us_clv_to_speed[16] = { + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ + 0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0 +}; + +/* + * reads the maximum media speed from ATIP + */ +static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, + unsigned *speed) +{ + struct packet_command cgc; + struct scsi_sense_hdr sshdr; + unsigned char buf[64]; + unsigned int size, st, sp; + int ret; + + init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ); + cgc.sshdr = &sshdr; + cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; + cgc.cmd[1] = 2; + cgc.cmd[2] = 4; /* READ ATIP */ + cgc.cmd[8] = 2; + ret = pkt_generic_packet(pd, &cgc); + if (ret) { + pkt_dump_sense(pd, &cgc); + return ret; + } + size = ((unsigned int) buf[0]<<8) + buf[1] + 2; + if (size > sizeof(buf)) + size = sizeof(buf); + + init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); + cgc.sshdr = &sshdr; + cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; + cgc.cmd[1] = 2; + cgc.cmd[2] = 4; + cgc.cmd[8] = size; + ret = pkt_generic_packet(pd, &cgc); + if (ret) { + pkt_dump_sense(pd, &cgc); + return ret; + } + + if (!(buf[6] & 0x40)) { + pkt_notice(pd, "disc type is not CD-RW\n"); + return 1; + } + if (!(buf[6] & 0x4)) { + pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n"); + return 1; + } + + st = (buf[6] >> 3) & 0x7; /* disc sub-type */ + + sp = buf[16] & 0xf; /* max speed from ATIP A1 field */ + + /* Info from cdrecord */ + switch (st) { + case 0: /* standard speed */ + *speed = clv_to_speed[sp]; + break; + case 1: /* high speed */ + *speed = hs_clv_to_speed[sp]; + break; + case 2: /* ultra high speed */ + *speed = us_clv_to_speed[sp]; + break; + default: + pkt_notice(pd, "unknown disc sub-type %d\n", st); + return 1; + } + if (*speed) { + pkt_info(pd, "maximum media speed: %d\n", *speed); + return 0; + } else { + pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st); + return 1; + } +} + +static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + struct scsi_sense_hdr sshdr; + int ret; + + pkt_dbg(2, pd, "Performing OPC\n"); + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.sshdr = &sshdr; + cgc.timeout = 60*HZ; + cgc.cmd[0] = GPCMD_SEND_OPC; + cgc.cmd[1] = 1; + ret = pkt_generic_packet(pd, &cgc); + if (ret) + pkt_dump_sense(pd, &cgc); + return ret; +} + +static int pkt_open_write(struct pktcdvd_device *pd) +{ + int ret; + unsigned int write_speed, media_write_speed, read_speed; + + ret = pkt_probe_settings(pd); + if (ret) { + pkt_dbg(2, pd, "failed probe\n"); + return ret; + } + + ret = pkt_set_write_settings(pd); + if (ret) { + pkt_dbg(1, pd, "failed saving write settings\n"); + return -EIO; + } + + pkt_write_caching(pd, USE_WCACHING); + + ret = pkt_get_max_speed(pd, &write_speed); + if (ret) + write_speed = 16 * 177; + switch (pd->mmc3_profile) { + case 0x13: /* DVD-RW */ + case 0x1a: /* DVD+RW */ + case 0x12: /* DVD-RAM */ + pkt_dbg(1, pd, 
"write speed %ukB/s\n", write_speed); + break; + default: + ret = pkt_media_speed(pd, &media_write_speed); + if (ret) + media_write_speed = 16; + write_speed = min(write_speed, media_write_speed * 177); + pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); + break; + } + read_speed = write_speed; + + ret = pkt_set_speed(pd, write_speed, read_speed); + if (ret) { + pkt_dbg(1, pd, "couldn't set write speed\n"); + return -EIO; + } + pd->write_speed = write_speed; + pd->read_speed = read_speed; + + ret = pkt_perform_opc(pd); + if (ret) { + pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); + } + + return 0; +} + +/* + * called at open time. + */ +static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) +{ + int ret; + long lba; + struct request_queue *q; + struct block_device *bdev; + + /* + * We need to re-open the cdrom device without O_NONBLOCK to be able + * to read/write from/to it. It is already opened in O_NONBLOCK mode + * so open should not fail. + */ + bdev = blkdev_get_by_dev(pd->bdev->bd_dev, FMODE_READ | FMODE_EXCL, pd); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto out; + } + + ret = pkt_get_last_written(pd, &lba); + if (ret) { + pkt_err(pd, "pkt_get_last_written failed\n"); + goto out_putdev; + } + + set_capacity(pd->disk, lba << 2); + set_capacity_and_notify(pd->bdev->bd_disk, lba << 2); + + q = bdev_get_queue(pd->bdev); + if (write) { + ret = pkt_open_write(pd); + if (ret) + goto out_putdev; + /* + * Some CDRW drives can not handle writes larger than one packet, + * even if the size is a multiple of the packet size. + */ + blk_queue_max_hw_sectors(q, pd->settings.size); + set_bit(PACKET_WRITABLE, &pd->flags); + } else { + pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); + clear_bit(PACKET_WRITABLE, &pd->flags); + } + + ret = pkt_set_segment_merging(pd, q); + if (ret) + goto out_putdev; + + if (write) { + if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { + pkt_err(pd, "not enough memory for buffers\n"); + ret = -ENOMEM; + goto out_putdev; + } + pkt_info(pd, "%lukB available on disc\n", lba << 1); + } + + return 0; + +out_putdev: + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); +out: + return ret; +} + +/* + * called when the device is closed. makes sure that the device flushes + * the internal cache before we close. 
+ */ +static void pkt_release_dev(struct pktcdvd_device *pd, int flush) +{ + if (flush && pkt_flush_cache(pd)) + pkt_dbg(1, pd, "not flushing cache\n"); + + pkt_lock_door(pd, 0); + + pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); + blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); + + pkt_shrink_pktlist(pd); +} + +static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor) +{ + if (dev_minor >= MAX_WRITERS) + return NULL; + + dev_minor = array_index_nospec(dev_minor, MAX_WRITERS); + return pkt_devs[dev_minor]; +} + +static int pkt_open(struct block_device *bdev, fmode_t mode) +{ + struct pktcdvd_device *pd = NULL; + int ret; + + mutex_lock(&pktcdvd_mutex); + mutex_lock(&ctl_mutex); + pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev)); + if (!pd) { + ret = -ENODEV; + goto out; + } + BUG_ON(pd->refcnt < 0); + + pd->refcnt++; + if (pd->refcnt > 1) { + if ((mode & FMODE_WRITE) && + !test_bit(PACKET_WRITABLE, &pd->flags)) { + ret = -EBUSY; + goto out_dec; + } + } else { + ret = pkt_open_dev(pd, mode & FMODE_WRITE); + if (ret) + goto out_dec; + /* + * needed here as well, since ext2 (among others) may change + * the blocksize at mount time + */ + set_blocksize(bdev, CD_FRAMESIZE); + } + + mutex_unlock(&ctl_mutex); + mutex_unlock(&pktcdvd_mutex); + return 0; + +out_dec: + pd->refcnt--; +out: + mutex_unlock(&ctl_mutex); + mutex_unlock(&pktcdvd_mutex); + return ret; +} + +static void pkt_close(struct gendisk *disk, fmode_t mode) +{ + struct pktcdvd_device *pd = disk->private_data; + + mutex_lock(&pktcdvd_mutex); + mutex_lock(&ctl_mutex); + pd->refcnt--; + BUG_ON(pd->refcnt < 0); + if (pd->refcnt == 0) { + int flush = test_bit(PACKET_WRITABLE, &pd->flags); + pkt_release_dev(pd, flush); + } + mutex_unlock(&ctl_mutex); + mutex_unlock(&pktcdvd_mutex); +} + + +static void pkt_end_io_read_cloned(struct bio *bio) +{ + struct packet_stacked_data *psd = bio->bi_private; + struct pktcdvd_device *pd = psd->pd; + + psd->bio->bi_status = bio->bi_status; + bio_put(bio); + bio_endio(psd->bio); + mempool_free(psd, &psd_pool); + pkt_bio_finished(pd); +} + +static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) +{ + struct bio *cloned_bio = + bio_alloc_clone(pd->bdev, bio, GFP_NOIO, &pkt_bio_set); + struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO); + + psd->pd = pd; + psd->bio = bio; + cloned_bio->bi_private = psd; + cloned_bio->bi_end_io = pkt_end_io_read_cloned; + pd->stats.secs_r += bio_sectors(bio); + pkt_queue_bio(pd, cloned_bio); +} + +static void pkt_make_request_write(struct request_queue *q, struct bio *bio) +{ + struct pktcdvd_device *pd = q->queuedata; + sector_t zone; + struct packet_data *pkt; + int was_empty, blocked_bio; + struct pkt_rb_node *node; + + zone = get_zone(bio->bi_iter.bi_sector, pd); + + /* + * If we find a matching packet in state WAITING or READ_WAIT, we can + * just append this bio to that packet. 
+ */ + spin_lock(&pd->cdrw.active_list_lock); + blocked_bio = 0; + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (pkt->sector == zone) { + spin_lock(&pkt->lock); + if ((pkt->state == PACKET_WAITING_STATE) || + (pkt->state == PACKET_READ_WAIT_STATE)) { + bio_list_add(&pkt->orig_bios, bio); + pkt->write_size += + bio->bi_iter.bi_size / CD_FRAMESIZE; + if ((pkt->write_size >= pkt->frames) && + (pkt->state == PACKET_WAITING_STATE)) { + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); + } + spin_unlock(&pkt->lock); + spin_unlock(&pd->cdrw.active_list_lock); + return; + } else { + blocked_bio = 1; + } + spin_unlock(&pkt->lock); + } + } + spin_unlock(&pd->cdrw.active_list_lock); + + /* + * Test if there is enough room left in the bio work queue + * (queue size >= congestion on mark). + * If not, wait till the work queue size is below the congestion off mark. + */ + spin_lock(&pd->lock); + if (pd->write_congestion_on > 0 + && pd->bio_queue_size >= pd->write_congestion_on) { + struct wait_bit_queue_entry wqe; + + init_wait_var_entry(&wqe, &pd->congested, 0); + for (;;) { + prepare_to_wait_event(__var_waitqueue(&pd->congested), + &wqe.wq_entry, + TASK_UNINTERRUPTIBLE); + if (pd->bio_queue_size <= pd->write_congestion_off) + break; + pd->congested = true; + spin_unlock(&pd->lock); + schedule(); + spin_lock(&pd->lock); + } + } + spin_unlock(&pd->lock); + + /* + * No matching packet found. Store the bio in the work queue. + */ + node = mempool_alloc(&pd->rb_pool, GFP_NOIO); + node->bio = bio; + spin_lock(&pd->lock); + BUG_ON(pd->bio_queue_size < 0); + was_empty = (pd->bio_queue_size == 0); + pkt_rbtree_insert(pd, node); + spin_unlock(&pd->lock); + + /* + * Wake up the worker thread. + */ + atomic_set(&pd->scan_queue, 1); + if (was_empty) { + /* This wake_up is required for correct operation */ + wake_up(&pd->wqueue); + } else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) { + /* + * This wake up is not required for correct operation, + * but improves performance in some cases. + */ + wake_up(&pd->wqueue); + } +} + +static void pkt_submit_bio(struct bio *bio) +{ + struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata; + struct bio *split; + + bio = bio_split_to_limits(bio); + + pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", + (unsigned long long)bio->bi_iter.bi_sector, + (unsigned long long)bio_end_sector(bio)); + + /* + * Clone READ bios so we can have our own bi_end_io callback. 
+ */ + if (bio_data_dir(bio) == READ) { + pkt_make_request_read(pd, bio); + return; + } + + if (!test_bit(PACKET_WRITABLE, &pd->flags)) { + pkt_notice(pd, "WRITE for ro device (%llu)\n", + (unsigned long long)bio->bi_iter.bi_sector); + goto end_io; + } + + if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) { + pkt_err(pd, "wrong bio size\n"); + goto end_io; + } + + do { + sector_t zone = get_zone(bio->bi_iter.bi_sector, pd); + sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd); + + if (last_zone != zone) { + BUG_ON(last_zone != zone + pd->settings.size); + + split = bio_split(bio, last_zone - + bio->bi_iter.bi_sector, + GFP_NOIO, &pkt_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } + + pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split); + } while (split != bio); + + return; +end_io: + bio_io_error(bio); +} + +static void pkt_init_queue(struct pktcdvd_device *pd) +{ + struct request_queue *q = pd->disk->queue; + + blk_queue_logical_block_size(q, CD_FRAMESIZE); + blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS); + q->queuedata = pd; +} + +static int pkt_seq_show(struct seq_file *m, void *p) +{ + struct pktcdvd_device *pd = m->private; + char *msg; + int states[PACKET_NUM_STATES]; + + seq_printf(m, "Writer %s mapped to %pg:\n", pd->name, pd->bdev); + + seq_printf(m, "\nSettings:\n"); + seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); + + if (pd->settings.write_type == 0) + msg = "Packet"; + else + msg = "Unknown"; + seq_printf(m, "\twrite type:\t\t%s\n", msg); + + seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable"); + seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss); + + seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode); + + if (pd->settings.block_mode == PACKET_BLOCK_MODE1) + msg = "Mode 1"; + else if (pd->settings.block_mode == PACKET_BLOCK_MODE2) + msg = "Mode 2"; + else + msg = "Unknown"; + seq_printf(m, "\tblock mode:\t\t%s\n", msg); + + seq_printf(m, "\nStatistics:\n"); + seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started); + seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended); + seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1); + seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1); + seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1); + + seq_printf(m, "\nMisc:\n"); + seq_printf(m, "\treference count:\t%d\n", pd->refcnt); + seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags); + seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed); + seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed); + seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset); + seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset); + + seq_printf(m, "\nQueue state:\n"); + seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size); + seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios)); + seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector); + + pkt_count_states(pd, states); + seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], states[3], states[4], states[5]); + + seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n", + pd->write_congestion_off, + pd->write_congestion_on); + return 0; +} + +static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) +{ + int i; + struct block_device *bdev; + struct scsi_device *sdev; + + if (pd->pkt_dev == dev) { + pkt_err(pd, "recursive setup not allowed\n"); + return 
-EBUSY; + } + for (i = 0; i < MAX_WRITERS; i++) { + struct pktcdvd_device *pd2 = pkt_devs[i]; + if (!pd2) + continue; + if (pd2->bdev->bd_dev == dev) { + pkt_err(pd, "%pg already setup\n", pd2->bdev); + return -EBUSY; + } + if (pd2->pkt_dev == dev) { + pkt_err(pd, "can't chain pktcdvd devices\n"); + return -EBUSY; + } + } + + bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + sdev = scsi_device_from_queue(bdev->bd_disk->queue); + if (!sdev) { + blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); + return -EINVAL; + } + put_device(&sdev->sdev_gendev); + + /* This is safe, since we have a reference from open(). */ + __module_get(THIS_MODULE); + + pd->bdev = bdev; + set_blocksize(bdev, CD_FRAMESIZE); + + pkt_init_queue(pd); + + atomic_set(&pd->cdrw.pending_bios, 0); + pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); + if (IS_ERR(pd->cdrw.thread)) { + pkt_err(pd, "can't start kernel thread\n"); + goto out_mem; + } + + proc_create_single_data(pd->name, 0, pkt_proc, pkt_seq_show, pd); + pkt_dbg(1, pd, "writer mapped to %pg\n", bdev); + return 0; + +out_mem: + blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + return -ENOMEM; +} + +static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) +{ + struct pktcdvd_device *pd = bdev->bd_disk->private_data; + int ret; + + pkt_dbg(2, pd, "cmd %x, dev %d:%d\n", + cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + + mutex_lock(&pktcdvd_mutex); + switch (cmd) { + case CDROMEJECT: + /* + * The door gets locked when the device is opened, so we + * have to unlock it or else the eject command fails. + */ + if (pd->refcnt == 1) + pkt_lock_door(pd, 0); + fallthrough; + /* + * forward selected CDROM ioctls to CD-ROM, for UDF + */ + case CDROMMULTISESSION: + case CDROMREADTOCENTRY: + case CDROM_LAST_WRITTEN: + case CDROM_SEND_PACKET: + case SCSI_IOCTL_SEND_COMMAND: + if (!bdev->bd_disk->fops->ioctl) + ret = -ENOTTY; + else + ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); + break; + default: + pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd); + ret = -ENOTTY; + } + mutex_unlock(&pktcdvd_mutex); + + return ret; +} + +static unsigned int pkt_check_events(struct gendisk *disk, + unsigned int clearing) +{ + struct pktcdvd_device *pd = disk->private_data; + struct gendisk *attached_disk; + + if (!pd) + return 0; + if (!pd->bdev) + return 0; + attached_disk = pd->bdev->bd_disk; + if (!attached_disk || !attached_disk->fops->check_events) + return 0; + return attached_disk->fops->check_events(attached_disk, clearing); +} + +static char *pkt_devnode(struct gendisk *disk, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "pktcdvd/%s", disk->disk_name); +} + +static const struct block_device_operations pktcdvd_ops = { + .owner = THIS_MODULE, + .submit_bio = pkt_submit_bio, + .open = pkt_open, + .release = pkt_close, + .ioctl = pkt_ioctl, + .compat_ioctl = blkdev_compat_ptr_ioctl, + .check_events = pkt_check_events, + .devnode = pkt_devnode, +}; + +/* + * Set up mapping from pktcdvd device to CD-ROM device. 
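For orientation, a hedged sketch of how userspace (e.g. a tool like pktsetup) reaches the setup path below: it opens the /dev/pktcdvd/control misc device and issues the PACKET_CTRL_CMD ioctl declared in the uapi header added later in this patch. The 11:0 device numbers are placeholders, root privileges are required, and the plain makedev() encoding only matches the kernel's new_decode_dev() for small major/minor values:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/sysmacros.h>
#include <linux/pktcdvd.h>

int main(void)
{
	struct pkt_ctrl_command c = { .command = PKT_CTRL_CMD_SETUP };
	int fd = open("/dev/pktcdvd/control", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	c.dev = makedev(11, 0);		/* placeholder CD-ROM major:minor */
	if (ioctl(fd, PACKET_CTRL_CMD, &c) < 0) {
		perror("PACKET_CTRL_CMD");
		close(fd);
		return 1;
	}
	printf("mapped to pktcdvd device %u:%u\n",
	       major(c.pkt_dev), minor(c.pkt_dev));
	close(fd);
	return 0;
}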
+ */ +static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) +{ + int idx; + int ret = -ENOMEM; + struct pktcdvd_device *pd; + struct gendisk *disk; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + for (idx = 0; idx < MAX_WRITERS; idx++) + if (!pkt_devs[idx]) + break; + if (idx == MAX_WRITERS) { + pr_err("max %d writers supported\n", MAX_WRITERS); + ret = -EBUSY; + goto out_mutex; + } + + pd = kzalloc(sizeof(struct pktcdvd_device), GFP_KERNEL); + if (!pd) + goto out_mutex; + + ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE, + sizeof(struct pkt_rb_node)); + if (ret) + goto out_mem; + + INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); + INIT_LIST_HEAD(&pd->cdrw.pkt_active_list); + spin_lock_init(&pd->cdrw.active_list_lock); + + spin_lock_init(&pd->lock); + spin_lock_init(&pd->iosched.lock); + bio_list_init(&pd->iosched.read_queue); + bio_list_init(&pd->iosched.write_queue); + sprintf(pd->name, DRIVER_NAME"%d", idx); + init_waitqueue_head(&pd->wqueue); + pd->bio_queue = RB_ROOT; + + pd->write_congestion_on = write_congestion_on; + pd->write_congestion_off = write_congestion_off; + + ret = -ENOMEM; + disk = blk_alloc_disk(NUMA_NO_NODE); + if (!disk) + goto out_mem; + pd->disk = disk; + disk->major = pktdev_major; + disk->first_minor = idx; + disk->minors = 1; + disk->fops = &pktcdvd_ops; + disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; + strcpy(disk->disk_name, pd->name); + disk->private_data = pd; + + pd->pkt_dev = MKDEV(pktdev_major, idx); + ret = pkt_new_dev(pd, dev); + if (ret) + goto out_mem2; + + /* inherit events of the host device */ + disk->events = pd->bdev->bd_disk->events; + + ret = add_disk(disk); + if (ret) + goto out_mem2; + + pkt_sysfs_dev_new(pd); + pkt_debugfs_dev_new(pd); + + pkt_devs[idx] = pd; + if (pkt_dev) + *pkt_dev = pd->pkt_dev; + + mutex_unlock(&ctl_mutex); + return 0; + +out_mem2: + put_disk(disk); +out_mem: + mempool_exit(&pd->rb_pool); + kfree(pd); +out_mutex: + mutex_unlock(&ctl_mutex); + pr_err("setup of pktcdvd device failed\n"); + return ret; +} + +/* + * Tear down mapping from pktcdvd device to CD-ROM device. + */ +static int pkt_remove_dev(dev_t pkt_dev) +{ + struct pktcdvd_device *pd; + int idx; + int ret = 0; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + for (idx = 0; idx < MAX_WRITERS; idx++) { + pd = pkt_devs[idx]; + if (pd && (pd->pkt_dev == pkt_dev)) + break; + } + if (idx == MAX_WRITERS) { + pr_debug("dev not setup\n"); + ret = -ENXIO; + goto out; + } + + if (pd->refcnt > 0) { + ret = -EBUSY; + goto out; + } + if (!IS_ERR(pd->cdrw.thread)) + kthread_stop(pd->cdrw.thread); + + pkt_devs[idx] = NULL; + + pkt_debugfs_dev_remove(pd); + pkt_sysfs_dev_remove(pd); + + blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); + + remove_proc_entry(pd->name, pkt_proc); + pkt_dbg(1, pd, "writer unmapped\n"); + + del_gendisk(pd->disk); + put_disk(pd->disk); + + mempool_exit(&pd->rb_pool); + kfree(pd); + + /* This is safe: open() is still holding a reference. 
*/ + module_put(THIS_MODULE); + +out: + mutex_unlock(&ctl_mutex); + return ret; +} + +static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) +{ + struct pktcdvd_device *pd; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index); + if (pd) { + ctrl_cmd->dev = new_encode_dev(pd->bdev->bd_dev); + ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev); + } else { + ctrl_cmd->dev = 0; + ctrl_cmd->pkt_dev = 0; + } + ctrl_cmd->num_devices = MAX_WRITERS; + + mutex_unlock(&ctl_mutex); +} + +static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct pkt_ctrl_command ctrl_cmd; + int ret = 0; + dev_t pkt_dev = 0; + + if (cmd != PACKET_CTRL_CMD) + return -ENOTTY; + + if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command))) + return -EFAULT; + + switch (ctrl_cmd.command) { + case PKT_CTRL_CMD_SETUP: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + ret = pkt_setup_dev(new_decode_dev(ctrl_cmd.dev), &pkt_dev); + ctrl_cmd.pkt_dev = new_encode_dev(pkt_dev); + break; + case PKT_CTRL_CMD_TEARDOWN: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + ret = pkt_remove_dev(new_decode_dev(ctrl_cmd.pkt_dev)); + break; + case PKT_CTRL_CMD_STATUS: + pkt_get_status(&ctrl_cmd); + break; + default: + return -ENOTTY; + } + + if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command))) + return -EFAULT; + return ret; +} + +#ifdef CONFIG_COMPAT +static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +static const struct file_operations pkt_ctl_fops = { + .open = nonseekable_open, + .unlocked_ioctl = pkt_ctl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = pkt_ctl_compat_ioctl, +#endif + .owner = THIS_MODULE, + .llseek = no_llseek, +}; + +static struct miscdevice pkt_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = DRIVER_NAME, + .nodename = "pktcdvd/control", + .fops = &pkt_ctl_fops +}; + +static int __init pkt_init(void) +{ + int ret; + + mutex_init(&ctl_mutex); + + ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE, + sizeof(struct packet_stacked_data)); + if (ret) + return ret; + ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0); + if (ret) { + mempool_exit(&psd_pool); + return ret; + } + + ret = register_blkdev(pktdev_major, DRIVER_NAME); + if (ret < 0) { + pr_err("unable to register block device\n"); + goto out2; + } + if (!pktdev_major) + pktdev_major = ret; + + ret = pkt_sysfs_init(); + if (ret) + goto out; + + pkt_debugfs_init(); + + ret = misc_register(&pkt_misc); + if (ret) { + pr_err("unable to register misc device\n"); + goto out_misc; + } + + pkt_proc = proc_mkdir("driver/"DRIVER_NAME, NULL); + + return 0; + +out_misc: + pkt_debugfs_cleanup(); + pkt_sysfs_cleanup(); +out: + unregister_blkdev(pktdev_major, DRIVER_NAME); +out2: + mempool_exit(&psd_pool); + bioset_exit(&pkt_bio_set); + return ret; +} + +static void __exit pkt_exit(void) +{ + remove_proc_entry("driver/"DRIVER_NAME, NULL); + misc_deregister(&pkt_misc); + + pkt_debugfs_cleanup(); + pkt_sysfs_cleanup(); + + unregister_blkdev(pktdev_major, DRIVER_NAME); + mempool_exit(&psd_pool); + bioset_exit(&pkt_bio_set); +} + +MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives"); +MODULE_AUTHOR("Jens Axboe "); +MODULE_LICENSE("GPL"); + +module_init(pkt_init); +module_exit(pkt_exit); diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h new file mode 100644 index 
000000000000..f9c5ac80d59b --- /dev/null +++ b/include/linux/pktcdvd.h @@ -0,0 +1,197 @@ +/* + * Copyright (C) 2000 Jens Axboe + * Copyright (C) 2001-2004 Peter Osterlund + * + * May be copied or modified under the terms of the GNU General Public + * License. See linux/COPYING for more information. + * + * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and + * DVD-RW devices. + * + */ +#ifndef __PKTCDVD_H +#define __PKTCDVD_H + +#include +#include +#include +#include +#include +#include +#include + +/* default bio write queue congestion marks */ +#define PKT_WRITE_CONGESTION_ON 10000 +#define PKT_WRITE_CONGESTION_OFF 9000 + + +struct packet_settings +{ + __u32 size; /* packet size in (512 byte) sectors */ + __u8 fp; /* fixed packets */ + __u8 link_loss; /* the rest is specified + * as per Mt Fuji */ + __u8 write_type; + __u8 track_mode; + __u8 block_mode; +}; + +/* + * Very crude stats for now + */ +struct packet_stats +{ + unsigned long pkt_started; + unsigned long pkt_ended; + unsigned long secs_w; + unsigned long secs_rg; + unsigned long secs_r; +}; + +struct packet_cdrw +{ + struct list_head pkt_free_list; + struct list_head pkt_active_list; + spinlock_t active_list_lock; /* Serialize access to pkt_active_list */ + struct task_struct *thread; + atomic_t pending_bios; +}; + +/* + * Switch to high speed reading after reading this many kilobytes + * with no interspersed writes. + */ +#define HI_SPEED_SWITCH 512 + +struct packet_iosched +{ + atomic_t attention; /* Set to non-zero when queue processing is needed */ + int writing; /* Non-zero when writing, zero when reading */ + spinlock_t lock; /* Protecting read/write queue manipulations */ + struct bio_list read_queue; + struct bio_list write_queue; + sector_t last_write; /* The sector where the last write ended */ + int successive_reads; +}; + +/* + * 32 buffers of 2048 bytes + */ +#if (PAGE_SIZE % CD_FRAMESIZE) != 0 +#error "PAGE_SIZE must be a multiple of CD_FRAMESIZE" +#endif +#define PACKET_MAX_SIZE 128 +#define FRAMES_PER_PAGE (PAGE_SIZE / CD_FRAMESIZE) +#define PACKET_MAX_SECTORS (PACKET_MAX_SIZE * CD_FRAMESIZE >> 9) + +enum packet_data_state { + PACKET_IDLE_STATE, /* Not used at the moment */ + PACKET_WAITING_STATE, /* Waiting for more bios to arrive, so */ + /* we don't have to do as much */ + /* data gathering */ + PACKET_READ_WAIT_STATE, /* Waiting for reads to fill in holes */ + PACKET_WRITE_WAIT_STATE, /* Waiting for the write to complete */ + PACKET_RECOVERY_STATE, /* Recover after read/write errors */ + PACKET_FINISHED_STATE, /* After write has finished */ + + PACKET_NUM_STATES /* Number of possible states */ +}; + +/* + * Information needed for writing a single packet + */ +struct pktcdvd_device; + +struct packet_data +{ + struct list_head list; + + spinlock_t lock; /* Lock protecting state transitions and */ + /* orig_bios list */ + + struct bio_list orig_bios; /* Original bios passed to pkt_make_request */ + /* that will be handled by this packet */ + int write_size; /* Total size of all bios in the orig_bios */ + /* list, measured in number of frames */ + + struct bio *w_bio; /* The bio we will send to the real CD */ + /* device once we have all data for the */ + /* packet we are going to write */ + sector_t sector; /* First sector in this packet */ + int frames; /* Number of frames in this packet */ + + enum packet_data_state state; /* Current state */ + atomic_t run_sm; /* Incremented whenever the state */ + /* machine needs to be run */ + long sleep_time; /* Set this to non-zero to make the state */ + /* 
machine run after this many jiffies. */ + + atomic_t io_wait; /* Number of pending IO operations */ + atomic_t io_errors; /* Number of read/write errors during IO */ + + struct bio *r_bios[PACKET_MAX_SIZE]; /* bios to use during data gathering */ + struct page *pages[PACKET_MAX_SIZE / FRAMES_PER_PAGE]; + + int cache_valid; /* If non-zero, the data for the zone defined */ + /* by the sector variable is completely cached */ + /* in the pages[] vector. */ + + int id; /* ID number for debugging */ + struct pktcdvd_device *pd; +}; + +struct pkt_rb_node { + struct rb_node rb_node; + struct bio *bio; +}; + +struct packet_stacked_data +{ + struct bio *bio; /* Original read request bio */ + struct pktcdvd_device *pd; +}; +#define PSD_POOL_SIZE 64 + +struct pktcdvd_device +{ + struct block_device *bdev; /* dev attached */ + dev_t pkt_dev; /* our dev */ + char name[20]; + struct packet_settings settings; + struct packet_stats stats; + int refcnt; /* Open count */ + int write_speed; /* current write speed, kB/s */ + int read_speed; /* current read speed, kB/s */ + unsigned long offset; /* start offset */ + __u8 mode_offset; /* 0 / 8 */ + __u8 type; + unsigned long flags; + __u16 mmc3_profile; + __u32 nwa; /* next writable address */ + __u32 lra; /* last recorded address */ + struct packet_cdrw cdrw; + wait_queue_head_t wqueue; + + spinlock_t lock; /* Serialize access to bio_queue */ + struct rb_root bio_queue; /* Work queue of bios we need to handle */ + int bio_queue_size; /* Number of nodes in bio_queue */ + bool congested; /* Someone is waiting for bio_queue_size + * to drop. */ + sector_t current_sector; /* Keep track of where the elevator is */ + atomic_t scan_queue; /* Set to non-zero when pkt_handle_queue */ + /* needs to be run. */ + mempool_t rb_pool; /* mempool for pkt_rb_node allocations */ + + struct packet_iosched iosched; + struct gendisk *disk; + + int write_congestion_off; + int write_congestion_on; + + struct device *dev; /* sysfs pktcdvd[0-7] dev */ + + struct dentry *dfs_d_root; /* debugfs: devname directory */ + struct dentry *dfs_f_info; /* debugfs: info file */ +}; + +#endif /* __PKTCDVD_H */ diff --git a/include/uapi/linux/pktcdvd.h b/include/uapi/linux/pktcdvd.h new file mode 100644 index 000000000000..9cbb55d21c94 --- /dev/null +++ b/include/uapi/linux/pktcdvd.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2000 Jens Axboe + * Copyright (C) 2001-2004 Peter Osterlund + * + * May be copied or modified under the terms of the GNU General Public + * License. See linux/COPYING for more information. + * + * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and + * DVD-RW devices. + * + */ +#ifndef _UAPI__PKTCDVD_H +#define _UAPI__PKTCDVD_H + +#include + +/* + * 1 for normal debug messages, 2 is very verbose. 0 to turn it off. + */ +#define PACKET_DEBUG 1 + +#define MAX_WRITERS 8 + +#define PKT_RB_POOL_SIZE 512 + +/* + * How long we should hold a non-full packet before starting data gathering. + */ +#define PACKET_WAIT_TIME (HZ * 5 / 1000) + +/* + * use drive write caching -- we need deferred error handling to be + * able to successfully recover with this option (drive will return good + * status as soon as the cdb is validated). 
+ */ +#if defined(CONFIG_CDROM_PKTCDVD_WCACHE) +#define USE_WCACHING 1 +#else +#define USE_WCACHING 0 +#endif + +/* + * No user-servicable parts beyond this point -> + */ + +/* + * device types + */ +#define PACKET_CDR 1 +#define PACKET_CDRW 2 +#define PACKET_DVDR 3 +#define PACKET_DVDRW 4 + +/* + * flags + */ +#define PACKET_WRITABLE 1 /* pd is writable */ +#define PACKET_NWA_VALID 2 /* next writable address valid */ +#define PACKET_LRA_VALID 3 /* last recorded address valid */ +#define PACKET_MERGE_SEGS 4 /* perform segment merging to keep */ + /* underlying cdrom device happy */ + +/* + * Disc status -- from READ_DISC_INFO + */ +#define PACKET_DISC_EMPTY 0 +#define PACKET_DISC_INCOMPLETE 1 +#define PACKET_DISC_COMPLETE 2 +#define PACKET_DISC_OTHER 3 + +/* + * write type, and corresponding data block type + */ +#define PACKET_MODE1 1 +#define PACKET_MODE2 2 +#define PACKET_BLOCK_MODE1 8 +#define PACKET_BLOCK_MODE2 10 + +/* + * Last session/border status + */ +#define PACKET_SESSION_EMPTY 0 +#define PACKET_SESSION_INCOMPLETE 1 +#define PACKET_SESSION_RESERVED 2 +#define PACKET_SESSION_COMPLETE 3 + +#define PACKET_MCN "4a656e734178626f65323030300000" + +#undef PACKET_USE_LS + +#define PKT_CTRL_CMD_SETUP 0 +#define PKT_CTRL_CMD_TEARDOWN 1 +#define PKT_CTRL_CMD_STATUS 2 + +struct pkt_ctrl_command { + __u32 command; /* in: Setup, teardown, status */ + __u32 dev_index; /* in/out: Device index */ + __u32 dev; /* in/out: Device nr for cdrw device */ + __u32 pkt_dev; /* in/out: Device nr for packet device */ + __u32 num_devices; /* out: Largest device index + 1 */ + __u32 padding; /* Not used */ +}; + +/* + * packet ioctls + */ +#define PACKET_IOCTL_MAGIC ('X') +#define PACKET_CTRL_CMD _IOWR(PACKET_IOCTL_MAGIC, 1, struct pkt_ctrl_command) + + +#endif /* _UAPI__PKTCDVD_H */ -- cgit From 6fe6ece398f7431784847e922a2c8c385dc58a35 Mon Sep 17 00:00:00 2001 From: Michel Dänzer Date: Wed, 21 Dec 2022 16:24:13 +0100 Subject: Revert "drm/amd/display: Enable Freesync Video Mode by default" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit de05abe6b9d0fe08f65d744f7f75a4cba4df27ad. The bug referenced below was bisected to this commit. There has been no activity toward fixing it in 3 months, so let's revert for now. 
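As a brief aside on the pktcdvd control interface defined in the UAPI header above: userspace drives the device mappings through the PACKET_CTRL_CMD ioctl on the driver's control node. The sketch below is a hedged illustration, not part of either patch in this series; the /dev/pktcdvd/control path and the PKT_CTRL_CMD_STATUS query follow conventional pktsetup usage and should be treated as assumptions.

/* Hedged sketch: query mapping slot 0 via PACKET_CTRL_CMD.  The control
 * node path is an assumption (the driver registers a misc device that is
 * normally exposed as /dev/pktcdvd/control). */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/pktcdvd.h>

int main(void)
{
    struct pkt_ctrl_command c = {
        .command   = PKT_CTRL_CMD_STATUS,
        .dev_index = 0,                 /* first mapping slot */
    };
    int fd = open("/dev/pktcdvd/control", O_RDONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (ioctl(fd, PACKET_CTRL_CMD, &c) < 0) {
        perror("PACKET_CTRL_CMD");
        close(fd);
        return 1;
    }
    printf("%u mapping(s); slot 0: cdrw dev 0x%x -> pkt dev 0x%x\n",
           c.num_devices, c.dev, c.pkt_dev);
    close(fd);
    return 0;
}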
Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2162 Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 27 +++++++++++++++++++++++ drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 +++++----- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 6b74df446694..e3e2e6e3b485 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -195,6 +195,7 @@ extern int amdgpu_emu_mode; extern uint amdgpu_smu_memory_pool_size; extern int amdgpu_smu_pptable_id; extern uint amdgpu_dc_feature_mask; +extern uint amdgpu_freesync_vid_mode; extern uint amdgpu_dc_debug_mask; extern uint amdgpu_dc_visual_confirm; extern uint amdgpu_dm_abm_level; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b4f2d61ea0d5..1353ffd08988 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -181,6 +181,7 @@ int amdgpu_mes_kiq; int amdgpu_noretry = -1; int amdgpu_force_asic_type = -1; int amdgpu_tmz = -1; /* auto */ +uint amdgpu_freesync_vid_mode; int amdgpu_reset_method = -1; /* auto */ int amdgpu_num_kcq = -1; int amdgpu_smartshift_bias; @@ -879,6 +880,32 @@ module_param_named(backlight, amdgpu_backlight, bint, 0444); MODULE_PARM_DESC(tmz, "Enable TMZ feature (-1 = auto (default), 0 = off, 1 = on)"); module_param_named(tmz, amdgpu_tmz, int, 0444); +/** + * DOC: freesync_video (uint) + * Enable the optimization to adjust front porch timing to achieve seamless + * mode change experience when setting a freesync supported mode for which full + * modeset is not needed. + * + * The Display Core will add a set of modes derived from the base FreeSync + * video mode into the corresponding connector's mode list based on commonly + * used refresh rates and VRR range of the connected display, when users enable + * this feature. From the userspace perspective, they can see a seamless mode + * change experience when the change between different refresh rates under the + * same resolution. Additionally, userspace applications such as Video playback + * can read this modeset list and change the refresh rate based on the video + * frame rate. Finally, the userspace can also derive an appropriate mode for a + * particular refresh rate based on the FreeSync Mode and add it to the + * connector's mode list. + * + * Note: This is an experimental feature. + * + * The default value: 0 (off). 
+ */ +MODULE_PARM_DESC( + freesync_video, + "Enable freesync modesetting optimization feature (0 = off (default), 1 = on)"); +module_param_named(freesync_video, amdgpu_freesync_vid_mode, uint, 0444); + /** * DOC: reset_method (int) * GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 86bc23a67d97..1b7f20a9d4ae 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -5835,7 +5835,8 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector, */ DRM_DEBUG_DRIVER("No preferred mode found\n"); } else { - recalculate_timing = is_freesync_video_mode(&mode, aconnector); + recalculate_timing = amdgpu_freesync_vid_mode && + is_freesync_video_mode(&mode, aconnector); if (recalculate_timing) { freesync_mode = get_highest_refresh_rate_mode(aconnector, false); drm_mode_copy(&saved_mode, &mode); @@ -6986,7 +6987,7 @@ static void amdgpu_dm_connector_add_freesync_modes(struct drm_connector *connect struct amdgpu_dm_connector *amdgpu_dm_connector = to_amdgpu_dm_connector(connector); - if (!edid) + if (!(amdgpu_freesync_vid_mode && edid)) return; if (amdgpu_dm_connector->max_vfreq - amdgpu_dm_connector->min_vfreq > 10) @@ -8850,7 +8851,8 @@ static int dm_update_crtc_state(struct amdgpu_display_manager *dm, * TODO: Refactor this function to allow this check to work * in all conditions. */ - if (dm_new_crtc_state->stream && + if (amdgpu_freesync_vid_mode && + dm_new_crtc_state->stream && is_timing_unchanged_for_freesync(new_crtc_state, old_crtc_state)) goto skip_modeset; @@ -8885,7 +8887,7 @@ static int dm_update_crtc_state(struct amdgpu_display_manager *dm, if (!dm_old_crtc_state->stream) goto skip_modeset; - if (dm_new_crtc_state->stream && + if (amdgpu_freesync_vid_mode && dm_new_crtc_state->stream && is_timing_unchanged_for_freesync(new_crtc_state, old_crtc_state)) { new_crtc_state->mode_changed = false; @@ -8897,7 +8899,7 @@ static int dm_update_crtc_state(struct amdgpu_display_manager *dm, set_freesync_fixed_config(dm_new_crtc_state); goto skip_modeset; - } else if (aconnector && + } else if (amdgpu_freesync_vid_mode && aconnector && is_freesync_video_mode(&new_crtc_state->mode, aconnector)) { struct drm_display_mode *high_mode; -- cgit From bd0ddcfc83d85bc30c868f2c3457312c7f1ccee2 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 3 Jan 2023 12:00:31 -0600 Subject: Revert "of: fdt: Honor CONFIG_CMDLINE* even without /chosen node" This reverts commit a7d550f82b445cf218b47a2c1a9c56e97ecb8c7a. Some arches (PPC at least) don't call early_init_dt_scan_nodes(), so moving the cmdline processing there breaks them. 
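For readers who only skim the block being moved back by this revert, the policy it implements is: CONFIG_CMDLINE_EXTEND appends the built-in string to whatever the bootloader passed, CONFIG_CMDLINE_FORCE replaces it outright, and a plain CONFIG_CMDLINE is used only when the bootloader supplied nothing. Below is a minimal userspace model of that precedence; the names are illustrative and the real logic is the #ifdef block in drivers/of/fdt.c.

/* Illustrative model of the CONFIG_CMDLINE precedence rules; not kernel
 * code.  "builtin" stands in for CONFIG_CMDLINE. */
#include <stdio.h>
#include <string.h>

#define COMMAND_LINE_SIZE 512

enum policy { POLICY_DEFAULT, POLICY_EXTEND, POLICY_FORCE };

static void apply_builtin(char *cmdline, const char *builtin, enum policy p)
{
    switch (p) {
    case POLICY_EXTEND:         /* CONFIG_CMDLINE_EXTEND: append */
        strncat(cmdline, " ", COMMAND_LINE_SIZE - strlen(cmdline) - 1);
        strncat(cmdline, builtin, COMMAND_LINE_SIZE - strlen(cmdline) - 1);
        break;
    case POLICY_FORCE:          /* CONFIG_CMDLINE_FORCE: override */
        snprintf(cmdline, COMMAND_LINE_SIZE, "%s", builtin);
        break;
    case POLICY_DEFAULT:        /* CONFIG_CMDLINE: fallback only */
        if (!cmdline[0])
            snprintf(cmdline, COMMAND_LINE_SIZE, "%s", builtin);
        break;
    }
}

int main(void)
{
    char cmdline[COMMAND_LINE_SIZE] = "console=ttyS0"; /* from /chosen */

    apply_builtin(cmdline, "quiet", POLICY_EXTEND);
    printf("%s\n", cmdline);    /* prints "console=ttyS0 quiet" */
    return 0;
}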
Reported-by: Geoff Levand Cc: Alexander Sverdlin Tested-by: Geoff Levand Reviewed-by: Alexander Sverdlin Link: https://lore.kernel.org/r/20230103-dt-cmdline-fix-v1-1-7038e88b18b6@kernel.org Signed-off-by: Rob Herring --- drivers/of/fdt.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 02cc4a285cb9..4f88e8bbdd27 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1175,6 +1175,26 @@ int __init early_init_dt_scan_chosen(char *cmdline) if (p != NULL && l > 0) strscpy(cmdline, p, min(l, COMMAND_LINE_SIZE)); + /* + * CONFIG_CMDLINE is meant to be a default in case nothing else + * managed to set the command line, unless CONFIG_CMDLINE_FORCE + * is set in which case we override whatever was found earlier. + */ +#ifdef CONFIG_CMDLINE +#if defined(CONFIG_CMDLINE_EXTEND) + strlcat(cmdline, " ", COMMAND_LINE_SIZE); + strlcat(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); +#elif defined(CONFIG_CMDLINE_FORCE) + strscpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); +#else + /* No arguments from boot loader, use kernel's cmdl*/ + if (!((char *)cmdline)[0]) + strscpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); +#endif +#endif /* CONFIG_CMDLINE */ + + pr_debug("Command line is: %s\n", (char *)cmdline); + rng_seed = of_get_flat_dt_prop(node, "rng-seed", &l); if (rng_seed && l > 0) { add_bootloader_randomness(rng_seed, l); @@ -1279,26 +1299,6 @@ void __init early_init_dt_scan_nodes(void) if (rc) pr_warn("No chosen node found, continuing without\n"); - /* - * CONFIG_CMDLINE is meant to be a default in case nothing else - * managed to set the command line, unless CONFIG_CMDLINE_FORCE - * is set in which case we override whatever was found earlier. - */ -#ifdef CONFIG_CMDLINE -#if defined(CONFIG_CMDLINE_EXTEND) - strlcat(boot_command_line, " ", COMMAND_LINE_SIZE); - strlcat(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#elif defined(CONFIG_CMDLINE_FORCE) - strscpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#else - /* No arguments from boot loader, use kernel's cmdl */ - if (!boot_command_line[0]) - strscpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#endif -#endif /* CONFIG_CMDLINE */ - - pr_debug("Command line is: %s\n", boot_command_line); - /* Setup memory, calling early_init_dt_add_memory_arch */ early_init_dt_scan_memory(); -- cgit From 064e32dc5b03114d0767893fecdaf7b5dfd8c286 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 3 Jan 2023 12:00:32 -0600 Subject: of: fdt: Honor CONFIG_CMDLINE* even without /chosen node, take 2 I do not read a strict requirement on /chosen node in either ePAPR or in Documentation/devicetree. Help text for CONFIG_CMDLINE and CONFIG_CMDLINE_EXTEND doesn't make their behavior explicitly dependent on the presence of /chosen or the presense of /chosen/bootargs. However the early check for /chosen and bailing out in early_init_dt_scan_chosen() skips CONFIG_CMDLINE handling which is not really related to /chosen node or the particular method of passing cmdline from bootloader. This leads to counterintuitive combinations (assuming CONFIG_CMDLINE_EXTEND=y): a) bootargs="foo", CONFIG_CMDLINE="bar" => cmdline=="foo bar" b) /chosen missing, CONFIG_CMDLINE="bar" => cmdline=="" c) bootargs="", CONFIG_CMDLINE="bar" => cmdline==" bar" Rework early_init_dt_scan_chosen() so that the cmdline config options are always handled. 
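The shape of the rework, independent of the details in the diff below, is to stop treating a missing /chosen node as a hard error and instead skip only the node-specific parsing. A compressed sketch of that flow (not the actual function body, which also handles bootargs, rng-seed and initrd properties):

/* Sketch of the restructured flow; property parsing is elided. */
#include <linux/libfdt.h>

static int example_scan_chosen(const void *fdt, char *cmdline)
{
    int node = fdt_path_offset(fdt, "/chosen");

    if (node < 0)
        goto handle_cmdline;    /* no /chosen: skip bootargs/rng-seed only */

    /* ... read "bootargs" into cmdline, consume "rng-seed", initrd ... */

handle_cmdline:
    /* The CONFIG_CMDLINE #ifdef block runs unconditionally here. */
    return 0;
}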
[commit msg written by Alexander Sverdlin] Cc: Alexander Sverdlin Cc: Linus Walleij Cc: Arnd Bergmann Tested-by: Geoff Levand Reviewed-by: Alexander Sverdlin Link: https://lore.kernel.org/r/20230103-dt-cmdline-fix-v1-2-7038e88b18b6@kernel.org Signed-off-by: Rob Herring --- drivers/of/fdt.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 4f88e8bbdd27..f08b25195ae7 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1163,18 +1163,32 @@ int __init early_init_dt_scan_chosen(char *cmdline) if (node < 0) node = fdt_path_offset(fdt, "/chosen@0"); if (node < 0) - return -ENOENT; + /* Handle the cmdline config options even if no /chosen node */ + goto handle_cmdline; chosen_node_offset = node; early_init_dt_check_for_initrd(node); early_init_dt_check_for_elfcorehdr(node); + rng_seed = of_get_flat_dt_prop(node, "rng-seed", &l); + if (rng_seed && l > 0) { + add_bootloader_randomness(rng_seed, l); + + /* try to clear seed so it won't be found. */ + fdt_nop_property(initial_boot_params, node, "rng-seed"); + + /* update CRC check value */ + of_fdt_crc32 = crc32_be(~0, initial_boot_params, + fdt_totalsize(initial_boot_params)); + } + /* Retrieve command line */ p = of_get_flat_dt_prop(node, "bootargs", &l); if (p != NULL && l > 0) strscpy(cmdline, p, min(l, COMMAND_LINE_SIZE)); +handle_cmdline: /* * CONFIG_CMDLINE is meant to be a default in case nothing else * managed to set the command line, unless CONFIG_CMDLINE_FORCE @@ -1195,18 +1209,6 @@ int __init early_init_dt_scan_chosen(char *cmdline) pr_debug("Command line is: %s\n", (char *)cmdline); - rng_seed = of_get_flat_dt_prop(node, "rng-seed", &l); - if (rng_seed && l > 0) { - add_bootloader_randomness(rng_seed, l); - - /* try to clear seed so it won't be found. */ - fdt_nop_property(initial_boot_params, node, "rng-seed"); - - /* update CRC check value */ - of_fdt_crc32 = crc32_be(~0, initial_boot_params, - fdt_totalsize(initial_boot_params)); - } - return 0; } -- cgit From 5401c3e0992860b11fb4b25796e4c4f1921740df Mon Sep 17 00:00:00 2001 From: Caleb Sander Date: Tue, 3 Jan 2023 16:30:21 -0700 Subject: qed: allow sleep in qed_mcp_trace_dump() By default, qed_mcp_cmd_and_union() delays 10us at a time in a loop that can run 500K times, so calls to qed_mcp_nvm_rd_cmd() may block the current thread for over 5s. We observed thread scheduling delays over 700ms in production, with stacktraces pointing to this code as the culprit. qed_mcp_trace_dump() is called from ethtool, so sleeping is permitted. It already can sleep in qed_mcp_halt(), which calls qed_mcp_cmd(). Add a "can sleep" parameter to qed_find_nvram_image() and qed_nvram_read() so they can sleep during qed_mcp_trace_dump(). qed_mcp_trace_get_meta_info() and qed_mcp_trace_read_meta(), called only by qed_mcp_trace_dump(), allow these functions to sleep. I can't tell if the other caller (qed_grc_dump_mcp_hw_dump()) can sleep, so keep b_can_sleep set to false when it calls these functions. An example stacktrace from a custom warning we added to the kernel showing a thread that has not scheduled despite long needing resched: [ 2745.362925,17] ------------[ cut here ]------------ [ 2745.362941,17] WARNING: CPU: 23 PID: 5640 at arch/x86/kernel/irq.c:233 do_IRQ+0x15e/0x1a0() [ 2745.362946,17] Thread not rescheduled for 744 ms after irq 99 [ 2745.362956,17] Modules linked in: ... 
[ 2745.363339,17] CPU: 23 PID: 5640 Comm: lldpd Tainted: P O 4.4.182+ #202104120910+6d1da174272d.61x [ 2745.363343,17] Hardware name: FOXCONN MercuryB/Quicksilver Controller, BIOS H11P1N09 07/08/2020 [ 2745.363346,17] 0000000000000000 ffff885ec07c3ed8 ffffffff8131eb2f ffff885ec07c3f20 [ 2745.363358,17] ffffffff81d14f64 ffff885ec07c3f10 ffffffff81072ac2 ffff88be98ed0000 [ 2745.363369,17] 0000000000000063 0000000000000174 0000000000000074 0000000000000000 [ 2745.363379,17] Call Trace: [ 2745.363382,17] [] dump_stack+0x8e/0xcf [ 2745.363393,17] [] warn_slowpath_common+0x82/0xc0 [ 2745.363398,17] [] warn_slowpath_fmt+0x4c/0x50 [ 2745.363404,17] [] ? rcu_irq_exit+0xae/0xc0 [ 2745.363408,17] [] do_IRQ+0x15e/0x1a0 [ 2745.363413,17] [] common_interrupt+0x89/0x89 [ 2745.363416,17] [] ? delay_tsc+0x24/0x50 [ 2745.363425,17] [] __udelay+0x34/0x40 [ 2745.363457,17] [] qed_mcp_cmd_and_union+0x36f/0x7d0 [qed] [ 2745.363473,17] [] qed_mcp_nvm_rd_cmd+0x4d/0x90 [qed] [ 2745.363490,17] [] qed_mcp_trace_dump+0x4a7/0x630 [qed] [ 2745.363504,17] [] ? qed_fw_asserts_dump+0x1d6/0x1f0 [qed] [ 2745.363520,17] [] qed_dbg_mcp_trace_get_dump_buf_size+0x37/0x80 [qed] [ 2745.363536,17] [] qed_dbg_feature_size+0x61/0xa0 [qed] [ 2745.363551,17] [] qed_dbg_all_data_size+0x247/0x260 [qed] [ 2745.363560,17] [] qede_get_regs_len+0x30/0x40 [qede] [ 2745.363566,17] [] ethtool_get_drvinfo+0xe3/0x190 [ 2745.363570,17] [] dev_ethtool+0x1362/0x2140 [ 2745.363575,17] [] ? finish_task_switch+0x76/0x260 [ 2745.363580,17] [] ? __schedule+0x3c6/0x9d0 [ 2745.363585,17] [] ? hrtimer_start_range_ns+0x1d0/0x370 [ 2745.363589,17] [] ? dev_get_by_name_rcu+0x6b/0x90 [ 2745.363594,17] [] dev_ioctl+0xe8/0x710 [ 2745.363599,17] [] sock_do_ioctl+0x48/0x60 [ 2745.363603,17] [] sock_ioctl+0x1c7/0x280 [ 2745.363608,17] [] ? 
seccomp_phase1+0x83/0x220 [ 2745.363612,17] [] do_vfs_ioctl+0x2b3/0x4e0 [ 2745.363616,17] [] SyS_ioctl+0x41/0x70 [ 2745.363619,17] [] entry_SYSCALL_64_fastpath+0x1e/0x79 [ 2745.363622,17] ---[ end trace f6954aa440266421 ]--- Fixes: c965db4446291 ("qed: Add support for debug data collection") Signed-off-by: Caleb Sander Acked-by: Alok Prasad Link: https://lore.kernel.org/r/20230103233021.1457646-1-csander@purestorage.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_debug.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 86ecb080b153..cdcead614e9f 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -1832,7 +1832,8 @@ static enum dbg_status qed_find_nvram_image(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u32 image_type, u32 *nvram_offset_bytes, - u32 *nvram_size_bytes) + u32 *nvram_size_bytes, + bool b_can_sleep) { u32 ret_mcp_resp, ret_mcp_param, ret_txn_size; struct mcp_file_att file_att; @@ -1846,7 +1847,8 @@ static enum dbg_status qed_find_nvram_image(struct qed_hwfn *p_hwfn, &ret_mcp_resp, &ret_mcp_param, &ret_txn_size, - (u32 *)&file_att, false); + (u32 *)&file_att, + b_can_sleep); /* Check response */ if (nvm_result || (ret_mcp_resp & FW_MSG_CODE_MASK) != @@ -1873,7 +1875,9 @@ static enum dbg_status qed_find_nvram_image(struct qed_hwfn *p_hwfn, static enum dbg_status qed_nvram_read(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u32 nvram_offset_bytes, - u32 nvram_size_bytes, u32 *ret_buf) + u32 nvram_size_bytes, + u32 *ret_buf, + bool b_can_sleep) { u32 ret_mcp_resp, ret_mcp_param, ret_read_size, bytes_to_copy; s32 bytes_left = nvram_size_bytes; @@ -1899,7 +1903,7 @@ static enum dbg_status qed_nvram_read(struct qed_hwfn *p_hwfn, &ret_mcp_resp, &ret_mcp_param, &ret_read_size, (u32 *)((u8 *)ret_buf + read_offset), - false)) + b_can_sleep)) return DBG_STATUS_NVRAM_READ_FAILED; /* Check response */ @@ -3380,7 +3384,8 @@ static u32 qed_grc_dump_mcp_hw_dump(struct qed_hwfn *p_hwfn, p_ptt, NVM_TYPE_HW_DUMP_OUT, &hw_dump_offset_bytes, - &hw_dump_size_bytes); + &hw_dump_size_bytes, + false); if (status != DBG_STATUS_OK) return 0; @@ -3397,7 +3402,9 @@ static u32 qed_grc_dump_mcp_hw_dump(struct qed_hwfn *p_hwfn, status = qed_nvram_read(p_hwfn, p_ptt, hw_dump_offset_bytes, - hw_dump_size_bytes, dump_buf + offset); + hw_dump_size_bytes, + dump_buf + offset, + false); if (status != DBG_STATUS_OK) { DP_NOTICE(p_hwfn, "Failed to read MCP HW Dump image from NVRAM\n"); @@ -4123,7 +4130,9 @@ static enum dbg_status qed_mcp_trace_get_meta_info(struct qed_hwfn *p_hwfn, return qed_find_nvram_image(p_hwfn, p_ptt, nvram_image_type, - trace_meta_offset, trace_meta_size); + trace_meta_offset, + trace_meta_size, + true); } /* Reads the MCP Trace meta data from NVRAM into the specified buffer */ @@ -4139,7 +4148,10 @@ static enum dbg_status qed_mcp_trace_read_meta(struct qed_hwfn *p_hwfn, /* Read meta data from NVRAM */ status = qed_nvram_read(p_hwfn, p_ptt, - nvram_offset_in_bytes, size_in_bytes, buf); + nvram_offset_in_bytes, + size_in_bytes, + buf, + true); if (status != DBG_STATUS_OK) return status; -- cgit From 2c02d41d71f90a5168391b6a5f2954112ba2307c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 3 Jan 2023 12:19:17 +0100 Subject: net/ulp: prevent ULP without clone op from entering the LISTEN status When an ULP-enabled socket enters the LISTEN status, the listener ULP data 
pointer is copied inside the child/accepted sockets by sk_clone_lock(). The relevant ULP can take care of de-duplicating the context pointer via the clone() operation, but only MPTCP and SMC implement such op. Other ULPs may end-up with a double-free at socket disposal time. We can't simply clear the ULP data at clone time, as TLS replaces the socket ops with custom ones assuming a valid TLS ULP context is available. Instead completely prevent clone-less ULP sockets from entering the LISTEN status. Fixes: 734942cc4ea6 ("tcp: ULP infrastructure") Reported-by: slipper Signed-off-by: Paolo Abeni Link: https://lore.kernel.org/r/4b80c3d1dbe3d0ab072f80450c202d9bc88b4b03.1672740602.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 14 ++++++++++++++ net/ipv4/tcp_ulp.c | 4 ++++ 2 files changed, 18 insertions(+) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 848ffc3e0239..d1f837579398 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1200,12 +1200,26 @@ void inet_csk_prepare_forced_close(struct sock *sk) } EXPORT_SYMBOL(inet_csk_prepare_forced_close); +static int inet_ulp_can_listen(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ulp_ops && !icsk->icsk_ulp_ops->clone) + return -EINVAL; + + return 0; +} + int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); int err; + err = inet_ulp_can_listen(sk); + if (unlikely(err)) + return err; + reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_ack_backlog = 0; diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 9ae50b1bd844..05b6077b9f2c 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -139,6 +139,10 @@ static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops) if (sk->sk_socket) clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); + err = -EINVAL; + if (!ulp_ops->clone && sk->sk_state == TCP_LISTEN) + goto out_err; + err = ulp_ops->init(sk); if (err) goto out_err; -- cgit From 1ac88557447088ccd15eb2f2520ce46d463c8e0b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Jan 2023 19:27:36 +0000 Subject: inet: control sockets should not use current thread task_frag Because ICMP handlers run from softirq contexts, they must not use current thread task_frag. Previously, all sockets allocated by inet_ctl_sock_create() would use the per-socket page fragment, with no chance of recursion. Fixes: 98123866fcf3 ("Treewide: Stop corrupting socket's task_frag") Reported-by: syzbot+bebc6f1acdf4cbb79b03@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Cc: Benjamin Coddington Acked-by: Guillaume Nault Link: https://lore.kernel.org/r/20230103192736.454149-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/af_inet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ab4a06be489b..6c0ec2789943 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1665,6 +1665,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, if (rc == 0) { *sk = sock->sk; (*sk)->sk_allocation = GFP_ATOMIC; + (*sk)->sk_use_task_frag = false; /* * Unhash it so that IP input processing does not even see it, * we do not wish this socket to see incoming packets. 
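The same rule generalizes to any kernel-internal socket that may be driven from softirq or other atomic context: opt it out of the task page fragment right after creation. A hedged sketch of the pattern follows; the function name is made up, and only the two sk_* assignments mirror the fix above.

/* Illustrative only: a kernel-internal datagram socket that can be
 * written to from softirq context must not borrow current->task_frag. */
#include <linux/in.h>
#include <linux/net.h>
#include <net/sock.h>

static int example_create_ctl_sock(struct net *net, struct sock **skp)
{
    struct socket *sock;
    int rc;

    rc = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
    if (rc)
        return rc;

    *skp = sock->sk;
    (*skp)->sk_allocation = GFP_ATOMIC;     /* allocations may be atomic */
    (*skp)->sk_use_task_frag = false;       /* use the per-socket frag instead */
    return 0;
}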
-- cgit From 7246210ecdd0cda97fa3e3bb15c32c6c2d9a23b5 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 27 Dec 2022 11:29:28 +0000 Subject: cifs: refcount only the selected iface during interface update When the server interface for a channel is not active anymore, we have the logic to select an alternative interface. However this was not breaking out of the loop as soon as a new alternative was found. As a result, some interfaces may get refcounted unintentionally. There was also a bug in checking if we found an alternate iface. Fixed that too. Fixes: b54034a73baf ("cifs: during reconnect, update interface if necessary") Cc: stable@vger.kernel.org # 5.19+ Signed-off-by: Shyam Prasad N Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/sess.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 9e7d9f0baa18..0b842a07e157 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -292,9 +292,10 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) continue; } kref_get(&iface->refcount); + break; } - if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { + if (list_entry_is_head(iface, &ses->iface_list, iface_head)) { rc = 1; iface = NULL; cifs_dbg(FYI, "unable to find a suitable iface\n"); -- cgit From cc7d79d4fad6a4eab3f88c4bb237de72be4478f1 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Thu, 22 Dec 2022 12:54:44 +0000 Subject: cifs: fix interface count calculation during refresh The last fix to iface_count did fix the overcounting issue. However, during each refresh, we could end up undercounting the iface_count, if a match was found. Fixing this by doing increments and decrements instead of setting it to 0 before each parsing of server interfaces. Fixes: 096bbeec7bd6 ("smb3: interface count displayed incorrectly") Cc: stable@vger.kernel.org # 6.1 Signed-off-by: Shyam Prasad N Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 0d7e9bcd9f34..e6bcd2baf446 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -530,7 +530,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, p = buf; spin_lock(&ses->iface_lock); - ses->iface_count = 0; /* * Go through iface_list and do kref_put to remove * any unused ifaces. ifaces in use will be removed @@ -540,6 +539,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, iface_head) { iface->is_active = 0; kref_put(&iface->refcount, release_iface); + ses->iface_count--; } spin_unlock(&ses->iface_lock); @@ -618,6 +618,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, /* just get a ref so that it doesn't get picked/freed */ iface->is_active = 1; kref_get(&iface->refcount); + ses->iface_count++; spin_unlock(&ses->iface_lock); goto next_iface; } else if (ret < 0) { -- cgit From a53da43decaa3936998fa7dce2346855a6942166 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 1 Jan 2023 15:07:09 +0900 Subject: kbuild: fix single *.ko build The single *.ko build is broken since commit f65a486821cf ("kbuild: change module.order to list *.o instead of *.ko"). 
Fixes: f65a486821cf ("kbuild: change module.order to list *.o instead of *.ko") Reported-by: Marc Kleine-Budde Signed-off-by: Masahiro Yamada Tested-by: Marc Kleine-Budde --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c05b4fb7121e..dfba294ae790 100644 --- a/Makefile +++ b/Makefile @@ -1986,7 +1986,7 @@ $(single-no-ko): $(build-dir) # Remove MODORDER when done because it is not the real one. PHONY += single_modules single_modules: $(single-no-ko) modules_prepare - $(Q){ $(foreach m, $(single-ko), echo $(extmod_prefix)$m;) } > $(MODORDER) + $(Q){ $(foreach m, $(single-ko), echo $(extmod_prefix)$(m:%.ko=%.o);) } > $(MODORDER) $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost ifneq ($(KBUILD_MODPOST_NOFINAL),1) $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modfinal -- cgit From 735aec59afb18c3e2da0a637037e69ad62dbda6a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 4 Jan 2023 23:04:59 +0900 Subject: kbuild: readd -w option when vmlinux.o or Module.symver is missing Commit 63ffe00d8c93 ("kbuild: Fix running modpost with musl libc") accidentally turned the unresolved symbol warnings into errors when vmlinux.o (for in-tree builds) or Module.symver (for external module builds) is missing. In those cases, unresolved symbols are expected, but the -w option is not set because 'missing-input' is referenced before set. Move $(missing-input) back to the original place. This should be fine for musl libc because vmlinux.o and -w are not added at the same time. With this change, -w may be passed twice, but it is not a big deal. Link: https://lore.kernel.org/all/b56a03b8-2a2a-f833-a5d2-cdc50a7ca2bb@cschramm.eu/ Fixes: 63ffe00d8c93 ("kbuild: Fix running modpost with musl libc") Reported-by: Christopher Schramm Signed-off-by: Masahiro Yamada Tested-by: Samuel Holland --- scripts/Makefile.modpost | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 0ee296cf520c..43343e13c542 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -44,6 +44,7 @@ modpost-args = \ $(if $(CONFIG_MODVERSIONS),-m) \ $(if $(CONFIG_MODULE_SRCVERSION_ALL),-a) \ $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ + $(if $(KBUILD_MODPOST_WARN),-w) \ $(if $(KBUILD_NSDEPS),-d $(MODULES_NSDEPS)) \ $(if $(CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS)$(KBUILD_NSDEPS),-N) \ -o $@ @@ -55,10 +56,6 @@ ifneq ($(findstring i,$(filter-out --%,$(MAKEFLAGS))),) modpost-args += -n endif -ifneq ($(KBUILD_MODPOST_WARN)$(missing-input),) -modpost-args += -w -endif - # Read out modules.order to pass in modpost. # Otherwise, allmodconfig would fail with "Argument list too long". ifdef KBUILD_MODULES @@ -124,6 +121,10 @@ modpost-args += -e $(addprefix -i , $(KBUILD_EXTRA_SYMBOLS)) endif # ($(KBUILD_EXTMOD),) +ifneq ($(missing-input),) +modpost-args += -w +endif + quiet_cmd_modpost = MODPOST $@ cmd_modpost = \ $(if $(missing-input), \ -- cgit From fe69230f05897b3de758427b574fc98025dfc907 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Wed, 4 Jan 2023 14:51:46 +0800 Subject: caif: fix memory leak in cfctrl_linkup_request() When linktype is unknown or kzalloc failed in cfctrl_linkup_request(), pkt is not released. Add release process to error path. 
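The underlying pattern is worth spelling out, since it recurs across several of these fixes: every object allocated before a failure point must be released on that failure path. Below is a generic sketch, not the caif code itself, showing the single-error-label shape that keeps such cleanups in one place.

/* Generic illustration of the leak being fixed: pkt is allocated first,
 * so both the "bad link type" and the failed kzalloc() paths must free it. */
#include <linux/errno.h>
#include <linux/slab.h>

struct example_req { int cmd; };

static int example_linkup(int linktype)
{
    void *pkt;
    struct example_req *req;
    int err;

    pkt = kzalloc(128, GFP_KERNEL);         /* stands in for cfpkt_create() */
    if (!pkt)
        return -ENOMEM;

    if (linktype < 0 || linktype > 7) {     /* "bad link type" branch */
        err = -EINVAL;
        goto err_free_pkt;
    }

    req = kzalloc(sizeof(*req), GFP_KERNEL);
    if (!req) {
        err = -ENOMEM;
        goto err_free_pkt;
    }

    /* ... in the real code req is queued and pkt is transmitted;
     * this toy simply cleans up and succeeds ... */
    kfree(req);
    kfree(pkt);
    return 0;

err_free_pkt:
    kfree(pkt);
    return err;
}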
Fixes: b482cd2053e3 ("net-caif: add CAIF core protocol stack") Fixes: 8d545c8f958f ("caif: Disconnect without waiting for response") Signed-off-by: Zhengchao Shao Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230104065146.1153009-1-shaozhengchao@huawei.com Signed-off-by: Paolo Abeni --- net/caif/cfctrl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index cc405d8c7c30..8480684f2762 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -269,11 +269,15 @@ int cfctrl_linkup_request(struct cflayer *layer, default: pr_warn("Request setup of bad link type = %d\n", param->linktype); + cfpkt_destroy(pkt); return -EINVAL; } req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) + if (!req) { + cfpkt_destroy(pkt); return -ENOMEM; + } + req->client_layer = user_layer; req->cmd = CFCTRL_CMD_LINK_SETUP; req->param = *param; -- cgit From 634cf6ead93988b0da9ac054521ab63a3ba189db Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 15 Dec 2022 18:02:28 +0100 Subject: fbdev: omapfb: avoid stack overflow warning The dsi_irq_stats structure is a little too big to fit on the stack of a 32-bit task, depending on the specific gcc options: fbdev/omap2/omapfb/dss/dsi.c: In function 'dsi_dump_dsidev_irqs': fbdev/omap2/omapfb/dss/dsi.c:1621:1: error: the frame size of 1064 bytes is larger than 1024 bytes [-Werror=frame-larger-than=] Since this is only a debugfs file, performance is not critical, so just dynamically allocate it, and print an error message in there in place of a failure code when the allocation fails. Signed-off-by: Arnd Bergmann Signed-off-by: Helge Deller --- drivers/video/fbdev/omap2/omapfb/dss/dsi.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c index 54b0f034c2ed..7cddb7b8ae34 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c @@ -1536,22 +1536,28 @@ static void dsi_dump_dsidev_irqs(struct platform_device *dsidev, { struct dsi_data *dsi = dsi_get_dsidrv_data(dsidev); unsigned long flags; - struct dsi_irq_stats stats; + struct dsi_irq_stats *stats; + + stats = kzalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) { + seq_printf(s, "out of memory\n"); + return; + } spin_lock_irqsave(&dsi->irq_stats_lock, flags); - stats = dsi->irq_stats; + *stats = dsi->irq_stats; memset(&dsi->irq_stats, 0, sizeof(dsi->irq_stats)); dsi->irq_stats.last_reset = jiffies; spin_unlock_irqrestore(&dsi->irq_stats_lock, flags); seq_printf(s, "period %u ms\n", - jiffies_to_msecs(jiffies - stats.last_reset)); + jiffies_to_msecs(jiffies - stats->last_reset)); - seq_printf(s, "irqs %d\n", stats.irq_count); + seq_printf(s, "irqs %d\n", stats->irq_count); #define PIS(x) \ - seq_printf(s, "%-20s %10d\n", #x, stats.dsi_irqs[ffs(DSI_IRQ_##x)-1]) + seq_printf(s, "%-20s %10d\n", #x, stats->dsi_irqs[ffs(DSI_IRQ_##x)-1]) seq_printf(s, "-- DSI%d interrupts --\n", dsi->module_id + 1); PIS(VC0); @@ -1575,10 +1581,10 @@ static void dsi_dump_dsidev_irqs(struct platform_device *dsidev, #define PIS(x) \ seq_printf(s, "%-20s %10d %10d %10d %10d\n", #x, \ - stats.vc_irqs[0][ffs(DSI_VC_IRQ_##x)-1], \ - stats.vc_irqs[1][ffs(DSI_VC_IRQ_##x)-1], \ - stats.vc_irqs[2][ffs(DSI_VC_IRQ_##x)-1], \ - stats.vc_irqs[3][ffs(DSI_VC_IRQ_##x)-1]); + stats->vc_irqs[0][ffs(DSI_VC_IRQ_##x)-1], \ + stats->vc_irqs[1][ffs(DSI_VC_IRQ_##x)-1], \ + stats->vc_irqs[2][ffs(DSI_VC_IRQ_##x)-1], \ + 
stats->vc_irqs[3][ffs(DSI_VC_IRQ_##x)-1]); seq_printf(s, "-- VC interrupts --\n"); PIS(CS); @@ -1594,7 +1600,7 @@ static void dsi_dump_dsidev_irqs(struct platform_device *dsidev, #define PIS(x) \ seq_printf(s, "%-20s %10d\n", #x, \ - stats.cio_irqs[ffs(DSI_CIO_IRQ_##x)-1]); + stats->cio_irqs[ffs(DSI_CIO_IRQ_##x)-1]); seq_printf(s, "-- CIO interrupts --\n"); PIS(ERRSYNCESC1); @@ -1618,6 +1624,8 @@ static void dsi_dump_dsidev_irqs(struct platform_device *dsidev, PIS(ULPSACTIVENOT_ALL0); PIS(ULPSACTIVENOT_ALL1); #undef PIS + + kfree(stats); } static void dsi1_dump_irqs(struct seq_file *s) -- cgit From 4b9880dbf3bdba3a7c56445137c3d0e30aaa0a40 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 5 Jan 2023 22:05:04 +1100 Subject: powerpc/vmlinux.lds: Define RUNTIME_DISCARD_EXIT The powerpc linker script explicitly includes .exit.text, because otherwise the link fails due to references from __bug_table and __ex_table. The code is freed (discarded) at runtime along with .init.text and data. That has worked in the past despite powerpc not defining RUNTIME_DISCARD_EXIT because DISCARDS appears late in the powerpc linker script (line 410), and the explicit inclusion of .exit.text earlier (line 280) supersedes the discard. However commit 99cb0d917ffa ("arch: fix broken BuildID for arm64 and riscv") introduced an earlier use of DISCARD as part of the RO_DATA macro (line 136). With binutils < 2.36 that causes the DISCARD directives later in the script to be applied earlier [1], causing .exit.text to actually be discarded at link time, leading to build errors: '.exit.text' referenced in section '__bug_table' of crypto/algboss.o: defined in discarded section '.exit.text' of crypto/algboss.o '.exit.text' referenced in section '__ex_table' of drivers/nvdimm/core.o: defined in discarded section '.exit.text' of drivers/nvdimm/core.o Fix it by defining RUNTIME_DISCARD_EXIT, which causes the generic DISCARDS macro to not include .exit.text at all. 1: https://lore.kernel.org/lkml/87fscp2v7k.fsf@igel.home/ Fixes: 99cb0d917ffa ("arch: fix broken BuildID for arm64 and riscv") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230105132349.384666-1-mpe@ellerman.id.au --- arch/powerpc/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 8c3862b4c259..c5ea7d03d539 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -8,6 +8,7 @@ #define BSS_FIRST_SECTIONS *(.bss.prominit) #define EMITS_PT_NOTE #define RO_EXCEPTION_TABLE_ALIGN 0 +#define RUNTIME_DISCARD_EXIT #define SOFT_MASK_TABLE(align) \ . = ALIGN(align); \ -- cgit From 07b050f9290ee012a407a0f64151db902a1520f5 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 5 Jan 2023 22:28:36 +1100 Subject: powerpc/vmlinux.lds: Don't discard .rela* for relocatable builds Relocatable kernels must not discard relocations, they need to be processed at runtime. As such they are included for CONFIG_RELOCATABLE builds in the powerpc linker script (line 340). However they are also unconditionally discarded later in the script (line 414). Previously that worked because the earlier inclusion superseded the discard. However commit 99cb0d917ffa ("arch: fix broken BuildID for arm64 and riscv") introduced an earlier use of DISCARD as part of the RO_DATA macro (line 137). 
With binutils < 2.36 that causes the DISCARD directives later in the script to be applied earlier, causing .rela* to actually be discarded at link time, leading to build warnings and a kernel that doesn't boot: ld: warning: discarding dynamic section .rela.init.rodata Fix it by conditionally discarding .rela* only when CONFIG_RELOCATABLE is disabled. Fixes: 99cb0d917ffa ("arch: fix broken BuildID for arm64 and riscv") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230105132349.384666-2-mpe@ellerman.id.au --- arch/powerpc/kernel/vmlinux.lds.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index c5ea7d03d539..a4c6efadc90c 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -411,9 +411,12 @@ SECTIONS DISCARDS /DISCARD/ : { *(*.EMB.apuinfo) - *(.glink .iplt .plt .rela* .comment) + *(.glink .iplt .plt .comment) *(.gnu.version*) *(.gnu.attributes) *(.eh_frame) +#ifndef CONFIG_RELOCATABLE + *(.rela*) +#endif } } -- cgit From be5f95c8779e19779dd81927c8574fec5aaba36c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 5 Jan 2023 22:42:59 +1100 Subject: powerpc/vmlinux.lds: Don't discard .comment Although the powerpc linker script mentions .comment in the DISCARD section, that has never actually caused it to be discarded, because the earlier ELF_DETAILS macro (previously STABS_DEBUG) explicitly includes .comment. However commit 99cb0d917ffa ("arch: fix broken BuildID for arm64 and riscv") introduced an earlier use of DISCARD as part of the RO_DATA macro. With binutils < 2.36 that causes the DISCARD directives later in the script to be applied earlier, causing .comment to actually be discarded. It's confusing to explicitly include and discard .comment, and even more so if the behaviour depends on the toolchain version. So don't discard .comment in order to maintain the existing behaviour in all cases. Fixes: 83a092cf95f2 ("powerpc: Link warning for orphan sections") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230105132349.384666-3-mpe@ellerman.id.au --- arch/powerpc/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index a4c6efadc90c..958e77a24f85 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -411,7 +411,7 @@ SECTIONS DISCARDS /DISCARD/ : { *(*.EMB.apuinfo) - *(.glink .iplt .plt .comment) + *(.glink .iplt .plt) *(.gnu.version*) *(.gnu.attributes) *(.eh_frame) -- cgit From 12521a5d5cb7ff0ad43eadfc9c135d86e1131fa8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Jan 2023 10:49:15 +0000 Subject: io_uring: fix CQ waiting timeout handling Jiffy to ktime CQ waiting conversion broke how we treat timeouts, in particular we rearm it anew every time we get into io_cqring_wait_schedule() without adjusting the timeout. Waiting for 2 CQEs and getting a task_work in the middle may double the timeout value, or even worse in some cases task may wait indefinitely. 
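The general rule the fix restores: when a wait loop can be re-entered because of early wakeups (here, task_work), the absolute deadline must be computed once and reused, never re-derived per pass. A userspace illustration of that principle, deliberately unrelated to io_uring internals:

/* Conceptual demo (userspace): a wait loop that can be woken early must
 * reuse one absolute deadline, or spurious wakeups extend the wait. */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static bool deadline_passed(const struct timespec *deadline)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);
    return now.tv_sec > deadline->tv_sec ||
           (now.tv_sec == deadline->tv_sec && now.tv_nsec >= deadline->tv_nsec);
}

int main(void)
{
    struct timespec deadline;

    /* Compute the absolute deadline exactly once. */
    clock_gettime(CLOCK_MONOTONIC, &deadline);
    deadline.tv_sec += 2;

    while (!deadline_passed(&deadline)) {
        /* Sleep until the *absolute* deadline; an early wakeup (a signal
         * here, task_work in the kernel case) re-enters the loop without
         * pushing the deadline further out. */
        clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &deadline, NULL);
    }
    puts("deadline reached without drift");
    return 0;
}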
Cc: stable@vger.kernel.org Fixes: 228339662b398 ("io_uring: don't convert to jiffies for waiting on timeouts") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f7bffddd71b08f28a877d44d37ac953ddb01590d.1672915663.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 472574192dd6..2ac1cd8d23ea 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2470,7 +2470,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - ktime_t timeout) + ktime_t *timeout) { int ret; unsigned long check_cq; @@ -2488,7 +2488,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) return -EBADR; } - if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) + if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS)) return -ETIME; /* @@ -2564,7 +2564,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, } prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, timeout); + ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); if (__io_cqring_events_user(ctx) >= min_events) break; cond_resched(); -- cgit From decb17aeb8fa21484a0140c0696dc5a477cc5c57 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jan 2023 09:50:20 +0000 Subject: KVM: arm64: vgic: Add Apple M2 cpus to the list of broken SEIS implementations I really hoped that Apple had fixed their not-quite-a-vgic implementation when moving from M1 to M2. Alas, it seems they didn't, and running a buggy EFI version results in the vgic generating SErrors outside of the guest and taking the host down. Apply the same workaround as for M1. Yes, this is all a bit crap. 
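For context on how the new entries take effect: the vgic code matches the running CPU's MIDR against this table and, on a hit, avoids relying on the broken locally generated SErrors. A rough sketch of the lookup, mirroring the existing M1 handling rather than anything introduced by this patch:

/* Sketch: how an arm64 MIDR quirk table is consulted (the real check
 * lives in vgic_v3_broken_seis() and also looks at ICH_VTR_EL2.SEIS). */
#include <asm/cputype.h>

static bool example_cpu_in_broken_seis_list(void)
{
    return is_midr_in_range_list(read_cpuid_id(), broken_seis);
}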
Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230103095022.3230946-2-maz@kernel.org --- arch/arm64/include/asm/cputype.h | 4 ++++ arch/arm64/kvm/vgic/vgic-v3.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 4e8b66c74ea2..683ca3af4084 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -124,6 +124,8 @@ #define APPLE_CPU_PART_M1_FIRESTORM_PRO 0x025 #define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028 #define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 +#define APPLE_CPU_PART_M2_BLIZZARD 0x032 +#define APPLE_CPU_PART_M2_AVALANCHE 0x033 #define AMPERE_CPU_PART_AMPERE1 0xAC3 @@ -177,6 +179,8 @@ #define MIDR_APPLE_M1_FIRESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_PRO) #define MIDR_APPLE_M1_ICESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_MAX) #define MIDR_APPLE_M1_FIRESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_MAX) +#define MIDR_APPLE_M2_BLIZZARD MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD) +#define MIDR_APPLE_M2_AVALANCHE MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 826ff6f2a4e7..2074521d4a8c 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -616,6 +616,8 @@ static const struct midr_range broken_seis[] = { MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), {}, }; -- cgit From 36d7546b56a254709a38f1904231e1a93f1c5717 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jan 2023 12:39:33 +0000 Subject: MAINTAINERS: Add Zenghui Yu as a KVM/arm64 reviewer Zenghui has been around for quite some time, and has been instrumental in reviewing the GICv4/4.1 KVM support. I'm delighted that he's agreed to help with the patch review in a more official capacity! Acked-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230103123933.3234865-1-maz@kernel.org --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f61eb221415b..551544d877a3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11359,6 +11359,7 @@ R: James Morse R: Alexandru Elisei R: Suzuki K Poulose R: Oliver Upton +R: Zenghui Yu L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: kvmarm@lists.linux.dev L: kvmarm@lists.cs.columbia.edu (deprecated, moderated for non-subscribers) -- cgit From 7ed906e576a782b8272727f39c68a1762ea3ef98 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 3 Jan 2023 12:07:36 +0000 Subject: MAINTAINERS: Remove myself as a KVM/arm64 reviewer Haven't done any meaningful reviews for more than a year, and it doesn't look like I'll be able to do so in the future. Make it official and remove myself from the KVM/arm64 "Reviewers" list. 
Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230103120736.116523-1-alexandru.elisei@arm.com --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 551544d877a3..6a0fc23455bd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11356,7 +11356,6 @@ F: virt/kvm/* KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64) M: Marc Zyngier R: James Morse -R: Alexandru Elisei R: Suzuki K Poulose R: Oliver Upton R: Zenghui Yu -- cgit From b2b50d572135c5c6e10c2ff79cd828d5a8141ef6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Jan 2023 16:37:53 -0800 Subject: block: Remove "select SRCU" Now that the SRCU Kconfig option is unconditionally selected, there is no longer any point in selecting it. Therefore, remove the "select SRCU" Kconfig statements. Signed-off-by: Paul E. McKenney Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Jens Axboe --- block/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/block/Kconfig b/block/Kconfig index 444c5ab3b67e..5d9d9c84d516 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -6,7 +6,6 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y select SBITMAP - select SRCU help Provide block layer support for the kernel. -- cgit From e95d50d74b93a767a026f588e8de0b9718a0105e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 5 Jan 2023 13:23:39 +0200 Subject: lib/scatterlist: Fix to merge contiguous pages into the last SG properly When sg_alloc_append_table_from_pages() calls to pages_are_mergeable() in its 'sgt_append->prv' flow to check whether it can merge contiguous pages into the last SG, it passes the page arguments in the wrong order. The first parameter should be the next candidate page to be merged to the last page and not the opposite. The current code leads to a corrupted SG which resulted in OOPs and unexpected errors when non-contiguous pages are merged wrongly. Fix to pass the page parameters in the right order. 
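To see why the order matters, note that the helper answers a directional question ("does this page start immediately after that one"), so swapping the arguments silently inverts the test. A simplified model is sketched below; the real helper additionally requires ZONE_DEVICE pages to share a pgmap.

/* Simplified model of pages_are_mergeable(): true when @page is the page
 * immediately following @prev_page.  The fixed call site asks whether
 * pages[0] can be appended after last_pg, i.e. model(pages[0], last_pg). */
#include <linux/mm.h>

static bool example_pages_mergeable(struct page *page, struct page *prev_page)
{
    return page_to_pfn(page) == page_to_pfn(prev_page) + 1;
}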
Fixes: 1567b49d1a40 ("lib/scatterlist: add check when merging zone device pages") Link: https://lore.kernel.org/r/20230105112339.107969-1-yishaih@nvidia.com Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Reviewed-by: Logan Gunthorpe Signed-off-by: Jason Gunthorpe --- lib/scatterlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index a0ad2a7959b5..f72aa50c6654 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -476,7 +476,7 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append, /* Merge contiguous pages into the last SG */ prv_len = sgt_append->prv->length; last_pg = sg_page(sgt_append->prv); - while (n_pages && pages_are_mergeable(last_pg, pages[0])) { + while (n_pages && pages_are_mergeable(pages[0], last_pg)) { if (sgt_append->prv->length + PAGE_SIZE > max_segment) break; sgt_append->prv->length += PAGE_SIZE; -- cgit From b2d473a6019ef9a54b0156ecdb2e0398c9fa6a24 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 2 Jan 2023 17:07:48 +0100 Subject: riscv, kprobes: Stricter c.jr/c.jalr decoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the compressed instruction extension, c.jr, c.jalr, c.mv, and c.add is encoded the following way (each instruction is 16b): ---+-+-----------+-----------+-- 100 0 rs1[4:0]!=0 00000 10 : c.jr 100 1 rs1[4:0]!=0 00000 10 : c.jalr 100 0 rd[4:0]!=0 rs2[4:0]!=0 10 : c.mv 100 1 rd[4:0]!=0 rs2[4:0]!=0 10 : c.add The following logic is used to decode c.jr and c.jalr: insn & 0xf007 == 0x8002 => instruction is an c.jr insn & 0xf007 == 0x9002 => instruction is an c.jalr When 0xf007 is used to mask the instruction, c.mv can be incorrectly decoded as c.jr, and c.add as c.jalr. Correct the decoding by changing the mask from 0xf007 to 0xf07f. Fixes: c22b0bcb1dd0 ("riscv: Add kprobes supported") Signed-off-by: Björn Töpel Reviewed-by: Conor Dooley Reviewed-by: Guo Ren Link: https://lore.kernel.org/r/20230102160748.1307289-1-bjorn@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/probes/simulate-insn.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/probes/simulate-insn.h b/arch/riscv/kernel/probes/simulate-insn.h index cb6ff7dccb92..de8474146a9b 100644 --- a/arch/riscv/kernel/probes/simulate-insn.h +++ b/arch/riscv/kernel/probes/simulate-insn.h @@ -31,9 +31,9 @@ __RISCV_INSN_FUNCS(fence, 0x7f, 0x0f); } while (0) __RISCV_INSN_FUNCS(c_j, 0xe003, 0xa001); -__RISCV_INSN_FUNCS(c_jr, 0xf007, 0x8002); +__RISCV_INSN_FUNCS(c_jr, 0xf07f, 0x8002); __RISCV_INSN_FUNCS(c_jal, 0xe003, 0x2001); -__RISCV_INSN_FUNCS(c_jalr, 0xf007, 0x9002); +__RISCV_INSN_FUNCS(c_jalr, 0xf07f, 0x9002); __RISCV_INSN_FUNCS(c_beqz, 0xe003, 0xc001); __RISCV_INSN_FUNCS(c_bnez, 0xe003, 0xe001); __RISCV_INSN_FUNCS(c_ebreak, 0xffff, 0x9002); -- cgit From b9b916aee6715cd7f3318af6dc360c4729417b94 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 29 Dec 2022 17:05:45 +0000 Subject: riscv: uaccess: fix type of 0 variable on error in get_user() If the get_user(x, ptr) has x as a pointer, then the setting of (x) = 0 is going to produce the following sparse warning, so fix this by forcing the type of 'x' when access_ok() fails. 
fs/aio.c:2073:21: warning: Using plain integer as NULL pointer Signed-off-by: Ben Dooks Reviewed-by: Palmer Dabbelt Link: https://lore.kernel.org/r/20221229170545.718264-1-ben-linux@fluff.org Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 855450bed9f5..ec0cab9fbddd 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -165,7 +165,7 @@ do { \ might_fault(); \ access_ok(__p, sizeof(*__p)) ? \ __get_user((x), __p) : \ - ((x) = 0, -EFAULT); \ + ((x) = (__force __typeof__(x))0, -EFAULT); \ }) #define __put_user_asm(insn, x, ptr, err) \ -- cgit From 1a5a23b9bdf6bde0e5185ca834ff6e806cc2aaaf Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 30 Dec 2022 14:54:27 +0800 Subject: usb: fotg210-udc: fix error return code in fotg210_udc_probe() After commit 5f217ccd520f ("fotg210-udc: Support optional external PHY"), the error code is re-assigned to 0 in fotg210_udc_probe(), if allocate or map memory fails after the assignment, it can't return an error code. Set the error code to -ENOMEM to fix this problem. Fixes: 5f217ccd520f ("fotg210-udc: Support optional external PHY") Signed-off-by: Yang Yingliang Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20221230065427.944586-1-yangyingliang@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/fotg210/fotg210-udc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/fotg210/fotg210-udc.c b/drivers/usb/fotg210/fotg210-udc.c index 66e1b7ee3346..87cca81bf4ac 100644 --- a/drivers/usb/fotg210/fotg210-udc.c +++ b/drivers/usb/fotg210/fotg210-udc.c @@ -1201,6 +1201,8 @@ int fotg210_udc_probe(struct platform_device *pdev) dev_info(dev, "found and initialized PHY\n"); } + ret = -ENOMEM; + for (i = 0; i < FOTG210_MAX_NUM_EP; i++) { fotg210->ep[i] = kzalloc(sizeof(struct fotg210_ep), GFP_KERNEL); if (!fotg210->ep[i]) -- cgit From 83c7423d1eb6806d13c521d1002cc1a012111719 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Dec 2022 17:45:51 +0100 Subject: udf: Fix extension of the last extent in the file When extending the last extent in the file within the last block, we wrongly computed the length of the last extent. This is mostly a cosmetical problem since the extent does not contain any data and the length will be fixed up by following operations but still. 
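A quick worked example of the sign error in the udf fix: if the last extent currently records 1024 bytes and the extension should bring it to 1536, the delta must be +512; with the operands reversed the subtraction goes the wrong way (and, with unsigned arithmetic, wraps), corrupting extLength and i_lenExtents until later code papers over it. The values below are illustrative.

/* Tiny demo of the reversed subtraction. */
#include <stdio.h>

int main(void)
{
    unsigned int old_len = 1024;    /* last_ext->extLength & LENGTH_MASK */
    unsigned int new_elen = 1536;   /* desired length within the last block */

    printf("buggy : %u\n", old_len - new_elen);  /* wraps to 4294966784 */
    printf("fixed : %u\n", new_elen - old_len);  /* 512 */
    return 0;
}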
Fixes: 1f3868f06855 ("udf: Fix extending file within last block") Signed-off-by: Jan Kara --- fs/udf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1d7c2a812fc1..f3e988928d1d 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -595,7 +595,7 @@ static void udf_do_extend_final_block(struct inode *inode, */ if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) return; - added_bytes = (last_ext->extLength & UDF_EXTENT_LENGTH_MASK) - new_elen; + added_bytes = new_elen - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); last_ext->extLength += added_bytes; UDF_I(inode)->i_lenExtents += added_bytes; -- cgit From 23970a1c9475b305770fd37bebfec7a10f263787 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 30 Dec 2022 12:53:41 -0500 Subject: udf: initialize newblock to 0 The clang build reports this error fs/udf/inode.c:805:6: error: variable 'newblock' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized] if (*err < 0) ^~~~~~~~ newblock is never set before error handling jump. Initialize newblock to 0 and remove redundant settings. Fixes: d8b39db5fab8 ("udf: Handle error when adding extent to a file") Reported-by: Nathan Chancellor Signed-off-by: Tom Rix Signed-off-by: Jan Kara Message-Id: <20221230175341.1629734-1-trix@redhat.com> --- fs/udf/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f3e988928d1d..34e416327dd4 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -684,7 +684,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; - udf_pblk_t newblocknum, newblock; + udf_pblk_t newblocknum, newblock = 0; sector_t offset = 0; int8_t etype; struct udf_inode_info *iinfo = UDF_I(inode); @@ -787,7 +787,6 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len); if (ret < 0) { *err = ret; - newblock = 0; goto out_free; } c = 0; @@ -852,7 +851,6 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, goal, err); if (!newblocknum) { *err = -ENOSPC; - newblock = 0; goto out_free; } if (isBeyondEOF) -- cgit From e498a04443240c15c3c857165f7b652b87f4fd96 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 3 Jan 2023 13:17:46 +0100 Subject: usb: dwc3: xilinx: include linux/gpio/consumer.h The newly added gpio consumer calls cause a build failure in configurations that fail to include the right header implicitly: drivers/usb/dwc3/dwc3-xilinx.c: In function 'dwc3_xlnx_init_zynqmp': drivers/usb/dwc3/dwc3-xilinx.c:207:22: error: implicit declaration of function 'devm_gpiod_get_optional'; did you mean 'devm_clk_get_optional'? 
[-Werror=implicit-function-declaration] 207 | reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW); | ^~~~~~~~~~~~~~~~~~~~~~~ | devm_clk_get_optional Fixes: ca05b38252d7 ("usb: dwc3: xilinx: Add gpio-reset support") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230103121755.956027-1-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-xilinx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc3/dwc3-xilinx.c b/drivers/usb/dwc3/dwc3-xilinx.c index 8607d4c23283..0745e9f11b2e 100644 --- a/drivers/usb/dwc3/dwc3-xilinx.c +++ b/drivers/usb/dwc3/dwc3-xilinx.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include -- cgit From c4e3ef5685393c5051b52cf1e94b8891d49793ab Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Thu, 8 Dec 2022 16:50:35 -0800 Subject: usb: dwc3: gadget: Ignore End Transfer delay on teardown If we delay sending End Transfer for Setup TRB to be prepared, we need to check if the End Transfer was in preparation for a driver teardown/soft-disconnect. In those cases, just send the End Transfer command without delay. In the case of soft-disconnect, there's a very small chance the command may not go through immediately. But should it happen, the Setup TRB will be prepared during the polling of the controller halted state, allowing the command to go through then. In the case of disabling endpoint due to reconfiguration (e.g. set_interface(alt-setting) or usb reset), then it's driven by the host. Typically the host wouldn't immediately cancel the control request and send another control transfer to trigger the End Transfer command timeout. Fixes: 4db0fbb60136 ("usb: dwc3: gadget: Don't delay End Transfer on delayed_status") Cc: stable@vger.kernel.org Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/f1617a323e190b9cc408fb8b65456e32b5814113.1670546756.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 789976567f9f..89dcfac01235 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1727,6 +1727,7 @@ static int __dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force, bool int else if (!ret) dep->flags |= DWC3_EP_END_TRANSFER_PENDING; + dep->flags &= ~DWC3_EP_DELAY_STOP; return ret; } @@ -3732,8 +3733,10 @@ void dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force, if (dep->number <= 1 && dwc->ep0state != EP0_DATA_PHASE) return; + if (interrupt && (dep->flags & DWC3_EP_DELAY_STOP)) + return; + if (!(dep->flags & DWC3_EP_TRANSFER_STARTED) || - (dep->flags & DWC3_EP_DELAY_STOP) || (dep->flags & DWC3_EP_END_TRANSFER_PENDING)) return; -- cgit From cb7a95af78d29442b8294683eca4897544b8ef46 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 4 Jan 2023 11:06:28 -0800 Subject: hfs/hfsplus: avoid WARN_ON() for sanity check, use proper error handling Commit 55d1cbbbb29e ("hfs/hfsplus: use WARN_ON for sanity check") fixed a build warning by turning a comment into a WARN_ON(), but it turns out that syzbot then complains because it can trigger said warning with a corrupted hfs image. The warning actually does warn about a bad situation, but we are much better off just handling it as the error it is. So rather than warn about us doing bad things, stop doing the bad things and return -EIO. 
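The shape of that conversion, as a minimal self-contained sketch (hypothetical record type and names, not the hfs code itself):

	#include <errno.h>
	#include <stddef.h>

	struct record { size_t len; };

	/* Was: WARN_ON(rec->len < need); followed by reading the record anyway. */
	int write_record(const struct record *rec, size_t need)
	{
		int res = -EIO;

		if (rec->len < need)	/* the sanity check now fails hard... */
			goto out;	/* ...instead of warning and reading a short record */

		/* read, modify and write the on-disk record here */
		res = 0;
	out:
		/* common cleanup goes here (hfs_find_exit() in the real function) */
		return res;
	}

The patch below applies this pattern to hfs_write_inode().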
While at it, also fix a memory leak that was introduced by an earlier fix for a similar syzbot warning situation, and add a check for one case that historically wasn't handled at all (ie neither comment nor subsequent WARN_ON). Reported-by: syzbot+7bb7cd3595533513a9e7@syzkaller.appspotmail.com Fixes: 55d1cbbbb29e ("hfs/hfsplus: use WARN_ON for sanity check") Fixes: 8d824e69d9f3 ("hfs: fix OOB Read in __hfs_brec_find") Link: https://lore.kernel.org/lkml/000000000000dbce4e05f170f289@google.com/ Tested-by: Michael Schmitz Cc: Arnd Bergmann Cc: Matthew Wilcox Cc: Viacheslav Dubeyko Signed-off-by: Linus Torvalds --- fs/hfs/inode.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 9c329a365e75..3a155c1d810e 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -458,15 +458,16 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) /* panic? */ return -EIO; + res = -EIO; if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN) - return -EIO; + goto out; fd.search_key->cat = HFS_I(main_inode)->cat_key; if (hfs_brec_find(&fd)) - /* panic? */ goto out; if (S_ISDIR(main_inode->i_mode)) { - WARN_ON(fd.entrylength < sizeof(struct hfs_cat_dir)); + if (fd.entrylength < sizeof(struct hfs_cat_dir)) + goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_dir)); if (rec.type != HFS_CDR_DIR || @@ -479,6 +480,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_dir)); } else if (HFS_IS_RSRC(inode)) { + if (fd.entrylength < sizeof(struct hfs_cat_file)) + goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); hfs_inode_write_fork(inode, rec.file.RExtRec, @@ -486,7 +489,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); } else { - WARN_ON(fd.entrylength < sizeof(struct hfs_cat_file)); + if (fd.entrylength < sizeof(struct hfs_cat_file)) + goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); if (rec.type != HFS_CDR_FIL || @@ -503,9 +507,10 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); } + res = 0; out: hfs_find_exit(&fd); - return 0; + return res; } static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, -- cgit From 1382999aa0548a171a272ca817f6c38e797c458c Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 6 Jan 2023 04:01:56 +0100 Subject: tpm: Allow system suspend to continue when TPM suspend fails TPM 1 is sometimes broken across system suspends, due to races or locking issues or something else that haven't been diagnosed or fixed yet, most likely having to do with concurrent reads from the TPM's hardware random number generator driver. These issues prevent the system from actually suspending, with errors like: tpm tpm0: A TPM error (28) occurred continue selftest ... tpm tpm0: A TPM error (28) occurred attempting get random ... 
tpm tpm0: Error (28) sending savestate before suspend tpm_tis 00:08: PM: __pnp_bus_suspend(): tpm_pm_suspend+0x0/0x80 returns 28 tpm_tis 00:08: PM: dpm_run_callback(): pnp_bus_suspend+0x0/0x10 returns 28 tpm_tis 00:08: PM: failed to suspend: error 28 PM: Some devices failed to suspend, or early wake event detected This issue was partially fixed by 23393c646142 ("char: tpm: Protect tpm_pm_suspend with locks"), in a last minute 6.1 commit that Linus took directly because the TPM maintainers weren't available. However, it seems like this just addresses the most common cases of the bug, rather than addressing it entirely. So there are more things to fix still, apparently. In lieu of actually fixing the underlying bug, just allow system suspend to continue, so that laptops still go to sleep fine. Later, this can be reverted when the real bug is fixed. Link: https://lore.kernel.org/lkml/7cbe96cf-e0b5-ba63-d1b4-f63d2e826efa@suse.cz/ Cc: stable@vger.kernel.org # 6.1+ Reported-by: Vlastimil Babka Suggested-by: Linus Torvalds Acked-by: Luigi Semenzato Cc: Peter Huewe Cc: Jarkko Sakkinen Cc: James Bottomley Cc: Johannes Altmanninger Signed-off-by: Jason A. Donenfeld Signed-off-by: Linus Torvalds --- drivers/char/tpm/tpm-interface.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index d69905233aff..7e513b771832 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -412,7 +412,9 @@ int tpm_pm_suspend(struct device *dev) } suspended: - return rc; + if (rc) + dev_err(dev, "Ignoring error %d while suspending\n", rc); + return 0; } EXPORT_SYMBOL_GPL(tpm_pm_suspend); -- cgit From b7bfaa761d760e72a969d116517eaa12e404c262 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 8 Jan 2023 11:49:43 -0600 Subject: Linux 6.2-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index dfba294ae790..460716314fb3 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 2 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- cgit From 74905e3de8adf0e6b5d7f455dcd32cdec13dfb6c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 9 Nov 2022 06:03:03 -0500 Subject: KVM: nSVM: clarify recalc_intercepts() wrt CR8 The mysterious comment "We only want the cr8 intercept bits of L1" dates back to basically the introduction of nested SVM, back when the handling of "less typical" hypervisors was very haphazard. With the development of kvm-unit-tests for interrupt handling, the same code grew another vmcb_clr_intercept for the interrupt window (VINTR) vmexit, this time with a comment that is at least decent. It turns out however that the same comment applies to the CR8 write intercept, which is also a "recheck if an interrupt should be injected" intercept. The CR8 read intercept instead has not been used by KVM for 14 years (commit 649d68643ebf, "KVM: SVM: sync TPR value to V_TPR field in the VMCB"), so do not bother clearing it and let one comment describe both CR8 write and VINTR handling. 
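For context, clearing an intercept is just clearing a bit in the control area that is rebuilt for running L2, so the hunk below merely drops L0's own interest in CR8-write and interrupt-window exits while V_INTR_MASKING is set (they remain irrelevant to L0, as the new comment says). A simplified stand-in for the helper, with hypothetical types and word count rather than the kernel's definition:

	#include <stdint.h>

	#define NUM_INTERCEPT_WORDS 5	/* hypothetical; the real count comes from the SVM headers */

	struct ctl_area {
		uint32_t intercepts[NUM_INTERCEPT_WORDS];
	};

	/* Mirrors the effect of vmcb_clr_intercept(): drop one intercept bit. */
	static inline void clr_intercept(struct ctl_area *c, uint32_t bit)
	{
		c->intercepts[bit / 32] &= ~(UINT32_C(1) << (bit % 32));
	}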
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index bc9cd7086fa9..add65dd59756 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -138,15 +138,13 @@ void recalc_intercepts(struct vcpu_svm *svm) c->intercepts[i] = h->intercepts[i]; if (g->int_ctl & V_INTR_MASKING_MASK) { - /* We only want the cr8 intercept bits of L1 */ - vmcb_clr_intercept(c, INTERCEPT_CR8_READ); - vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); - /* - * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not - * affect any interrupt we may want to inject; therefore, - * interrupt window vmexits are irrelevant to L0. + * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8 + * does not affect any interrupt we may want to inject; + * therefore, writes to CR8 are irrelevant to L0, as are + * interrupt window vmexits. */ + vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); vmcb_clr_intercept(c, INTERCEPT_VINTR); } -- cgit From 45e966fcca03ecdcccac7cb236e16eea38cc18af Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 22 Oct 2022 04:17:53 -0400 Subject: KVM: x86: Do not return host topology information from KVM_GET_SUPPORTED_CPUID Passing the host topology to the guest is almost certainly wrong and will confuse the scheduler. In addition, several fields of these CPUID leaves vary on each processor; it is simply impossible to return the right values from KVM_GET_SUPPORTED_CPUID in such a way that they can be passed to KVM_SET_CPUID2. The values that will most likely prevent confusion are all zeroes. Userspace will have to override it anyway if it wishes to present a specific topology to the guest. Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 14 ++++++++++++++ arch/x86/kvm/cpuid.c | 32 ++++++++++++++++---------------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index deb494f759ed..d8ea37dfddf4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8310,6 +8310,20 @@ CPU[EAX=1]:ECX[24] (TSC_DEADLINE) is not reported by ``KVM_GET_SUPPORTED_CPUID`` It can be enabled if ``KVM_CAP_TSC_DEADLINE_TIMER`` is present and the kernel has enabled in-kernel emulation of the local APIC. +CPU topology +~~~~~~~~~~~~ + +Several CPUID values include topology information for the host CPU: +0x0b and 0x1f for Intel systems, 0x8000001e for AMD systems. Different +versions of KVM return different values for this information and userspace +should not rely on it. Currently they return all zeroes. + +If userspace wishes to set up a guest topology, it should be careful that +the values of these three leaves differ for each CPU. In particular, +the APIC ID is found in EDX for all subleaves of 0x0b and 0x1f, and in EAX +for 0x8000001e; the latter also encodes the core id and node id in bits +7:0 of EBX and ECX respectively. 
+ Obsolete ioctls and capabilities ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b14653b61470..596061c1610e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -770,16 +770,22 @@ struct kvm_cpuid_array { int nent; }; +static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array) +{ + if (array->nent >= array->maxnent) + return NULL; + + return &array->entries[array->nent++]; +} + static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, u32 function, u32 index) { - struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry2 *entry = get_next_cpuid(array); - if (array->nent >= array->maxnent) + if (!entry) return NULL; - entry = &array->entries[array->nent++]; - memset(entry, 0, sizeof(*entry)); entry->function = function; entry->index = index; @@ -956,22 +962,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = edx.full; break; } - /* - * Per Intel's SDM, the 0x1f is a superset of 0xb, - * thus they can be handled by common code. - */ case 0x1f: case 0xb: /* - * Populate entries until the level type (ECX[15:8]) of the - * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is - * the starting entry, filled by the primary do_host_cpuid(). + * No topology; a valid topology is indicated by the presence + * of subleaf 1. */ - for (i = 1; entry->ecx & 0xff00; ++i) { - entry = do_host_cpuid(array, function, i); - if (!entry) - goto out; - } + entry->eax = entry->ebx = entry->ecx = 0; break; case 0xd: { u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm(); @@ -1202,6 +1199,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = entry->ecx = entry->edx = 0; break; case 0x8000001e: + /* Do not return host topology information. */ + entry->eax = entry->ebx = entry->ecx = 0; + entry->edx = 0; /* reserved */ break; case 0x8000001F: if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) { -- cgit From 3a9ae31ac26a58d33008c42f6cd022afc2af2dc0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 9 Jan 2023 06:02:16 -0500 Subject: Documentation: kvm: fix SRCU locking order docs kvm->srcu is taken in KVM_RUN and several other vCPU ioctls, therefore vcpu->mutex is susceptible to the same deadlock that is documented for kvm->slots_lock. The same holds for kvm->lock, since kvm->lock is held outside vcpu->mutex. Fix the documentation and rearrange it to highlight the difference between these locks and kvm->slots_arch_lock, and how kvm->slots_arch_lock can be useful while processing a vmexit. Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/locking.rst | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index a3ca76f9be75..5ee017740d55 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -24,17 +24,18 @@ The acquisition orders for mutexes are as follows: For SRCU: -- ``synchronize_srcu(&kvm->srcu)`` is called _inside_ - the kvm->slots_lock critical section, therefore kvm->slots_lock - cannot be taken inside a kvm->srcu read-side critical section. - Instead, kvm->slots_arch_lock is released before the call - to ``synchronize_srcu()`` and _can_ be taken inside a - kvm->srcu read-side critical section. - -- kvm->lock is taken inside kvm->srcu, therefore - ``synchronize_srcu(&kvm->srcu)`` cannot be called inside - a kvm->lock critical section. 
If you cannot delay the - call until after kvm->lock is released, use ``call_srcu``. +- ``synchronize_srcu(&kvm->srcu)`` is called inside critical sections + for kvm->lock, vcpu->mutex and kvm->slots_lock. These locks _cannot_ + be taken inside a kvm->srcu read-side critical section; that is, the + following is broken:: + + srcu_read_lock(&kvm->srcu); + mutex_lock(&kvm->slots_lock); + +- kvm->slots_arch_lock instead is released before the call to + ``synchronize_srcu()``. It _can_ therefore be taken inside a + kvm->srcu read-side critical section, for example while processing + a vmexit. On x86: -- cgit From 23e60258aeafb04e5dd813f03cb0c8ab7b01462a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Jan 2023 18:06:48 +0000 Subject: KVM: x86/xen: Fix lockdep warning on "recursive" gpc locking In commit 5ec3289b31 ("KVM: x86/xen: Compatibility fixes for shared runstate area") we declared it safe to obtain two gfn_to_pfn_cache locks at the same time: /* * The guest's runstate_info is split across two pages and we * need to hold and validate both GPCs simultaneously. We can * declare a lock ordering GPC1 > GPC2 because nothing else * takes them more than one at a time. */ However, we forgot to tell lockdep. Do so, by setting a subclass on the first lock before taking the second. Fixes: 5ec3289b31 ("KVM: x86/xen: Compatibility fixes for shared runstate area") Suggested-by: Peter Zijlstra Signed-off-by: David Woodhouse Message-Id: <20230111180651.14394-1-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/xen.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 2e29bdc2949c..bfa9809721b5 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -304,8 +304,10 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * The guest's runstate_info is split across two pages and we * need to hold and validate both GPCs simultaneously. We can * declare a lock ordering GPC1 > GPC2 because nothing else - * takes them more than one at a time. + * takes them more than one at a time. Set a subclass on the + * gpc1 lock to make lockdep shut up about it. */ + lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_); read_lock(&gpc2->lock); if (!kvm_gpc_check(gpc2, user_len2)) { -- cgit From bbe17c625d6843e9cdf14d81fbece1b0f0c3fb2f Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Jan 2023 18:06:49 +0000 Subject: KVM: x86/xen: Fix potential deadlock in kvm_xen_update_runstate_guest() The kvm_xen_update_runstate_guest() function can be called when the vCPU is being scheduled out, from a preempt notifier. It *opportunistically* updates the runstate area in the guest memory, if the gfn_to_pfn_cache which caches the appropriate address is still valid. If there is *contention* when it attempts to obtain gpc->lock, then locking inside the priority inheritance checks may cause a deadlock. Lockdep reports: [13890.148997] Chain exists of: &gpc->lock --> &p->pi_lock --> &rq->__lock [13890.149002] Possible unsafe locking scenario: [13890.149003] CPU0 CPU1 [13890.149004] ---- ---- [13890.149005] lock(&rq->__lock); [13890.149007] lock(&p->pi_lock); [13890.149009] lock(&rq->__lock); [13890.149011] lock(&gpc->lock); [13890.149013] *** DEADLOCK *** In the general case, if there's contention for a read lock on gpc->lock, that's going to be because something else is either invalidating or revalidating the cache. 
Either way, we've raced with seeing it in an invalid state, in which case we would have aborted the opportunistic update anyway. So in the 'atomic' case when called from the preempt notifier, just switch to using read_trylock() and avoid the PI handling altogether. Signed-off-by: David Woodhouse Message-Id: <20230111180651.14394-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/xen.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index bfa9809721b5..651f9c5b873d 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -271,7 +271,15 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * Attempt to obtain the GPC lock on *both* (if there are two) * gfn_to_pfn caches that cover the region. */ - read_lock_irqsave(&gpc1->lock, flags); + if (atomic) { + local_irq_save(flags); + if (!read_trylock(&gpc1->lock)) { + local_irq_restore(flags); + return; + } + } else { + read_lock_irqsave(&gpc1->lock, flags); + } while (!kvm_gpc_check(gpc1, user_len1)) { read_unlock_irqrestore(&gpc1->lock, flags); @@ -308,7 +316,14 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * gpc1 lock to make lockdep shut up about it. */ lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_); - read_lock(&gpc2->lock); + if (atomic) { + if (!read_trylock(&gpc2->lock)) { + read_unlock_irqrestore(&gpc1->lock, flags); + return; + } + } else { + read_lock(&gpc2->lock); + } if (!kvm_gpc_check(gpc2, user_len2)) { read_unlock(&gpc2->lock); -- cgit From 42a90008f890afc41837dfeec1f0b1e7bcecf94a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Jan 2023 18:06:50 +0000 Subject: KVM: Ensure lockdep knows about kvm->lock vs. vcpu->mutex ordering rule Documentation/virt/kvm/locking.rst tells us that kvm->lock is taken outside vcpu->mutex. But that doesn't actually happen very often; it's only in some esoteric cases like migration with AMD SEV. This means that lockdep usually doesn't notice, and doesn't do its job of keeping us honest. Ensure that lockdep *always* knows about the ordering of these two locks, by briefly taking vcpu->mutex in kvm_vm_ioctl_create_vcpu() while kvm->lock is held. Signed-off-by: David Woodhouse Message-Id: <20230111180651.14394-3-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 13e88297f999..9c60384b5ae0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3954,6 +3954,13 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) } mutex_lock(&kvm->lock); + +#ifdef CONFIG_LOCKDEP + /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */ + mutex_lock(&vcpu->mutex); + mutex_unlock(&vcpu->mutex); +#endif + if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; goto unlock_vcpu_destroy; -- cgit From 310bc39546a435c83cc27a0eba878afac0d74714 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Jan 2023 18:06:51 +0000 Subject: KVM: x86/xen: Avoid deadlock by adding kvm->arch.xen.xen_lock leaf node lock In commit 14243b387137a ("KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery") the clever version of me left some helpful notes for those who would come after him: /* * For the irqfd workqueue, using the main kvm->lock mutex is * fine since this function is invoked from kvm_set_irq() with * no other lock held, no srcu. 
In future if it will be called * directly from a vCPU thread (e.g. on hypercall for an IPI) * then it may need to switch to using a leaf-node mutex for * serializing the shared_info mapping. */ mutex_lock(&kvm->lock); In commit 2fd6df2f2b47 ("KVM: x86/xen: intercept EVTCHNOP_send from guests") the other version of me ran straight past that comment without reading it, and introduced a potential deadlock by taking vcpu->mutex and kvm->lock in the wrong order. Solve this as originally suggested, by adding a leaf-node lock in the Xen state rather than using kvm->lock for it. Fixes: 2fd6df2f2b47 ("KVM: x86/xen: intercept EVTCHNOP_send from guests") Signed-off-by: David Woodhouse Message-Id: <20230111180651.14394-4-dwmw2@infradead.org> [Rebase, add docs. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/locking.rst | 2 +- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/xen.c | 67 +++++++++++++++++--------------------- 3 files changed, 32 insertions(+), 38 deletions(-) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 5ee017740d55..a0146793d197 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -39,7 +39,7 @@ For SRCU: On x86: -- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock +- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock and kvm->arch.xen.xen_lock - kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f35f1ff4427b..6aaae18f1854 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1111,6 +1111,7 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { + struct mutex xen_lock; u32 xen_version; bool long_mode; bool runstate_update_flag; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 651f9c5b873d..8fd41f5deae3 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -607,26 +607,26 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { r = -EINVAL; } else { - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.long_mode = !!data->u.long_mode; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; } break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); break; case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; else { - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.upcall_vector = data->u.vector; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; } break; @@ -636,9 +636,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_XEN_VERSION: - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.xen_version = data->u.xen_version; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; break; @@ -647,9 +647,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = -EOPNOTSUPP; break; } - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag; - mutex_unlock(&kvm->lock); 
+ mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; break; @@ -664,7 +664,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { int r = -ENOENT; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_ATTR_TYPE_LONG_MODE: @@ -703,7 +703,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return r; } @@ -711,7 +711,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int idx, r = -ENOENT; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.xen.xen_lock); idx = srcu_read_lock(&vcpu->kvm->srcu); switch (data->type) { @@ -939,7 +939,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) } srcu_read_unlock(&vcpu->kvm->srcu, idx); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } @@ -947,7 +947,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int r = -ENOENT; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: @@ -1030,7 +1030,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } @@ -1123,7 +1123,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) xhc->blob_size_32 || xhc->blob_size_64)) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); if (xhc->msr && !kvm->arch.xen_hvm_config.msr) static_branch_inc(&kvm_xen_enabled.key); @@ -1132,7 +1132,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return 0; } @@ -1675,15 +1675,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) mm_borrowed = true; } - /* - * For the irqfd workqueue, using the main kvm->lock mutex is - * fine since this function is invoked from kvm_set_irq() with - * no other lock held, no srcu. In future if it will be called - * directly from a vCPU thread (e.g. on hypercall for an IPI) - * then it may need to switch to using a leaf-node mutex for - * serializing the shared_info mapping. - */ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); /* * It is theoretically possible for the page to be unmapped @@ -1712,7 +1704,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); } while(!rc); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (mm_borrowed) kthread_unuse_mm(kvm->mm); @@ -1828,7 +1820,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, int ret; /* Protect writes to evtchnfd as well as the idr lookup. 
*/ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port); ret = -ENOENT; @@ -1859,7 +1851,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, } ret = 0; out_unlock: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return ret; } @@ -1922,10 +1914,10 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; } - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1, GFP_KERNEL); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (ret >= 0) return 0; @@ -1943,9 +1935,9 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) { struct evtchnfd *evtchnfd; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (!evtchnfd) return -ENOENT; @@ -1963,7 +1955,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm) int i; int n = 0; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); /* * Because synchronize_srcu() cannot be called inside the @@ -1975,7 +1967,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm) all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL); if (!all_evtchnfds) { - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return -ENOMEM; } @@ -1984,7 +1976,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm) all_evtchnfds[n++] = evtchnfd; idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port); } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); synchronize_srcu(&kvm->srcu); @@ -2086,6 +2078,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) void kvm_xen_init_vm(struct kvm *kvm) { + mutex_init(&kvm->arch.xen.xen_lock); idr_init(&kvm->arch.xen.evtchn_ports); kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN); } -- cgit
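As a closing illustration of the lockdep-ordering hint added to kvm_vm_ioctl_create_vcpu() above, here is a generic sketch with hypothetical locks (not the KVM code): briefly taking and releasing the inner lock while the outer one is held teaches lockdep the intended order, even on configurations and code paths that never nest the two at runtime.

	#include <linux/mutex.h>

	static DEFINE_MUTEX(outer_lock);	/* e.g. a per-VM lock */
	static DEFINE_MUTEX(inner_lock);	/* e.g. a per-object lock taken inside it */

	static void record_lock_order(void)
	{
		mutex_lock(&outer_lock);
	#ifdef CONFIG_LOCKDEP
		/*
		 * Nothing is protected here; the empty lock/unlock pair only
		 * records "outer_lock is taken outside inner_lock" so that any
		 * future inversion is reported even if real nesting is rare.
		 */
		mutex_lock(&inner_lock);
		mutex_unlock(&inner_lock);
	#endif
		/* ... normal setup under outer_lock ... */
		mutex_unlock(&outer_lock);
	}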