From c1aca3080e382886e2e58e809787441984a2f89b Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Tue, 18 Jan 2022 16:00:13 -0800 Subject: xfrm: Check if_id in xfrm_migrate This patch enables distinguishing SAs and SPs based on if_id during the xfrm_migrate flow. This ensures support for xfrm interfaces throughout the SA/SP lifecycle. When there are multiple existing SPs with the same direction, the same xfrm_selector and different endpoint addresses, xfrm_migrate might fail with ENODATA. Specifically, the code path for performing xfrm_migrate is: Stage 1: find policy to migrate with xfrm_migrate_policy_find(sel, dir, type, net) Stage 2: find and update state(s) with xfrm_migrate_state_find(mp, net) Stage 3: update endpoint address(es) of template(s) with xfrm_policy_migrate(pol, m, num_migrate) Currently "Stage 1" always returns the first xfrm_policy that matches, and "Stage 3" looks for the xfrm_tmpl that matches the old endpoint address. Thus if there are multiple xfrm_policy with the same selector, direction, type and net, "Stage 1" might return the wrong xfrm_policy and "Stage 3" will fail with ENODATA because it cannot find an xfrm_tmpl with the matching endpoint address. The fix is to allow userspace to pass an if_id and add if_id to the matching rule in Stage 1 and Stage 2, since if_id is a unique ID for xfrm_policy and xfrm_state. For compatibility, if_id will only be checked if the attribute is set. Tested with additions to Android's kernel unit test suite: https://android-review.googlesource.com/c/kernel/tests/+/1668886 Signed-off-by: Yan Yan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fdb41e8bb626..743dd1da506e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1681,14 +1681,15 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net); +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, u32 if_id); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport);
-- cgit From a6d95c5a628a09be129f25d5663a7e9db8261f51 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 26 Jan 2022 16:00:18 +0100 Subject: Revert "xfrm: xfrm_state_mtu should return at least 1280 for ipv6" This reverts commit b515d2637276a3810d6595e10ab02c13bfd0b63a. Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") in v5.14 breaks the TCP MSS calculation in ipsec transport mode, resulting in complete stalls of TCP connections. This happens when the (P)MTU is 1280 or slightly larger.
The desired formula for the MSS is:

    MSS = (MTU - ESP_overhead) - IP header - TCP header

However, the above commit clamps (MTU - ESP_overhead) to a minimum of 1280, turning the formula into:

    MSS = max(MTU - ESP_overhead, 1280) - IP header - TCP header

With the (P)MTU near 1280, the calculated MSS is too large and the resulting TCP packets never make it to the destination because they are over the actual PMTU. The above commit also causes suboptimal double fragmentation in xfrm tunnel mode, as described in https://lore.kernel.org/netdev/20210429202529.codhwpc7w6kbudug@dwarf.suse.cz/ The original problem the above commit was trying to fix is now fixed by commit 6596a0229541270fb8d38d989f91b78838e5e9da ("xfrm: fix MTU regression"). Signed-off-by: Jiri Bohac Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 743dd1da506e..76aa6f11a540 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1568,7 +1568,6 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x);
-- cgit From 4f774c4a65bf3987d1a95c966e884f38c8a942af Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 27 Jan 2022 19:25:53 -0800 Subject: cpufreq: Reintroduce ready() callback This effectively reverts commit 4bf8e582119e ("cpufreq: Remove ready() callback"), in order to reintroduce the ready callback. This is needed in order to be able to leave the thermal pressure interrupts in the Qualcomm CPUfreq driver disabled during initialization, so that they don't fire while related_cpus are still 0. Signed-off-by: Bjorn Andersson [ Viresh: Added the Chinese translation as well and updated commit msg ] Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1ab29e61b078..3522a272b74d 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -382,6 +382,9 @@ struct cpufreq_driver { int (*suspend)(struct cpufreq_policy *policy); int (*resume)(struct cpufreq_policy *policy); + /* Will be called after the driver is fully initialized */ + void (*ready)(struct cpufreq_policy *policy); + struct freq_attr **attr; /* platform specific boost support code */
-- cgit From 7c76ecd9c99b6e9a771d813ab1aa7fa428b3ade1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Feb 2022 16:14:32 +0200 Subject: xfrm: enforce validity of offload input flags struct xfrm_user_offload has a flags variable that receives user input, but the kernel didn't check whether only valid bits were provided. This caused a situation where unsanitized input was forwarded directly to the drivers. For example, the XFRM_OFFLOAD_IPV6 define was exposed and used by strongswan, but not implemented in the kernel at all. As a solution, check and sanitize the input flags, forwarding only XFRM_OFFLOAD_INBOUND to the drivers.
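As a rough sketch (not the verbatim upstream patch), the sanitization described above amounts to rejecting any user-supplied flag bits the kernel does not recognize before they can reach a driver; the mask below assumes only the two flags defined in the hunk that follows:

    /* Hedged sketch: fail early on unknown offload flag bits. */
    if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND))
            return -EINVAL;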
Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 4e29d7851890..65e13a099b1a 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -511,6 +511,12 @@ struct xfrm_user_offload { int ifindex; __u8 flags; }; +/* This flag was exposed without any kernel code that supporting it. * Unfortunately, strongswan has the code that uses sets this flag, * which makes impossible to reuse this bit. * * So leave it here to make sure that it won't be reused by mistake. */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2
-- cgit From a8abb0c3dc1e28454851a00f8b7333d9695d566c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 9 Feb 2022 12:33:23 +0530 Subject: bpf: Fix crash due to incorrect copy_map_value When both bpf_spin_lock and bpf_timer are present in a BPF map value, copy_map_value needs to skirt both objects when copying a value into and out of the map. However, the current code does not set both s_off and t_off in copy_map_value, which leads to a crash when e.g. bpf_spin_lock is placed in a map value with bpf_timer, as a bpf_map_update_elem call will be able to overwrite the timer object. Left unfixed, such an overwrite can produce the following splat: [root@(none) bpf]# ./test_progs -t timer_crash [ 15.930339] bpf_testmod: loading out-of-tree module taints kernel. [ 16.037849] ================================================================== [ 16.038458] BUG: KASAN: user-memory-access in __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.038944] Write of size 8 at addr 0000000000043ec0 by task test_progs/325 [ 16.039399] [ 16.039514] CPU: 0 PID: 325 Comm: test_progs Tainted: G OE 5.16.0+ #278 [ 16.039983] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ArchLinux 1.15.0-1 04/01/2014 [ 16.040485] Call Trace: [ 16.040645] [ 16.040805] dump_stack_lvl+0x59/0x73 [ 16.041069] ? __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.041427] kasan_report.cold+0x116/0x11b [ 16.041673] ? __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.042040] __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.042328] ? memcpy+0x39/0x60 [ 16.042552] ? pv_hash+0xd0/0xd0 [ 16.042785] ? lockdep_hardirqs_off+0x95/0xd0 [ 16.043079] __bpf_spin_lock_irqsave+0xdf/0xf0 [ 16.043366] ? bpf_get_current_comm+0x50/0x50 [ 16.043608] ? jhash+0x11a/0x270 [ 16.043848] bpf_timer_cancel+0x34/0xe0 [ 16.044119] bpf_prog_c4ea1c0f7449940d_sys_enter+0x7c/0x81 [ 16.044500] bpf_trampoline_6442477838_0+0x36/0x1000 [ 16.044836] __x64_sys_nanosleep+0x5/0x140 [ 16.045119] do_syscall_64+0x59/0x80 [ 16.045377] ? lock_is_held_type+0xe4/0x140 [ 16.045670] ? irqentry_exit_to_user_mode+0xa/0x40 [ 16.046001] ? mark_held_locks+0x24/0x90 [ 16.046287] ? asm_exc_page_fault+0x1e/0x30 [ 16.046569] ? asm_exc_page_fault+0x8/0x30 [ 16.046851] ?
lockdep_hardirqs_on+0x7e/0x100 [ 16.047137] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 16.047405] RIP: 0033:0x7f9e4831718d [ 16.047602] Code: b4 0c 00 0f 05 eb a9 66 0f 1f 44 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b3 6c 0c 00 f7 d8 64 89 01 48 [ 16.048764] RSP: 002b:00007fff488086b8 EFLAGS: 00000206 ORIG_RAX: 0000000000000023 [ 16.049275] RAX: ffffffffffffffda RBX: 00007f9e48683740 RCX: 00007f9e4831718d [ 16.049747] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00007fff488086d0 [ 16.050225] RBP: 00007fff488086f0 R08: 00007fff488085d7 R09: 00007f9e4cb594a0 [ 16.050648] R10: 0000000000000000 R11: 0000000000000206 R12: 00007f9e484cde30 [ 16.051124] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 16.051608] [ 16.051762] ================================================================== Fixes: 68134668c17f ("bpf: Add map side support for bpf timers.") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220209070324.1093182-2-memxor@gmail.com --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fa517ae604ad..31a83449808b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -224,7 +224,8 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) if (unlikely(map_value_has_spin_lock(map))) { s_off = map->spin_lock_off; s_sz = sizeof(struct bpf_spin_lock); - } else if (unlikely(map_value_has_timer(map))) { + } + if (unlikely(map_value_has_timer(map))) { t_off = map->timer_off; t_sz = sizeof(struct bpf_timer); }
-- cgit From 5eaed6eedbe9612f642ad2b880f961d1c6c8ec2b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 11 Feb 2022 11:49:53 -0800 Subject: bpf: Fix a bpf_timer initialization issue The patch in [1] intends to fix a bpf_timer related issue, but the fix caused the existing 'timer' selftest to fail with hangs or some random errors. After some debugging, I found an issue with check_and_init_map_value() in hashtab.c. More specifically, in hashtab.c we have code like:

    l_new = bpf_map_kmalloc_node(&htab->map, ...)
    check_and_init_map_value(&htab->map, l_new...)

Note that bpf_map_kmalloc_node() does not do initialization, so l_new contains random values. The function check_and_init_map_value() intends to zero the bpf_spin_lock and bpf_timer if they exist in the map. But I found that bpf_spin_lock is zero'ed while bpf_timer is not. With [1], later copy_map_value() skips copying of bpf_spin_lock and bpf_timer. The non-zero bpf_timer caused random failures for the 'timer' selftest. Without [1], for both the bpf_spin_lock and bpf_timer case, bpf_timer will be zero'ed, so the 'timer' selftest is okay. So why does check_and_init_map_value() zero bpf_spin_lock properly while leaving bpf_timer untouched? In the bpf uapi header, we have:

    struct bpf_spin_lock {
            __u32 val;
    };

    struct bpf_timer {
            __u64 :64;
            __u64 :64;
    } __attribute__((aligned(8)));

The initialization code:

    *(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
            (struct bpf_spin_lock){};
    *(struct bpf_timer *)(dst + map->timer_off) =
            (struct bpf_timer){};

It appears the compiler has no obligation to initialize anonymous fields.
For example, let us use clang with bpf target as below:

    $ cat t.c
    struct bpf_timer {
            unsigned long long :64;
    };
    struct bpf_timer2 {
            unsigned long long a;
    };
    void test(struct bpf_timer *t)
    {
            *t = (struct bpf_timer){};
    }
    void test2(struct bpf_timer2 *t)
    {
            *t = (struct bpf_timer2){};
    }
    $ clang -target bpf -O2 -c -g t.c
    $ llvm-objdump -d t.o
    ...
    0000000000000000 :
           0: 95 00 00 00 00 00 00 00 exit
    0000000000000008 :
           1: b7 02 00 00 00 00 00 00 r2 = 0
           2: 7b 21 00 00 00 00 00 00 *(u64 *)(r1 + 0) = r2
           3: 95 00 00 00 00 00 00 00 exit

gcc11.2 does not have the above issue. But from INTERNATIONAL STANDARD ISO/IEC 9899:201x Programming languages — C http://www.open-std.org/Jtc1/sc22/wg14/www/docs/n1547.pdf page 157: Except where explicitly stated otherwise, for the purposes of this subclause unnamed members of objects of structure and union type do not participate in initialization. Unnamed members of structure objects have indeterminate value even after initialization. To fix the problem, let us use memset for the bpf_timer case in check_and_init_map_value(). For consistency, memset is also used for the bpf_spin_lock case. [1] https://lore.kernel.org/bpf/20220209070324.1093182-2-memxor@gmail.com/ Fixes: 68134668c17f3 ("bpf: Add map side support for bpf timers.") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220211194953.3142152-1-yhs@fb.com --- include/linux/bpf.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 31a83449808b..d0ad379d1e62 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -209,11 +209,9 @@ static inline bool map_value_has_timer(const struct bpf_map *map) static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) - *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = - (struct bpf_spin_lock){}; + memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) - *(struct bpf_timer *)(dst + map->timer_off) = - (struct bpf_timer){}; + memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); } /* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */
-- cgit From ddbd89deb7d32b1fbb879f48d68fda1a8ac58e8e Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Fri, 11 Feb 2022 02:12:52 +0100 Subject: swiotlb: fix info leak with DMA_FROM_DEVICE The problem I'm addressing was discovered by the LTP test covering cve-2018-1000204. A short description of what happens follows: 1) The test case issues a command code 00 (TEST UNIT READY) via the SG_IO interface with: dxfer_len == 524288, dxfer_dir == SG_DXFER_FROM_DEV and a corresponding dxferp. The peculiar thing about this is that TUR is not reading from the device. 2) In sg_start_req() the invocation of blk_rq_map_user() effectively bounces the user-space buffer, as if the device were to transfer into it. Since commit a45b599ad808 ("scsi: sg: allocate with __GFP_ZERO in sg_build_indirect()") we make sure this first bounce buffer is allocated with GFP_ZERO. 3) For the rest of the story we keep ignoring that we have a TUR, so the device won't touch the buffer we prepare as if we had a DMA_FROM_DEVICE type of situation. My setup uses a virtio-scsi device and the buffer allocated by SG is mapped by the function virtqueue_add_split() which uses DMA_FROM_DEVICE for the "in" sgs (here scatter-gather and not scsi generics).
This mapping involves bouncing via the swiotlb (we need swiotlb to do virtio in protected guests like s390 Secure Execution, or AMD SEV). 4) When the SCSI TUR is done, we first copy back the content of the second (that is swiotlb) bounce buffer (which most likely contains some previous IO data), to the first bounce buffer, which contains all zeros. Then we copy back the content of the first bounce buffer to the user-space buffer. 5) The test case detects that the buffer, which it zero-initialized, ain't all zeros and fails. One can argue that this is a swiotlb problem, because without swiotlb we leak all zeros, and the swiotlb should be transparent in a sense that it does not affect the outcome (if all other participants are well behaved). Copying the content of the original buffer into the swiotlb buffer is the only way I can think of to make swiotlb transparent in such scenarios. So let's do just that if in doubt, but allow the driver to tell us that the whole mapped buffer is going to be overwritten, in which case we can preserve the old behavior and avoid the performance impact of the extra bounce. Signed-off-by: Halil Pasic Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index dca2b1355bb1..6150d11a607e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,6 +61,14 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * This is a hint to the DMA-mapping subsystem that the device is expected + * to overwrite the entire mapped size, thus the caller does not require any + * of the previous buffer contents to be preserved. This allows + * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. + */ +#define DMA_ATTR_OVERWRITE (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a
-- cgit From ba2689234be92024e5635d30fe744f4853ad97db Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 18 Nov 2021 13:59:46 +0000 Subject: arm64: entry: Add vectors that have the bhb mitigation sequences Some CPUs affected by Spectre-BHB need a sequence of branches, or a firmware call, to be run before any indirect branch. This needs to go in the vectors. No CPU needs both. While this could be patched in, it would run on all CPUs as there is a single set of vectors. If only one part of a big/little combination is affected, the unaffected CPUs would have to run the mitigation too. Create extra vectors that include the sequence. Subsequent patches will allow affected CPUs to select this set of vectors. Later patches will modify the loop count to match what the CPU requires.
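As a hedged illustration (the real per-CPU mitigation sequences live in the assembly vectors, which are not part of this hunk), a firmware-call-based mitigation slot could invoke the new SMCCC function ID defined in the diff below, assuming an SMCCC v1.1 conduit is available:

    #include <linux/arm-smccc.h>

    /* Illustrative only: issue the Spectre-BHB firmware workaround. */
    static void spectre_bhb_firmware_call(void)
    {
            arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_3, NULL);
    }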
Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- include/linux/arm-smccc.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 63ccb5252190..220c8c60e021 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -92,6 +92,11 @@ ARM_SMCCC_SMC_32, \ 0, 0x7fff) +#define ARM_SMCCC_ARCH_WORKAROUND_3 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + 0, 0x3fff) + #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ARM_SMCCC_SMC_32, \
-- cgit From 7a5428dcb7902700b830e912feee4e845df7c019 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Feb 2022 08:52:31 +0100 Subject: block: fix surprise removal for drivers calling blk_set_queue_dying Various block drivers call blk_set_queue_dying to mark a disk as dead due to surprise removal events, but since commit 8e141f9eb803 that doesn't work, given that the GD_DEAD flag needs to be set to stop I/O. Replace the driver calls to blk_set_queue_dying with a new (and properly documented) blk_mark_disk_dead API, and fold blk_set_queue_dying into the only remaining caller. Fixes: 8e141f9eb803 ("block: drain file system I/O on del_gendisk") Reported-by: Markus Blöchl Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20220217075231.1140-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f35aea98bc35..16b47035e4b0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -748,7 +748,8 @@ extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); -extern void blk_set_queue_dying(struct request_queue *); + +void blk_mark_disk_dead(struct gendisk *disk); #ifdef CONFIG_BLOCK /*
-- cgit From a1cdec57e03a1352e92fbbe7974039dda4efcec0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Feb 2022 09:05:02 -0800 Subject: net-timestamp: convert sk->sk_tskey to atomic_t UDP sendmsg() can be lockless, which causes all kinds of data races. This patch converts sk->sk_tskey to remove one of these races.
BUG: KCSAN: data-race in __ip_append_data / __ip_append_data read to 0xffff8881035d4b6c of 4 bytes by task 8877 on cpu 1: __ip_append_data+0x1c1/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881035d4b6c of 4 bytes by task 8880 on cpu 0: __ip_append_data+0x1d8/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000054d -> 0x0000054e Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 8880 Comm: syz-executor.5 Not tainted 5.17.0-rc2-syzkaller-00167-gdcb85f85fa6f-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 09c2d251b707 ("net-timestamp: add key to disambiguate concurrent datagrams") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index ff9b508d9c5f..50aecd28b355 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -507,7 +507,7 @@ struct sock { #endif u16 sk_tsflags; u8 sk_shutdown; - u32 sk_tskey; + atomic_t sk_tskey; atomic_t sk_zckey; u8 sk_clockid; @@ -2667,7 +2667,7 @@ static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) - *tskey = sk->sk_tskey++; + *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS;
-- cgit From 988f0a9045b0058a43ccee764a671dfab81e6d15 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 10 Nov 2021 12:59:52 +0200 Subject: soc: fsl: Replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot to dependency hell, especially when circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used.
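A hedged illustration of the pattern the hunks below apply (the prototype here is made up for the example): include only the headers whose definitions the header actually needs, and forward-declare types that are only used through pointers:

    #include <linux/types.h>        /* only what is really used, e.g. u32 */

    struct device_node;             /* forward declaration, no full header */

    /* hypothetical prototype: a pointer parameter needs no definition */
    int example_qe_probe(struct device_node *np, u32 flags);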
Signed-off-by: Andy Shevchenko Signed-off-by: Li Yang --- include/soc/fsl/dpaa2-fd.h | 3 ++- include/soc/fsl/qe/immap_qe.h | 3 ++- include/soc/fsl/qe/qe_tdm.h | 4 +++- include/soc/fsl/qe/ucc_fast.h | 2 +- include/soc/fsl/qe/ucc_slow.h | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/soc/fsl/dpaa2-fd.h b/include/soc/fsl/dpaa2-fd.h index 90ae8d191f1a..bae490cac0aa 100644 --- a/include/soc/fsl/dpaa2-fd.h +++ b/include/soc/fsl/dpaa2-fd.h @@ -7,7 +7,8 @@ #ifndef __FSL_DPAA2_FD_H #define __FSL_DPAA2_FD_H -#include +#include +#include /** * DOC: DPAA2 FD - Frame Descriptor APIs for DPAA2 diff --git a/include/soc/fsl/qe/immap_qe.h b/include/soc/fsl/qe/immap_qe.h index 7614fee532f1..edd601f53f5d 100644 --- a/include/soc/fsl/qe/immap_qe.h +++ b/include/soc/fsl/qe/immap_qe.h @@ -13,7 +13,8 @@ #define _ASM_POWERPC_IMMAP_QE_H #ifdef __KERNEL__ -#include +#include + #include #define QE_IMMAP_SIZE (1024 * 1024) /* 1MB from 1MB+IMMR */ diff --git a/include/soc/fsl/qe/qe_tdm.h b/include/soc/fsl/qe/qe_tdm.h index b6febe225071..43ea830cfe1f 100644 --- a/include/soc/fsl/qe/qe_tdm.h +++ b/include/soc/fsl/qe/qe_tdm.h @@ -10,8 +10,8 @@ #ifndef _QE_TDM_H_ #define _QE_TDM_H_ -#include #include +#include #include #include @@ -19,6 +19,8 @@ #include #include +struct device_node; + /* SI RAM entries */ #define SIR_LAST 0x0001 #define SIR_BYTE 0x0002 diff --git a/include/soc/fsl/qe/ucc_fast.h b/include/soc/fsl/qe/ucc_fast.h index 9696a5b9b5d1..ad60b87a3c69 100644 --- a/include/soc/fsl/qe/ucc_fast.h +++ b/include/soc/fsl/qe/ucc_fast.h @@ -10,7 +10,7 @@ #ifndef __UCC_FAST_H__ #define __UCC_FAST_H__ -#include +#include #include #include diff --git a/include/soc/fsl/qe/ucc_slow.h b/include/soc/fsl/qe/ucc_slow.h index 11a216e4e919..7548ce8a202d 100644 --- a/include/soc/fsl/qe/ucc_slow.h +++ b/include/soc/fsl/qe/ucc_slow.h @@ -11,7 +11,7 @@ #ifndef __UCC_SLOW_H__ #define __UCC_SLOW_H__ -#include +#include #include #include
-- cgit From b1e8206582f9d680cff7d04828708c8b6ab32957 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Feb 2022 10:16:57 +0100 Subject: sched: Fix yet more sched_fork() races Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") fixed a fork race vs cgroup, it opened up a race vs syscalls by not placing the task on the runqueue before it gets exposed through the pidhash. Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") tried to fix a single instance of this; instead, fix the whole class of issues, effectively reverting that commit.
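A hedged sketch of the ordering the split prototypes below imply for copy_process() (the surrounding steps are paraphrased, not the actual kernel/fork.c code):

    /* Sketch: scheduler/cgroup state must be fully set up before the
     * child becomes reachable, so racing syscalls never observe a
     * half-initialized task. */
    sched_cgroup_fork(p, kargs);    /* pin task_group before exposure */
    /* ... attach pid, task becomes visible via the pidhash ... */
    sched_post_fork(p);             /* remaining post-fork setup */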
Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tadeusz Struk Tested-by: Zhang Qiao Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net --- include/linux/sched/task.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index b9198a1b3a84..e84e54d1b490 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p, - struct kernel_clone_args *kargs); +extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void);
-- cgit From 5486f5bf790b5c664913076c3194b8f916a5c7ad Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Feb 2022 14:35:49 +0100 Subject: net: Force inlining of checksum functions in net/checksum.h All functions defined as static inline in net/checksum.h are meant to be inlined for performance reasons. But since commit ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") the compiler is allowed to uninline functions when it wants. Fair enough in the general case, but for tiny performance-critical checksum helpers that's counter-productive. The problem mainly arises when selecting CONFIG_CC_OPTIMISE_FOR_SIZE. Since those helpers are 'static inline' in header files, you suddenly find them duplicated many times in the resulting vmlinux. Here is a typical example when building powerpc pmac32_defconfig with CONFIG_CC_OPTIMISE_FOR_SIZE. csum_sub() appears 4 times: c04a23cc : c04a23cc: 7c 84 20 f8 not r4,r4 c04a23d0: 7c 63 20 14 addc r3,r3,r4 c04a23d4: 7c 63 01 94 addze r3,r3 c04a23d8: 4e 80 00 20 blr ... c04a2ce8: 4b ff f6 e5 bl c04a23cc ... c04a2d2c: 4b ff f6 a1 bl c04a23cc ... c04a2d54: 4b ff f6 79 bl c04a23cc ... c04a754c : c04a754c: 7c 84 20 f8 not r4,r4 c04a7550: 7c 63 20 14 addc r3,r3,r4 c04a7554: 7c 63 01 94 addze r3,r3 c04a7558: 4e 80 00 20 blr ... c04ac930: 4b ff ac 1d bl c04a754c ... c04ad264: 4b ff a2 e9 bl c04a754c ... c04e3b08 : c04e3b08: 7c 84 20 f8 not r4,r4 c04e3b0c: 7c 63 20 14 addc r3,r3,r4 c04e3b10: 7c 63 01 94 addze r3,r3 c04e3b14: 4e 80 00 20 blr ... c04e5788: 4b ff e3 81 bl c04e3b08 ... c04e65c8: 4b ff d5 41 bl c04e3b08 ... c0512d34 : c0512d34: 7c 84 20 f8 not r4,r4 c0512d38: 7c 63 20 14 addc r3,r3,r4 c0512d3c: 7c 63 01 94 addze r3,r3 c0512d40: 4e 80 00 20 blr ... c0512dfc: 4b ff ff 39 bl c0512d34 ... c05138bc: 4b ff f4 79 bl c0512d34 ... Restore the expected behaviour by using __always_inline for all functions defined in net/checksum.h. vmlinux size is even reduced by 256 bytes with this patch:

    text     data     bss    dec     hex    filename
    6980022  2515362  194384 9689768 93daa8 vmlinux.before
    6979862  2515266  194384 9689512 93d9a8 vmlinux.now

Fixes: ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") Cc: Masahiro Yamada Cc: Nick Desaulniers Cc: Andrew Morton Signed-off-by: Christophe Leroy Signed-off-by: David S.
Miller --- include/net/checksum.h | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 5218041e5c8f..02d0c2d01014 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -22,7 +22,7 @@ #include #ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER -static inline +static __always_inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len) { @@ -33,7 +33,7 @@ __wsum csum_and_copy_from_user (const void __user *src, void *dst, #endif #ifndef HAVE_CSUM_COPY_USER -static __inline__ __wsum csum_and_copy_to_user +static __always_inline __wsum csum_and_copy_to_user (const void *src, void __user *dst, int len) { __wsum sum = csum_partial(src, len, ~0U); @@ -45,7 +45,7 @@ static __inline__ __wsum csum_and_copy_to_user #endif #ifndef _HAVE_ARCH_CSUM_AND_COPY -static inline __wsum +static __always_inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) { memcpy(dst, src, len); @@ -54,7 +54,7 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len) #endif #ifndef HAVE_ARCH_CSUM_ADD -static inline __wsum csum_add(__wsum csum, __wsum addend) +static __always_inline __wsum csum_add(__wsum csum, __wsum addend) { u32 res = (__force u32)csum; res += (__force u32)addend; @@ -62,12 +62,12 @@ static inline __wsum csum_add(__wsum csum, __wsum addend) } #endif -static inline __wsum csum_sub(__wsum csum, __wsum addend) +static __always_inline __wsum csum_sub(__wsum csum, __wsum addend) { return csum_add(csum, ~addend); } -static inline __sum16 csum16_add(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_add(__sum16 csum, __be16 addend) { u16 res = (__force u16)csum; @@ -75,12 +75,12 @@ static inline __sum16 csum16_add(__sum16 csum, __be16 addend) return (__force __sum16)(res + (res < (__force u16)addend)); } -static inline __sum16 csum16_sub(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend) { return csum16_add(csum, ~addend); } -static inline __wsum csum_shift(__wsum sum, int offset) +static __always_inline __wsum csum_shift(__wsum sum, int offset) { /* rotate sum to align it with a 16b boundary */ if (offset & 1) @@ -88,42 +88,43 @@ static inline __wsum csum_shift(__wsum sum, int offset) return sum; } -static inline __wsum +static __always_inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) { return csum_add(csum, csum_shift(csum2, offset)); } -static inline __wsum +static __always_inline __wsum csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) { return csum_block_add(csum, csum2, offset); } -static inline __wsum +static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); } -static inline __wsum csum_unfold(__sum16 n) +static __always_inline __wsum csum_unfold(__sum16 n) { return (__force __wsum)n; } -static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) +static __always_inline +__wsum csum_partial_ext(const void *buff, int len, __wsum sum) { return csum_partial(buff, len, sum); } #define CSUM_MANGLED_0 ((__force __sum16)0xffff) -static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) { *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); } -static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) +static __always_inline void 
csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); @@ -136,7 +137,7 @@ static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) * m : old value of a 16bit field * m' : new value of a 16bit field */ -static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) +static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) { *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } @@ -150,16 +151,16 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr); -static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, - __be16 from, __be16 to, - bool pseudohdr) +static __always_inline +void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, + __be16 from, __be16 to, bool pseudohdr) { inet_proto_csum_replace4(sum, skb, (__force __be32)from, (__force __be32)to, pseudohdr); } -static inline __wsum remcsum_adjust(void *ptr, __wsum csum, - int start, int offset) +static __always_inline __wsum remcsum_adjust(void *ptr, __wsum csum, + int start, int offset) { __sum16 *psum = (__sum16 *)(ptr + offset); __wsum delta; @@ -175,12 +176,12 @@ static inline __wsum remcsum_adjust(void *ptr, __wsum csum, return delta; } -static inline void remcsum_unadjust(__sum16 *psum, __wsum delta) +static __always_inline void remcsum_unadjust(__sum16 *psum, __wsum delta) { *psum = csum_fold(csum_sub(delta, (__force __wsum)*psum)); } -static inline __wsum wsum_negate(__wsum val) +static __always_inline __wsum wsum_negate(__wsum val) { return (__force __wsum)-((__force u32)val); }
-- cgit From b1a5983f56e371046dcf164f90bfaf704d2b89f6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 17 Feb 2022 23:41:20 +0100 Subject: netfilter: nf_tables_offload: incorrect flow offload action array size The immediate verdict expression needs to allocate one slot in the flow offload action array; however, the immediate data expression does not. The fwd and dup expressions need to allocate one slot as well, but this is missing. Add a new offload_action interface to report whether an expression needs to allocate one slot in the flow offload action array.
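A hedged sketch of what the new callback can look like for the immediate expression, which only needs an action slot when it holds a verdict (close to, but not guaranteed to be, the exact upstream implementation):

    static bool nft_immediate_offload_action(const struct nft_expr *expr)
    {
            const struct nft_immediate_expr *priv = nft_expr_priv(expr);

            /* a verdict consumes a flow action slot; plain data does not */
            return priv->dreg == NFT_REG_VERDICT;
    }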
Fixes: be2861dc36d7 ("netfilter: nft_{fwd,dup}_netdev: add offload support") Reported-and-tested-by: Nick Gregory Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/net/netfilter/nf_tables_offload.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index eaf55da9a205..c4c0861deac1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -905,9 +905,9 @@ struct nft_expr_ops { int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); + bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); - u32 offload_flags; const struct nft_expr_type *type; void *data; }; diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index f9d95ff82df8..797147843958 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -67,8 +67,6 @@ struct nft_flow_rule { struct flow_rule *rule; }; -#define NFT_OFFLOAD_F_ACTION (1 << 0) - void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type);
-- cgit From 44a3918c8245ab10c6c9719dd12e7a8d291980d8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Feb 2022 11:49:08 -0800 Subject: x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting With unprivileged eBPF enabled, eIBRS (without retpoline) is vulnerable to Spectre v2 BHB-based attacks. When both are enabled, print a warning message and report it in the 'spectre_v2' sysfs vulnerabilities file. Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner --- include/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fa517ae604ad..1f56806d8eb9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1793,6 +1793,11 @@ struct bpf_core_ctx { int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn); +static inline bool unprivileged_ebpf_enabled(void) +{ + return !sysctl_unprivileged_bpf_disabled; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2012,6 +2017,12 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, { return NULL; } + +static inline bool unprivileged_ebpf_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
-- cgit From 93dd04ab0b2b32ae6e70284afc764c577156658e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 18 Feb 2022 14:13:58 +0100 Subject: slab: remove __alloc_size attribute from __kmalloc_track_caller Commit c37495d6254c ("slab: add __alloc_size attributes for better bounds checking") added __alloc_size attributes to a bunch of kmalloc function prototypes. Unfortunately the change to __kmalloc_track_caller seems to cause clang to generate broken code, and the first time this is called when booting, the box will crash. While the compiler problems are being investigated and worked on [1], let's just drop the attribute to solve the issue now. Once they are resolved, the attribute can be added back.
[1] https://github.com/ClangBuiltLinux/linux/issues/1599 Fixes: c37495d6254c ("slab: add __alloc_size attributes for better bounds checking") Cc: stable Cc: Kees Cook Cc: Daniel Micay Cc: Nick Desaulniers Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Andrew Morton Cc: Vlastimil Babka Cc: Nathan Chancellor Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Cc: llvm@lists.linux.dev Signed-off-by: Greg Kroah-Hartman Acked-by: Nick Desaulniers Acked-by: David Rientjes Acked-by: Kees Cook Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/r/20220218131358.3032912-1-gregkh@linuxfoundation.org --- include/linux/slab.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 37bde99b74af..5b6193fd8bd9 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -660,8 +660,7 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag * allocator where we care about the real place the memory allocation * request comes from. */ -extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) - __alloc_size(1); +extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_)
-- cgit From c086df4902573e2f06c6a2a83452c13a8bc603f5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Jan 2022 18:52:52 -0500 Subject: fuse: move FUSE_SUPER_MAGIC definition to magic.h ...to help userland apps that need to identify FUSE mounts. Signed-off-by: Jeff Layton Signed-off-by: Miklos Szeredi --- include/uapi/linux/magic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 0425cd79af9a..f724129c0425 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -36,6 +36,7 @@ #define EFIVARFS_MAGIC 0xde5e81e4 #define HOSTFS_SUPER_MAGIC 0x00c0ffee #define OVERLAYFS_SUPER_MAGIC 0x794c7630 +#define FUSE_SUPER_MAGIC 0x65735546 #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */
-- cgit From f6c052afe6f802d87c74153b7a57c43b2e9faf07 Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Sun, 20 Feb 2022 15:14:31 +0000 Subject: nvmem: core: Fix a conflict between MTD and NVMEM on wp-gpios property The wp-gpios property can be used on NVMEM nodes, and the same property can also be used on MTD NAND nodes. If the wp-gpios property is defined at the NAND node level, the GPIO management is done at the NAND driver level. Write protect is disabled when the driver is probed or resumed and is enabled when the driver is released or suspended. When no partitions are defined in the NAND DT node, the NAND DT node will be passed to the NVMEM framework. If the wp-gpios property is defined in this node, the GPIO resource is taken twice and the NAND controller driver fails to probe. It would be possible to set config->wp_gpio at the MTD level before calling the nvmem_register function, but the NVMEM framework would then toggle this GPIO on each write, while this GPIO should only be controlled by the NAND-level driver to ensure that Write Protect has not been enabled. A way to fix this conflict is to add a new boolean flag named ignore_wp in nvmem_config. If ignore_wp is set, the GPIO resource will be managed by the provider.
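A hedged sketch of how a provider such as an MTD/NAND driver might use the new flag (the callback name and field values are illustrative, not taken from a specific driver):

    struct nvmem_config config = {
            .dev       = dev,
            .reg_read  = example_reg_read, /* hypothetical callback */
            .ignore_wp = true, /* WP GPIO stays under NAND driver control */
    };

    nvmem = nvmem_register(&config);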
Fixes: 2a127da461a9 ("nvmem: add support for the write-protect pin") Cc: stable@vger.kernel.org Signed-off-by: Christophe Kerello Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220220151432.16605-2-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-provider.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 98efb7b5660d..c9a3ac9efeaa 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -70,7 +70,8 @@ struct nvmem_keepout { * @word_size: Minimum read/write access granularity. * @stride: Minimum read/write access stride. * @priv: User context passed to read/write callbacks. - * @wp-gpio: Write protect pin + * @wp-gpio: Write protect pin + * @ignore_wp: Write Protect pin is managed by the provider. * * Note: A default "nvmem" name will be assigned to the device if * no name is specified in its configuration. In such case "" is @@ -92,6 +93,7 @@ struct nvmem_config { enum nvmem_type type; bool read_only; bool root_only; + bool ignore_wp; struct device_node *of_node; bool no_of_node; nvmem_reg_read_t reg_read;
-- cgit From 93b71801a8274cd9511557faf04365a5de487197 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 22 Feb 2022 09:06:54 -0500 Subject: KVM: PPC: reserve capability 210 for KVM_CAP_PPC_AIL_MODE_3 Add KVM_CAP_PPC_AIL_MODE_3 to advertise the capability to set the AIL resource mode to 3 with the H_SET_MODE hypercall. This capability differs between processor types and KVM types (PR, HV, Nested HV), and affects guest-visible behaviour. QEMU will implement a cap-ail-mode-3 to control this behaviour[1], and use the KVM CAP if available to determine KVM support[2]. Reviewed-by: Fabiano Rosas Signed-off-by: Nicholas Piggin Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5191b57e1562..507ee1f2aa96 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1134,6 +1134,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_PPC_AIL_MODE_3 210 #ifdef KVM_CAP_IRQ_ROUTING
-- cgit From c2700d2886a87f83f31e0a301de1d2350b52c79b Mon Sep 17 00:00:00 2001 From: Varun Prakash Date: Sat, 22 Jan 2022 22:27:44 +0530 Subject: nvme-tcp: send H2CData PDUs based on MAXH2CDATA As per the NVMe/TCP specification (revision 1.0a, section 3.6.2.3), Maximum Host to Controller Data length (MAXH2CDATA): Specifies the maximum number of PDU-Data bytes per H2CData PDU in bytes. This value is a multiple of dwords and should be no less than 4,096. Current code sets the H2CData PDU data_length to r2t_length; it does not check the MAXH2CDATA value. Fix this by setting the H2CData PDU data_length to min(req->h2cdata_left, queue->maxh2cdata). Also validate the MAXH2CDATA value returned by the target in the ICResp PDU: if it is not a multiple of dwords or if it is less than 4096, return -EINVAL from nvme_tcp_init_connection().
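A hedged sketch of the two changes just described (abbreviated from the driver flow; not the verbatim patch):

    /* 1) Validate MAXH2CDATA from the ICResp PDU during connect. */
    maxh2cdata = le32_to_cpu(icresp->maxdata);
    if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA))
            return -EINVAL;
    queue->maxh2cdata = maxh2cdata;

    /* 2) Cap each H2CData PDU when answering an R2T. */
    data_length = min(req->h2cdata_left, queue->maxh2cdata);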
Signed-off-by: Varun Prakash Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/nvme-tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h index 959e0bd9a913..75470159a194 100644 --- a/include/linux/nvme-tcp.h +++ b/include/linux/nvme-tcp.h @@ -12,6 +12,7 @@ #define NVME_TCP_DISC_PORT 8009 #define NVME_TCP_ADMIN_CCSZ SZ_8K #define NVME_TCP_DIGEST_LENGTH 4 +#define NVME_TCP_MIN_MAXH2CDATA 4096 enum nvme_tcp_pfv { NVME_TCP_PFV_1_0 = 0x0,
-- cgit From d9b5ae5c1b241b91480aa30408be12fe91af834a Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 23 Feb 2022 18:34:16 +0200 Subject: openvswitch: Fix setting ipv6 fields causing hw csum failure Ipv6 ttl, label and tos fields are modified without first pulling/pushing the ipv6 header, which would have updated the hw csum (if available). This might cause a hw csum validation failure when sending the packet to the stack, as can be seen in the trace below. Fix this by updating skb->csum if available. Trace resulting from an ipv6 ttl decrement and then sending the packet to conntrack [actions: set(ipv6(hlimit=63)),ct(zone=99)]: [295241.900063] s_pf0vf2: hw csum failure [295241.923191] Call Trace: [295241.925728] [295241.927836] dump_stack+0x5c/0x80 [295241.931240] __skb_checksum_complete+0xac/0xc0 [295241.935778] nf_conntrack_tcp_packet+0x398/0xba0 [nf_conntrack] [295241.953030] nf_conntrack_in+0x498/0x5e0 [nf_conntrack] [295241.958344] __ovs_ct_lookup+0xac/0x860 [openvswitch] [295241.968532] ovs_ct_execute+0x4a7/0x7c0 [openvswitch] [295241.979167] do_execute_actions+0x54a/0xaa0 [openvswitch] [295242.001482] ovs_execute_actions+0x48/0x100 [openvswitch] [295242.006966] ovs_dp_process_packet+0x96/0x1d0 [openvswitch] [295242.012626] ovs_vport_receive+0x6c/0xc0 [openvswitch] [295242.028763] netdev_frame_hook+0xc0/0x180 [openvswitch] [295242.034074] __netif_receive_skb_core+0x2ca/0xcb0 [295242.047498] netif_receive_skb_internal+0x3e/0xc0 [295242.052291] napi_gro_receive+0xba/0xe0 [295242.056231] mlx5e_handle_rx_cqe_mpwrq_rep+0x12b/0x250 [mlx5_core] [295242.062513] mlx5e_poll_rx_cq+0xa0f/0xa30 [mlx5_core] [295242.067669] mlx5e_napi_poll+0xe1/0x6b0 [mlx5_core] [295242.077958] net_rx_action+0x149/0x3b0 [295242.086762] __do_softirq+0xd7/0x2d6 [295242.090427] irq_exit+0xf7/0x100 [295242.093748] do_IRQ+0x7f/0xd0 [295242.096806] common_interrupt+0xf/0xf [295242.100559] [295242.102750] RIP: 0033:0x7f9022e88cbd [295242.125246] RSP: 002b:00007f9022282b20 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffda [295242.132900] RAX: 0000000000000005 RBX: 0000000000000010 RCX: 0000000000000000 [295242.140120] RDX: 00007f9022282ba8 RSI: 00007f9022282a30 RDI: 00007f9014005c30 [295242.147337] RBP: 00007f9014014d60 R08: 0000000000000020 R09: 00007f90254a8340 [295242.154557] R10: 00007f9022282a28 R11: 0000000000000246 R12: 0000000000000000 [295242.161775] R13: 00007f902308c000 R14: 000000000000002b R15: 00007f9022b71f40 Fixes: 3fdbd1ce11e5 ("openvswitch: add ipv6 'set' action") Signed-off-by: Paul Blakey Link: https://lore.kernel.org/r/20220223163416.24096-1-paulb@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/checksum.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 02d0c2d01014..79c67f14c448 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -142,6 +142,11 @@ static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) { *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } +static
inline void csum_replace(__wsum *csum, __wsum old, __wsum new) +{ + *csum = csum_add(csum_sub(*csum, old), new); +} + struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr);
-- cgit From 29fb608396d6a62c1b85acc421ad7a4399085b9f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 14 Feb 2022 17:59:38 -0800 Subject: Bluetooth: Fix bt_skb_sendmmsg not allocating partial chunks Since bt_skb_sendmmsg can be used with the likes of SOCK_STREAM, it shall return the partial chunks it could allocate instead of freeing everything, as otherwise it can cause problems like below. Fixes: 81be03e026dc ("Bluetooth: RFCOMM: Replace use of memcpy_from_msg with bt_skb_sendmmsg") Reported-by: Paul Menzel Link: https://lore.kernel.org/r/d7206e12-1b99-c3be-84f4-df22af427ef5@molgen.mpg.de BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215594 Signed-off-by: Luiz Augusto von Dentz Tested-by: Paul Menzel (Nokia N9 (MeeGo/Harmattan)) Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 4b3d0b16c185..a647e5fabdbd 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -506,8 +506,7 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk, tmp = bt_skb_sendmsg(sk, msg, len, mtu, headroom, tailroom); if (IS_ERR(tmp)) { - kfree_skb(skb); - return tmp; + return skb; } len -= tmp->len;
-- cgit From a56a1138cbd85e4d565356199d60e1cb94e5a77a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 17 Feb 2022 13:10:38 -0800 Subject: Bluetooth: hci_sync: Fix not using conn_timeout When using hci_le_create_conn_sync it shall wait for the conn_timeout, since the connection complete may take longer than just 2 seconds. Also fix the masking of HCI_EV_LE_ENHANCED_CONN_COMPLETE and HCI_EV_LE_CONN_COMPLETE so that they are never both set and we can predict which one the controller will use in case of HCI_OP_LE_CREATE_CONN. Fixes: 6cd29ec6ae5e3 ("Bluetooth: hci_sync: Wait for proper events when connecting LE") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 586f69d084a2..e336e9c1dda4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1489,6 +1489,14 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) +/* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 1789: + * + * C24: Mandatory if the LE Controller supports Connection State and either + * LE Feature (LL Privacy) or LE Feature (Extended Advertising) is supported + */ +#define use_enhanced_conn_complete(dev) (ll_privacy_capable(dev) || \ + ext_adv_capable(dev)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01
-- cgit From bc82c38a6933aab308387d4aca47e0a05de7b553 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 11 Feb 2022 08:10:18 +0100 Subject: tracing: Uninline trace_trigger_soft_disabled() partly On a powerpc32 build with CONFIG_CC_OPTIMISE_FOR_SIZE, the inline keyword is not honored and trace_trigger_soft_disabled() appears approx 50 times in vmlinux.
Adding -Winline to the build, the following message appears: ./include/linux/trace_events.h:712:1: error: inlining failed in call to 'trace_trigger_soft_disabled': call is unlikely and code size would grow [-Werror=inline] That function is rather big for an inlined function: c003df60 : c003df60: 94 21 ff f0 stwu r1,-16(r1) c003df64: 7c 08 02 a6 mflr r0 c003df68: 90 01 00 14 stw r0,20(r1) c003df6c: bf c1 00 08 stmw r30,8(r1) c003df70: 83 e3 00 24 lwz r31,36(r3) c003df74: 73 e9 01 00 andi. r9,r31,256 c003df78: 41 82 00 10 beq c003df88 c003df7c: 38 60 00 00 li r3,0 c003df80: 39 61 00 10 addi r11,r1,16 c003df84: 4b fd 60 ac b c0014030 <_rest32gpr_30_x> c003df88: 73 e9 00 80 andi. r9,r31,128 c003df8c: 7c 7e 1b 78 mr r30,r3 c003df90: 41 a2 00 14 beq c003dfa4 c003df94: 38 c0 00 00 li r6,0 c003df98: 38 a0 00 00 li r5,0 c003df9c: 38 80 00 00 li r4,0 c003dfa0: 48 05 c5 f1 bl c009a590 c003dfa4: 73 e9 00 40 andi. r9,r31,64 c003dfa8: 40 82 00 28 bne c003dfd0 c003dfac: 73 ff 02 00 andi. r31,r31,512 c003dfb0: 41 82 ff cc beq c003df7c c003dfb4: 80 01 00 14 lwz r0,20(r1) c003dfb8: 83 e1 00 0c lwz r31,12(r1) c003dfbc: 7f c3 f3 78 mr r3,r30 c003dfc0: 83 c1 00 08 lwz r30,8(r1) c003dfc4: 7c 08 03 a6 mtlr r0 c003dfc8: 38 21 00 10 addi r1,r1,16 c003dfcc: 48 05 6f 6c b c0094f38 c003dfd0: 38 60 00 01 li r3,1 c003dfd4: 4b ff ff ac b c003df80 However, it is located in a hot path, so inlining it is important. But forcing inlining of the entire function by using __always_inline leads to increasing the text size by approx 20 kbytes. Instead, split the function in two parts, one part with the likely fast path, flagged __always_inline, and a second part out of line. With this change, on a powerpc32 with CONFIG_CC_OPTIMISE_FOR_SIZE vmlinux text increases by only 1.4 kbytes, which is partly compensated by a decrease of vmlinux data by 7 kbytes. On ppc64_defconfig, which has CONFIG_CC_OPTIMISE_FOR_SPEED, this change reduces vmlinux text by more than 30 kbytes. Link: https://lkml.kernel.org/r/69ce0986a52d026d381d612801d978aa4f977460.1644563295.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 70c069aef02c..dcea51fb60e2 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -699,6 +699,8 @@ event_triggers_post_call(struct trace_event_file *file, bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); +bool __trace_trigger_soft_disabled(struct trace_event_file *file); + /** * trace_trigger_soft_disabled - do triggers and test if soft disabled * @file: The file pointer of the event to test * @@ -708,20 +710,20 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); * triggers that require testing the fields, it will return true, * otherwise false.
*/ -static inline bool +static __always_inline bool trace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; - if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { - if (eflags & EVENT_FILE_FL_TRIGGER_MODE) - event_triggers_call(file, NULL, NULL, NULL); - if (eflags & EVENT_FILE_FL_SOFT_DISABLED) - return true; - if (eflags & EVENT_FILE_FL_PID_FILTER) - return trace_event_ignore_this_pid(file); - } - return false; + if (likely(!(eflags & (EVENT_FILE_FL_TRIGGER_MODE | + EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (likely(eflags & EVENT_FILE_FL_TRIGGER_COND)) + return false; + + return __trace_trigger_soft_disabled(file); } #ifdef CONFIG_BPF_EVENTS
-- cgit From 17a8f31bba7bac8cce4bd12bab50697da96e7710 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 28 Feb 2022 04:18:05 +0100 Subject: netfilter: egress: silence egress hook lockdep splats Netfilter assumes it's called with rcu_read_lock held, but in the egress hook case it may be called with the BH read lock. This triggers a lockdep splat. In order to avoid changing all rcu_dereference() calls to rcu_dereference_check(..., rcu_read_lock_bh_held()), wrap nf_hook_slow with a read lock/unlock pair. Reported-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_netdev.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index b4dd96e4dc8d..e6487a691136 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -101,7 +101,11 @@ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, nf_hook_state_init(&state, NF_NETDEV_EGRESS, NFPROTO_NETDEV, dev, NULL, NULL, dev_net(dev), NULL); + + /* nf assumes rcu_read_lock, not just read_lock_bh */ + rcu_read_lock(); ret = nf_hook_slow(skb, &state, e, 0); + rcu_read_unlock(); if (ret == 1) { return skb;
-- cgit From 50bb467c9e76743fbc8441d29113cdad62dbc4fe Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 18 Feb 2022 09:38:58 +0000 Subject: rfkill: define rfill_soft_blocked() if !RFKILL If CONFIG_RFKILL is not set, the Intel WiFi driver will not build the iw_mvm driver part due to the missing rfill_soft_blocked() call. Adding an inline declaration of rfill_soft_blocked() if CONFIG_RFKILL=n fixes the following error: drivers/net/wireless/intel/iwlwifi/mvm/mvm.h: In function 'iwl_mvm_mei_set_sw_rfkill_state': drivers/net/wireless/intel/iwlwifi/mvm/mvm.h:2215:38: error: implicit declaration of function 'rfkill_soft_blocked'; did you mean 'rfkill_blocked'? [-Werror=implicit-function-declaration] 2215 | mvm->hw_registered ?
rfkill_soft_blocked(mvm->hw->wiphy->rfkill) : false; | ^~~~~~~~~~~~~~~~~~~ | rfkill_blocked Signed-off-by: Ben Dooks Reported-by: Neill Whillans Fixes: 5bc9a9dd7535 ("rfkill: allow to get the software rfkill state") Link: https://lore.kernel.org/r/20220218093858.1245677-1-ben.dooks@codethink.co.uk Signed-off-by: Johannes Berg --- include/linux/rfkill.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index c35f3962dc4f..373003ace639 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -308,6 +308,11 @@ static inline bool rfkill_blocked(struct rfkill *rfkill) return false; } +static inline bool rfkill_soft_blocked(struct rfkill *rfkill) +{ + return false; +} + static inline enum rfkill_type rfkill_find_type(const char *name) { return RFKILL_TYPE_ALL; -- cgit From c3873070247d9e3c7a6b0cf9bf9b45e8018427b1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 28 Feb 2022 06:22:22 +0100 Subject: netfilter: nf_queue: fix possible use-after-free Eric Dumazet says: The sock_hold() side seems suspect, because there is no guarantee that sk_refcnt is not already 0. On failure, we cannot queue the packet and need to indicate an error. The packet will be dropped by the caller. v2: split skb prefetch hunk into separate change Fixes: 271b72c7fa82c ("udp: RCU handling for Unicast packets.") Reported-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: Florian Westphal --- include/net/netfilter/nf_queue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 9eed51e920e8..980daa6e1e3a 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -37,7 +37,7 @@ void nf_register_queue_handler(const struct nf_queue_handler *qh); void nf_unregister_queue_handler(void); void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); -void nf_queue_entry_get_refs(struct nf_queue_entry *entry); +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) -- cgit From db6140e5e35a48405e669353bd54042c1d4c3841 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Mon, 28 Feb 2022 11:23:49 +0200 Subject: net/sched: act_ct: Fix flow table lookup failure with no originating ifindex After the cited commit optimized hw insertion, flow table entries are populated with ifindex information which was intended to only be used for HW offload. This tuple ifindex is hashed in the flow table key, so it must be filled for lookup to be successful. But the tuple ifindex is only relevant for the netfilter flowtables (nft), so it's not filled in act_ct flow table lookup, resulting in lookup failure, and thus no SW offload and no offload teardown for TCP connection FIN/RST packets. To fix this, add a new tc ifindex field to the tuple, which will only be used for offloading, not for lookup, as it will not be part of the tuple hash; a sketch of the intended use follows below.
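For illustration only (not code from the patch): because the new field sits outside the hashed part of the tuple shown in the hunk below, an offload path could fill it without perturbing software lookup. Field names follow that hunk.

    #include <net/netfilter/nf_flow_table.h>

    /* Sketch: record the originating ifindex for HW offload only; the
     * field is not covered by the tuple hash, so lookups stay intact. */
    static void flow_offload_fill_tc_ifindex(struct flow_offload_tuple *tuple,
                                             int ifindex)
    {
            tuple->xmit_type = FLOW_OFFLOAD_XMIT_TC;
            tuple->tc.iifidx = ifindex;
    }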
Fixes: 9795ded7f924 ("net/sched: act_ct: Fill offloading tuple iifidx") Signed-off-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a3647fadf1cc..bd59e950f4d6 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -96,6 +96,7 @@ enum flow_offload_xmit_type { FLOW_OFFLOAD_XMIT_NEIGH, FLOW_OFFLOAD_XMIT_XFRM, FLOW_OFFLOAD_XMIT_DIRECT, + FLOW_OFFLOAD_XMIT_TC, }; #define NF_FLOW_TABLE_ENCAP_MAX 2 @@ -127,7 +128,7 @@ struct flow_offload_tuple { struct { } __hash; u8 dir:2, - xmit_type:2, + xmit_type:3, encap_num:2, in_vlan_ingress:2; u16 mtu; @@ -142,6 +143,9 @@ struct flow_offload_tuple { u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; + struct { + u32 iifidx; + } tc; }; }; -- cgit From 2d3916f3189172d5c69d33065c3c21119fe539fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Mar 2022 09:37:28 -0800 Subject: ipv6: fix skb drops in igmp6_event_query() and igmp6_event_report() While investigating why a synchronize_net() was recently added to ipv6_mc_down(), I found that igmp6_event_query() and igmp6_event_report() might drop skbs in some cases. Discussion about removing synchronize_net() from ipv6_mc_down() will happen in a different thread. Fixes: f185de28d9ae ("mld: add new workqueues for process mld events") Signed-off-by: Eric Dumazet Cc: Taehee Yoo Cc: Cong Wang Cc: David Ahern Link: https://lore.kernel.org/r/20220303173728.937869-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ndisc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 53cb8de0e589..47ffb360ddfa 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -475,9 +475,9 @@ int igmp6_late_init(void); void igmp6_cleanup(void); void igmp6_late_cleanup(void); -int igmp6_event_query(struct sk_buff *skb); +void igmp6_event_query(struct sk_buff *skb); -int igmp6_event_report(struct sk_buff *skb); +void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL -- cgit From bfa26ba343c727e055223be04e08f2ebdd43c293 Mon Sep 17 00:00:00 2001 From: William Mahon Date: Thu, 3 Mar 2022 18:23:42 -0800 Subject: HID: add mapping for KEY_DICTATE Numerous keyboards are adding dictate keys, which allow text messages to be dictated via a microphone. This patch adds a new key definition KEY_DICTATE and maps the 0x0c/0x0d8 usage code to this new keycode. Additionally hid-debug is adjusted to recognize this new usage code as well.
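For context, a consumer-page (0x0c) usage of this kind is typically mapped in the big switch in drivers/hid/hid-input.c; a sketch of the shape such a mapping takes (map_key_clear() is that file's local helper, and the surrounding switch statement is elided here):

    case 0x0d8:  /* HUTRR99: Start or Stop Voice Dictation Session */
            map_key_clear(KEY_DICTATE);
            break;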
Signed-off-by: William Mahon Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220303021501.1.I5dbf50eb1a7a6734ee727bda4a8573358c6d3ec0@changeid Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 225ec87d4f22..4db5d41848e4 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -612,6 +612,7 @@ #define KEY_ASSISTANT 0x247 /* AL Context-aware desktop assistant */ #define KEY_KBD_LAYOUT_NEXT 0x248 /* AC Next Keyboard Layout Select */ #define KEY_EMOJI_PICKER 0x249 /* Show/hide emoji picker (HUTRR101) */ +#define KEY_DICTATE 0x24a /* Start or Stop Voice Dictation Session (HUTRR99) */ #define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */ #define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */ -- cgit From 327b89f0acc4c20a06ed59e4d9af7f6d804dc2e2 Mon Sep 17 00:00:00 2001 From: William Mahon Date: Thu, 3 Mar 2022 18:26:22 -0800 Subject: HID: add mapping for KEY_ALL_APPLICATIONS This patch adds a new key definition for KEY_ALL_APPLICATIONS and aliases KEY_DASHBOARD to it. It also maps the 0x0c/0x2a2 usage code to KEY_ALL_APPLICATIONS. Signed-off-by: William Mahon Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220303035618.1.I3a7746ad05d270161a18334ae06e3b6db1a1d339@changeid Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 4db5d41848e4..7989d9483ea7 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -278,7 +278,8 @@ #define KEY_PAUSECD 201 #define KEY_PROG3 202 #define KEY_PROG4 203 -#define KEY_DASHBOARD 204 /* AL Dashboard */ +#define KEY_ALL_APPLICATIONS 204 /* AC Desktop Show All Applications */ +#define KEY_DASHBOARD KEY_ALL_APPLICATIONS #define KEY_SUSPEND 205 #define KEY_CLOSE 206 /* AC Close */ #define KEY_PLAY 207 -- cgit From 838d6d3461db0fdbf33fc5f8a69c27b50b4a46da Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 14 Jan 2022 14:56:15 -0500 Subject: virtio: unexport virtio_finalize_features virtio_finalize_features is only used internally within virtio. No reason to export it. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck Acked-by: Jason Wang --- include/linux/virtio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 72292a62cd90..5464f398912a 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -133,7 +133,6 @@ bool is_virtio_device(struct device *dev); void virtio_break_device(struct virtio_device *dev); void virtio_config_changed(struct virtio_device *dev); -int virtio_finalize_features(struct virtio_device *dev); #ifdef CONFIG_PM_SLEEP int virtio_device_freeze(struct virtio_device *dev); int virtio_device_restore(struct virtio_device *dev); -- cgit From 4fa59ede95195f267101a1b8916992cf3f245cdb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 14 Jan 2022 14:58:41 -0500 Subject: virtio: acknowledge all features before access The feature negotiation was designed in a way that makes it possible for devices to know which config fields will be accessed by drivers. 
This is broken since commit 404123c2db79 ("virtio: allow drivers to validate features") with fallout in at least block and net. We have a partial work-around in commit 2f9a174f918e ("virtio: write back F_VERSION_1 before validate") which at least lets devices find out which format config space should have, but this is a partial fix: guests should not access config space without acknowledging features since otherwise we'll never be able to change the config space format. To fix, split finalize_features from virtio_finalize_features and call finalize_features with all feature bits before validation, and then - if validation changed any bits - once again after. Since virtio_finalize_features no longer writes out features, rename it to virtio_features_ok - since that is what it does: it checks that the features are ok with the device. As a side effect, this also reduces the amount of hypervisor accesses - we now only acknowledge features once unless we are clearing any features when validating (which is uncommon). IIRC this was more or less always the intent in the spec, but unfortunately the way the spec is worded does not say this explicitly; I plan to address this at the spec level, too. Acked-by: Jason Wang Cc: stable@vger.kernel.org Fixes: 404123c2db79 ("virtio: allow drivers to validate features") Fixes: 2f9a174f918e ("virtio: write back F_VERSION_1 before validate") Cc: "Halil Pasic" Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 4d107ad31149..dafdc7f48c01 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -64,8 +64,9 @@ struct virtio_shm_region { * Returns the first 64 feature bits (all we currently need). * @finalize_features: confirm what device features we'll be using. * vdev: the virtio_device - * This gives the final feature bits for the device: it can change + * This sends the driver feature bits to the device: it can change * the dev->feature bits if it wants. + * Note: despite the name this can be called any number of times. * Returns 0 on success or error status * @bus_name: return the bus name associated with the device (optional) * vdev: the virtio_device -- cgit From e0077cc13b831f8fad5557442f73bf7728683713 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Fri, 14 Jan 2022 19:27:59 -0500 Subject: vdpa: factor out vdpa_set_features_unlocked for vdpa internal use No functional change introduced. A vdpa bus driver such as virtio_vdpa or vhost_vdpa is not supposed to take care of the core's locking on its own. The locked API vdpa_set_features should suffice for the bus driver's needs. Signed-off-by: Si-Wei Liu Reviewed-by: Eli Cohen Link: https://lore.kernel.org/r/1642206481-30721-2-git-send-email-si-wei.liu@oracle.com Signed-off-by: Michael S.
Tsirkin Acked-by: Jason Wang --- include/linux/vdpa.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 2de442ececae..721089bb4c84 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -401,18 +401,24 @@ static inline int vdpa_reset(struct vdpa_device *vdev) return ret; } -static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features, bool locked) +static inline int vdpa_set_features_unlocked(struct vdpa_device *vdev, u64 features) { const struct vdpa_config_ops *ops = vdev->config; int ret; - if (!locked) - mutex_lock(&vdev->cf_mutex); - vdev->features_valid = true; ret = ops->set_driver_features(vdev, features); - if (!locked) - mutex_unlock(&vdev->cf_mutex); + + return ret; +} + +static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) +{ + int ret; + + mutex_lock(&vdev->cf_mutex); + ret = vdpa_set_features_unlocked(vdev, features); + mutex_unlock(&vdev->cf_mutex); return ret; } -- cgit From 5c26f6ac9416b63d093e29c30e79b3297e425472 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Mar 2022 20:28:51 -0800 Subject: mm: refactor vm_area_struct::anon_vma_name usage code Avoid mixing strings and their anon_vma_name referenced pointers by using struct anon_vma_name whenever possible. This simplifies the code and allows easier sharing of anon_vma_name structures when they represent the same name. [surenb@google.com: fix comment] Link: https://lkml.kernel.org/r/20220223153613.835563-1-surenb@google.com Link: https://lkml.kernel.org/r/20220224231834.1481408-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Matthew Wilcox Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Colin Cross Cc: Sumit Semwal Cc: Dave Hansen Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Johannes Weiner Cc: "Eric W. 
Biederman" Cc: Christian Brauner Cc: Alexey Gladkov Cc: Sasha Levin Cc: Chris Hyser Cc: Davidlohr Bueso Cc: Peter Collingbourne Cc: Xiaofeng Cao Cc: David Hildenbrand Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 ++-- include/linux/mm_inline.h | 87 ++++++++++++++++++++++++++++++++--------------- include/linux/mm_types.h | 5 ++- 3 files changed, 67 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..5744a3fc4716 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2626,7 +2626,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx, const char *); + struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -3372,11 +3372,12 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, const char *name); + unsigned long len_in, + struct anon_vma_name *anon_name); #else static inline int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, const char *name) { + unsigned long len_in, struct anon_vma_name *anon_name) { return 0; } #endif diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b725839dfe71..dd3accaa4e6d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -140,50 +140,81 @@ static __always_inline void del_page_from_lru_list(struct page *page, #ifdef CONFIG_ANON_VMA_NAME /* - * mmap_lock should be read-locked when calling vma_anon_name() and while using - * the returned pointer. + * mmap_lock should be read-locked when calling anon_vma_name(). Caller should + * either keep holding the lock while using the returned pointer or it should + * raise anon_vma_name refcount before releasing the lock. */ -extern const char *vma_anon_name(struct vm_area_struct *vma); +extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); +extern struct anon_vma_name *anon_vma_name_alloc(const char *name); +extern void anon_vma_name_free(struct kref *kref); -/* - * mmap_lock should be read-locked for orig_vma->vm_mm. - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be - * isolated. - */ -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma); +/* mmap_lock should be read-locked */ +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_get(&anon_name->kref); +} -/* - * mmap_lock should be write-locked or vma should have been isolated under - * write-locked mmap_lock protection. 
- */ -extern void free_vma_anon_name(struct vm_area_struct *vma); +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_put(&anon_name->kref, anon_vma_name_free); +} -/* mmap_lock should be read-locked */ -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + struct anon_vma_name *anon_name = anon_vma_name(orig_vma); + + if (anon_name) { + anon_vma_name_get(anon_name); + new_vma->anon_name = anon_name; + } +} + +static inline void free_anon_vma_name(struct vm_area_struct *vma) { - const char *vma_name = vma_anon_name(vma); + /* + * Not using anon_vma_name because it generates a warning if mmap_lock + * is not held, which might be the case here. + */ + if (!vma->vm_file) + anon_vma_name_put(vma->anon_name); +} - /* either both NULL, or pointers to same string */ - if (vma_name == name) +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + if (anon_name1 == anon_name2) return true; - return name && vma_name && !strcmp(name, vma_name); + return anon_name1 && anon_name2 && + !strcmp(anon_name1->name, anon_name2->name); } + #else /* CONFIG_ANON_VMA_NAME */ -static inline const char *vma_anon_name(struct vm_area_struct *vma) { return NULL; } -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) {} -static inline void free_vma_anon_name(struct vm_area_struct *vma) {} -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) + +static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + return NULL; +} + +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_anon_vma_name(struct vm_area_struct *vma) {} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) { return true; } + #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5140e5feb486..0f549870da6a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -416,7 +416,10 @@ struct vm_area_struct { struct rb_node rb; unsigned long rb_subtree_last; } shared; - /* Serialized by mmap_sem. */ + /* + * Serialized by mmap_sem. Never use directly because it is + * valid only when vm_file is NULL. Use anon_vma_name instead. + */ struct anon_vma_name *anon_name; }; -- cgit From 96403e11283def1d1c465c8279514c9a504d8630 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Mar 2022 20:28:55 -0800 Subject: mm: prevent vm_area_struct::anon_name refcount saturation The refcount of an anon_vma_name shared along a deep process chain with many vmas could grow really high. With default sysctl_max_map_count (64k) and default pid_max (32k) the max number of vmas in the system is 2147450880 and the refcounter has headroom of 1073774592 before it reaches REFCOUNT_SATURATED (3221225472). Therefore it's unlikely that an anonymous name refcounter will overflow with these defaults.
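A quick check of the figures quoted above, reading 64k as 65535 and 32k as 32768:

    \begin{aligned}
    65535 \times 32768 &= 2147450880 \quad \text{(max vmas)} \\
    \mathrm{REFCOUNT\_SATURATED} = \mathtt{0xC0000000} &= 3221225472 \\
    3221225472 - 2147450880 &= 1073774592 \quad \text{(headroom)}
    \end{aligned}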
Currently the max for pid_max is PID_MAX_LIMIT (4194304) and for sysctl_max_map_count it's INT_MAX (2147483647). In this configuration anon_vma_name refcount overflow becomes theoretically possible (though that still requires heavy sharing of that anon_vma_name between processes). The kref refcounting interface used in the anon_vma_name structure will detect a counter overflow when it reaches the REFCOUNT_SATURATED value, but will only generate a warning and freeze the ref counter. This would lead to the refcounted object never being freed. A determined attacker could leak memory like that, but it would be a rather expensive and inefficient way to do so. To ensure anon_vma_name refcount does not overflow, stop anon_vma_name sharing when the refcount reaches REFCOUNT_MAX (2147483647), which still leaves INT_MAX/2 (1073741823) values before the counter reaches REFCOUNT_SATURATED. This should provide enough headroom for raising the refcounts temporarily. Link: https://lkml.kernel.org/r/20220223153613.835563-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Alexey Gladkov Cc: Chris Hyser Cc: Christian Brauner Cc: Colin Cross Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: "Eric W. Biederman" Cc: Johannes Weiner Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Peter Collingbourne Cc: Sasha Levin Cc: Sumit Semwal Cc: Vlastimil Babka Cc: Xiaofeng Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 18 ++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index dd3accaa4e6d..cf90b1fa2c60 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -161,15 +161,25 @@ static inline void anon_vma_name_put(struct anon_vma_name *anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } +static inline +struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) +{ + /* Prevent anon_name refcount saturation early on */ + if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { + anon_vma_name_get(anon_name); + return anon_name; + + } + return anon_vma_name_alloc(anon_name->name); +} + static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name *anon_name = anon_vma_name(orig_vma); - if (anon_name) { - anon_vma_name_get(anon_name); - new_vma->anon_name = anon_name; - } + if (anon_name) + new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) -- cgit From 6b1775f26a2da2b05a6dc8ec2b5d14e9a4701a1a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: xen/grant-table: add gnttab_try_end_foreign_access() Add a new grant table function gnttab_try_end_foreign_access(), which will remove and free a grant if it is not in use. Its main use case is to either free a grant if it is no longer in use, or to take some other action if it is still in use. This other action can be an error exit, or (e.g. in the case of the blkfront persistent grant feature) some special handling. This is CVE-2022-23036, CVE-2022-23038 / part of XSA-396.
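A minimal calling-pattern sketch (not taken from the patch; defer_free_grant() is a hypothetical stand-in for whatever still-in-use handling a caller chooses), following the semantics documented in the header comment below, where a return of 1 means the grant entry was freed:

    /* Only reuse the page once the remote side can no longer access it. */
    if (gnttab_try_end_foreign_access(ref))
            free_page(page);                /* grant freed, page is ours again */
    else
            defer_free_grant(ref, page);    /* hypothetical: handle busy grant */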
Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - new patch V4: - add comments to header (Jan Beulich) --- include/xen/grant_table.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index cb854df031ce..358d2817741b 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -104,10 +104,22 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); * access has been ended, free the given page too. Access will be ended * immediately iff the grant entry is not in use, otherwise it will happen * some time later. page may be 0, in which case no freeing will occur. + * Note that the granted page might still be accessed (read or write) by the + * other side after gnttab_end_foreign_access() returns, so even if page was + * specified as 0 it is not allowed to just reuse the page for other + * purposes immediately. */ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page); +/* + * End access through the given grant reference, iff the grant entry is + * no longer in use. In case of success ending foreign access, the + * grant reference is deallocated. + * Return 1 if the grant entry was freed, 0 if it is still in use. + */ +int gnttab_try_end_foreign_access(grant_ref_t ref); + int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); -- cgit From 1dbd11ca75fe664d3e54607547771d021f531f59 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: xen: remove gnttab_query_foreign_access() Remove gnttab_query_foreign_access(), as it is unused and unsafe to use. All previous use cases assumed a grant would not be in use after gnttab_query_foreign_access() returned 0. This information is useless in the best case, as it only refers to a situation in the past, which could have changed already. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- include/xen/grant_table.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 358d2817741b..ab9e692a0ef4 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -125,8 +125,6 @@ int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); -int gnttab_query_foreign_access(grant_ref_t ref); - /* * operations on reserved batches of grant references */ -- cgit From 42baefac638f06314298087394b982ead9ec444b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:55 +0100 Subject: xen/gnttab: fix gnttab_end_foreign_access() without page specified gnttab_end_foreign_access() is used to free a grant reference and optionally to free the associated page. In case the grant is still in use by the other side, processing is deferred. This leads to a problem in case no page to be freed is specified by the caller: the caller doesn't know that the page is still mapped by the other side and thus should not be used for other purposes. The correct way to handle this situation is to take an additional reference to the granted page in case handling is being deferred and to drop that reference when the grant reference can finally be freed.
This requires that there are no users of gnttab_end_foreign_access() left directly repurposing the granted page after the call, as this might result in clobbered data or information leaks via the not yet freed grant reference. This is part of CVE-2022-23041 / XSA-396. Reported-by: Simon Gaiser Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V4: - expand comment in header V5: - get page ref in case of kmalloc() failure, too --- include/xen/grant_table.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index ab9e692a0ef4..c9fea9389ebe 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -107,7 +107,12 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); * Note that the granted page might still be accessed (read or write) by the * other side after gnttab_end_foreign_access() returns, so even if page was * specified as 0 it is not allowed to just reuse the page for other - * purposes immediately. + * purposes immediately. gnttab_end_foreign_access() will take an additional + * reference to the granted page in this case, which is dropped only after + * the grant is no longer in use. + * This requires that multi page allocations for areas subject to + * gnttab_end_foreign_access() are done via alloc_pages_exact() (and freeing + * via free_pages_exact()) in order to avoid high order pages. */ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page); -- cgit From ebe48d368e97d007bfeb76fcb065d6cfc4c96645 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:39 +0100 Subject: esp: Fix possible buffer overflow in ESP transformation The maximum message size that can be sent is bigger than the maximum size that skb_page_frag_refill can allocate. So it is possible to write beyond the allocated buffer. Fix this by doing a fallback to COW in that case. v2: Avoid get_order() costs as suggested by Linus Torvalds. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Reported-by: valis Signed-off-by: Steffen Klassert --- include/net/esp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/esp.h b/include/net/esp.h index 9c5637d41d95..90cd02ff77ef 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,6 +4,8 @@ #include +#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) + struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) -- cgit From 23c7f8d7989e1646aac82f75761b7648c355cb8a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:41 +0100 Subject: net: Fix esp GSO on inter address family tunnels. The esp tunnel GSO handlers use skb_mac_gso_segment to push the inner packet to the segmentation handlers. However, skb_mac_gso_segment takes the Ethernet Protocol ID from 'skb->protocol' which is wrong for inter address family tunnels. We fix this by introducing a new skb_eth_gso_segment function. This function can be used if it is necessary to pass the Ethernet Protocol ID directly to the segmentation handler. First users of this function will be the esp4 and esp6 tunnel segmentation handlers.
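As a sketch of such a user (an assumed shape, not the actual esp4/esp6 change): an IPv6-over-IPv4 ESP tunnel would segment its inner packet by passing the inner protocol explicitly, since skb->protocol still indicates ETH_P_IP there. The signature matches the declaration added in the hunk below.

    static struct sk_buff *esp4_gso_segment_inner_v6(struct sk_buff *skb,
                                                     netdev_features_t features)
    {
            /* Inner payload is IPv6 even though skb->protocol is ETH_P_IP. */
            return skb_eth_gso_segment(skb, features, htons(ETH_P_IPV6));
    }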
Fixes: c35fe4106b92 ("xfrm: Add mode handlers for IPsec on layer 2") Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8b5a314db167..f53ea7038441 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4602,6 +4602,8 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); -- cgit From aa6f8dcbab473f3a3c7454b74caa46d36cdc5d13 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Sat, 5 Mar 2022 18:07:14 +0100 Subject: swiotlb: rework "fix info leak with DMA_FROM_DEVICE" Unfortunately, we ended up merging an old version of the patch "fix info leak with DMA_FROM_DEVICE" instead of merging the latest one. Christoph (the swiotlb maintainer) asked me to create an incremental fix after I pointed out the mix-up and asked him for guidance. So here we go. The main differences between what we got and what was agreed are: * swiotlb_sync_single_for_device is also required to do an extra bounce * We decided not to introduce DMA_ATTR_OVERWRITE until we have exploiters * The implementation of DMA_ATTR_OVERWRITE is flawed: DMA_ATTR_OVERWRITE must take precedence over DMA_ATTR_SKIP_CPU_SYNC Thus this patch removes DMA_ATTR_OVERWRITE, and makes swiotlb_sync_single_for_device() bounce unconditionally (that is, also when dir == DMA_TO_DEVICE) in order to avoid synchronising back stale data from the swiotlb buffer. Let me note that if the size used with the dma_sync_* API is less than the size used with dma_[un]map_*, under certain circumstances we may still end up with swiotlb not being transparent. In that sense, this is no perfect fix either. To get this bulletproof, we would have to bounce the entire mapping/bounce buffer. For that we would have to figure out the starting address, and the size of the mapping in swiotlb_sync_single_for_device(). While this does seem possible, there seems to be no firm consensus on how things are supposed to work. Signed-off-by: Halil Pasic Fixes: ddbd89deb7d3 ("swiotlb: fix info leak with DMA_FROM_DEVICE") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- include/linux/dma-mapping.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 6150d11a607e..dca2b1355bb1 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,14 +61,6 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) -/* - * This is a hint to the DMA-mapping subsystem that the device is expected - * to overwrite the entire mapped size, thus the caller does not require any - * of the previous buffer contents to be preserved. This allows - * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. - */ -#define DMA_ATTR_OVERWRITE (1UL << 10) - /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target.
It is specific to a -- cgit From ac77998b7ac3044f0509b097da9637184598980d Mon Sep 17 00:00:00 2001 From: Mohammad Kabat Date: Thu, 25 Mar 2021 14:38:55 +0200 Subject: net/mlx5: Fix size field in bufferx_reg struct According to the HW spec, the "size" field should be 16 bits in the bufferx register. Fixes: e281682bf294 ("net/mlx5_core: HW data structs/types definitions cleanup") Signed-off-by: Mohammad Kabat Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 598ac3bcc901..5743f5b3414b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9900,8 +9900,8 @@ struct mlx5_ifc_bufferx_reg_bits { u8 reserved_at_0[0x6]; u8 lossy[0x1]; u8 epsb[0x1]; - u8 reserved_at_8[0xc]; - u8 size[0xc]; + u8 reserved_at_8[0x8]; + u8 size[0x10]; u8 xoff_threshold[0x10]; u8 xon_threshold[0x10]; -- cgit From 99a2b9be077ae3a5d97fbf5f7782e0f2e9812978 Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Wed, 2 Mar 2022 17:07:08 +0200 Subject: net/mlx5e: SHAMPO, reduce TIR indication SHAMPO is an RQ / WQ feature. An indication was added to the TIR in the first place to enforce suitability between the connected TIR and RQ, but this enforcement does not exist in the current firmware implementation and was redundant in the first place. Fixes: 83439f3c37aa ("net/mlx5e: Add HW-GRO offload") Signed-off-by: Ben Ben-Ishay Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5743f5b3414b..49a48d7709ac 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3434,7 +3434,6 @@ enum { enum { MLX5_TIRC_PACKET_MERGE_MASK_IPV4_LRO = BIT(0), MLX5_TIRC_PACKET_MERGE_MASK_IPV6_LRO = BIT(1), - MLX5_TIRC_PACKET_MERGE_MASK_SHAMPO = BIT(2), }; enum { -- cgit From 26183cfe478c1d1d5cd1e3920a4b2c5b1980849d Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Tue, 8 Mar 2022 22:25:44 -0800 Subject: net: phy: correct spelling error of media in documentation The header file incorrectly referenced "median-independant interface" instead of media. Correct this typo.
Signed-off-by: Colin Foster Fixes: 4069a572d423 ("net: phy: Document core PHY structures") Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20220309062544.3073-1-colin.foster@in-advantage.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 6de8d7a90d78..8fa70ba063a5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -87,8 +87,8 @@ extern const int phy_10gbit_features_array[1]; * * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined - * @PHY_INTERFACE_MODE_MII: Median-independent interface - * @PHY_INTERFACE_MODE_GMII: Gigabit median-independent interface + * @PHY_INTERFACE_MODE_MII: Media-independent interface + * @PHY_INTERFACE_MODE_GMII: Gigabit media-independent interface * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface -- cgit From c993ee0f9f81caf5767a50d1faeba39a0dc82af2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:23:31 +0000 Subject: watch_queue: Fix filter limit check In watch_queue_set_filter(), there are a couple of places where we check that the filter type value does not exceed what the type_filter bitmap can hold. One place calculates the number of bits by: if (tf[i].type >= sizeof(wfilter->type_filter) * 8) which is fine, but the second does: if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) which is not. This can lead to a couple of out-of-bounds writes due to a too-large type: (1) __set_bit() on wfilter->type_filter (2) Writing more elements in wfilter->filters[] than we allocated. Fix this by just using the proper WATCH_TYPE__NR instead, which is the number of types we actually know about. The bug may cause an oops looking something like: BUG: KASAN: slab-out-of-bounds in watch_queue_set_filter+0x659/0x740 Write of size 4 at addr ffff88800d2c66bc by task watch_queue_oob/611 ... Call Trace: dump_stack_lvl+0x45/0x59 print_address_description.constprop.0+0x1f/0x150 ... kasan_report.cold+0x7f/0x11b ... watch_queue_set_filter+0x659/0x740 ... 
__x64_sys_ioctl+0x127/0x190 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Allocated by task 611: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 watch_queue_set_filter+0x23a/0x740 __x64_sys_ioctl+0x127/0x190 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88800d2c66a0 which belongs to the cache kmalloc-32 of size 32 The buggy address is located 28 bytes inside of 32-byte region [ffff88800d2c66a0, ffff88800d2c66c0) Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- include/linux/watch_queue.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h index c994d1b2cdba..3b9a40ae8bdb 100644 --- a/include/linux/watch_queue.h +++ b/include/linux/watch_queue.h @@ -28,7 +28,8 @@ struct watch_type_filter { struct watch_filter { union { struct rcu_head rcu; - unsigned long type_filter[2]; /* Bitmask of accepted types */ + /* Bitmask of accepted types */ + DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ struct watch_type_filter filters[]; -- cgit From 413a4a6b0b5553f2423d210f65e98c211b99c3f8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 16:02:18 +0000 Subject: cachefiles: Fix volume coherency attribute A network filesystem may set coherency data on a volume cookie, and if given, cachefiles will store this in an xattr on the directory in the cache corresponding to the volume. The function that sets the xattr just stores the contents of the volume coherency buffer directly into the xattr, with nothing added; the checking function, on the other hand, has a cut'n'paste error whereby it tries to interpret the xattr contents as if it were the xattr on an ordinary file (using the cachefiles_xattr struct). This results in a failure to match the coherency data because the buffer ends up being shifted by 18 bytes. Fix this by defining a structure specifically for the volume xattr and making both the setting and checking functions use it. Since the volume coherency checking doesn't work as it stands, take the opportunity to insert a reserved field for future use, set it to 0 and check that it is 0. Log mismatch through the appropriate tracepoint. Note that this only affects cifs; 9p, afs, ceph and nfs don't use the volume coherency data at the moment.
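The volume-specific structure could look like the following sketch (the actual definition would live under fs/cachefiles/, outside this include/ diff):

    /* Sketch: a reserved field, checked to be 0, followed by the netfs
     * volume coherency data stored verbatim. */
    struct cachefiles_vol_xattr {
            __be32  reserved;       /* Reserved, should be 0 */
            __u8    data[];         /* netfs volume coherency data */
    } __packed;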
Fixes: 32e150037dce ("fscache, cachefiles: Store the volume coherency data") Reported-by: Rohith Surabattula Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Steve French cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Signed-off-by: Linus Torvalds --- include/trace/events/cachefiles.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index c6f5aa74db89..2c530637e10a 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -56,6 +56,7 @@ enum cachefiles_coherency_trace { cachefiles_coherency_set_ok, cachefiles_coherency_vol_check_cmp, cachefiles_coherency_vol_check_ok, + cachefiles_coherency_vol_check_resv, cachefiles_coherency_vol_check_xattr, cachefiles_coherency_vol_set_fail, cachefiles_coherency_vol_set_ok, @@ -139,6 +140,7 @@ enum cachefiles_error_trace { EM(cachefiles_coherency_set_ok, "SET ok ") \ EM(cachefiles_coherency_vol_check_cmp, "VOL BAD cmp ") \ EM(cachefiles_coherency_vol_check_ok, "VOL OK ") \ + EM(cachefiles_coherency_vol_check_resv, "VOL BAD resv") \ EM(cachefiles_coherency_vol_check_xattr,"VOL BAD xatt") \ EM(cachefiles_coherency_vol_set_fail, "VOL SET fail") \ E_(cachefiles_coherency_vol_set_ok, "VOL SET ok ") -- cgit