From 65e68edce0db433aa0c2b26d7dc14fbbbeb89fbb Mon Sep 17 00:00:00 2001 From: Marta Rybczynska Date: Tue, 24 Sep 2019 15:14:52 +0200 Subject: nvme: allow 64-bit results in passthru commands It is not possible to get 64-bit results from the passthru commands, what prevents from getting for the Capabilities (CAP) property value. As a result, it is not possible to implement IOL's NVMe Conformance test 4.3 Case 1 for Fabrics targets [1] (page 123). This issue has been already discussed [2], but without a solution. This patch solves the problem by adding new ioctls with a new passthru structure, including 64-bit results. The older ioctls stay unchanged. [1] https://www.iol.unh.edu/sites/default/files/testsuites/nvme/UNH-IOL_NVMe_Conformance_Test_Suite_v11.0.pdf [2] http://lists.infradead.org/pipermail/linux-nvme/2018-June/018791.html Signed-off-by: Marta Rybczynska Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- include/uapi/linux/nvme_ioctl.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index 1c215ea1798e..e168dc59e9a0 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -45,6 +45,27 @@ struct nvme_passthru_cmd { __u32 result; }; +struct nvme_passthru_cmd64 { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u64 result; +}; + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) @@ -54,5 +75,7 @@ struct nvme_passthru_cmd { #define NVME_IOCTL_RESET _IO('N', 0x44) #define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) #define NVME_IOCTL_RESCAN _IO('N', 0x46) +#define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) +#define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ -- cgit From 833b45de69a6016c4b0cebe6765d526a31a81580 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 30 Sep 2019 18:48:44 +0200 Subject: kvm: x86, powerpc: do not allow clearing largepages debugfs entry The largepages debugfs entry is incremented/decremented as shadow pages are created or destroyed. Clearing it will result in an underflow, which is harmless to KVM but ugly (and could be misinterpreted by tools that use debugfs information), so make this particular statistic read-only. Cc: kvm-ppc@vger.kernel.org Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fcb46b3374c6..719fc3e15ea4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1090,6 +1090,7 @@ enum kvm_stat_kind { struct kvm_stat_data { int offset; + int mode; struct kvm *kvm; }; @@ -1097,6 +1098,7 @@ struct kvm_stats_debugfs_item { const char *name; int offset; enum kvm_stat_kind kind; + int mode; }; extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; -- cgit From 61129dd29f7962f278b618a2a3e8fdb986a66dc8 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 17 Sep 2019 09:18:53 +0200 Subject: sched: Add __ASSEMBLY__ guards around struct clone_args The addition of struct clone_args to uapi/linux/sched.h is not protected by __ASSEMBLY__ guards, causing a failure to build from source for glibc on RISC-V. Add the guards to fix this. Fixes: 7f192e3cd316 ("fork: add clone3") Signed-off-by: Seth Forshee Cc: Acked-by: Ingo Molnar Link: https://lore.kernel.org/r/20190917071853.12385-1-seth.forshee@canonical.com Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index b3105ac1381a..851ff1feadd5 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -33,6 +33,7 @@ #define CLONE_NEWNET 0x40000000 /* New network namespace */ #define CLONE_IO 0x80000000 /* Clone io context */ +#ifndef __ASSEMBLY__ /* * Arguments for the clone3 syscall */ @@ -46,6 +47,7 @@ struct clone_args { __aligned_u64 stack_size; __aligned_u64 tls; }; +#endif /* * Scheduling policies -- cgit From f5a1a536fa14895ccff4e94e6a5af90901ce86aa Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:52 +1000 Subject: lib: introduce copy_struct_from_user() helper A common pattern for syscall extensions is increasing the size of a struct passed from userspace, such that the zero-value of the new fields result in the old kernel behaviour (allowing for a mix of userspace and kernel vintages to operate on one another in most cases). While this interface exists for communication in both directions, only one interface is straightforward to have reasonable semantics for (userspace passing a struct to the kernel). For kernel returns to userspace, what the correct semantics are (whether there should be an error if userspace is unaware of a new extension) is very syscall-dependent and thus probably cannot be unified between syscalls (a good example of this problem is [1]). Previously there was no common lib/ function that implemented the necessary extension-checking semantics (and different syscalls implemented them slightly differently or incompletely[2]). Future patches replace common uses of this pattern to make use of copy_struct_from_user(). Some in-kernel selftests that insure that the handling of alignment and various byte patterns are all handled identically to memchr_inv() usage. [1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code") [2]: For instance {sched_setattr,perf_event_open,clone3}(2) all do do similar checks to copy_struct_from_user() while rt_sigprocmask(2) always rejects differently-sized struct arguments. Suggested-by: Rasmus Villemoes Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191001011055.19283-2-cyphar@cyphar.com Signed-off-by: Christian Brauner --- include/linux/bitops.h | 7 +++++ include/linux/uaccess.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) (limited to 'include') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index cf074bce3eb3..c94a9ff9f082 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -4,6 +4,13 @@ #include #include +/* Set bits in the first 'n' bytes when loaded from memory */ +#ifdef __LITTLE_ENDIAN +# define aligned_byte_mask(n) ((1UL << 8*(n))-1) +#else +# define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n))) +#endif + #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 70bbdc38dc37..e47d0522a1f4 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -231,6 +231,76 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from, #endif /* ARCH_HAS_NOCACHE_UACCESS */ +extern __must_check int check_zeroed_user(const void __user *from, size_t size); + +/** + * copy_struct_from_user: copy a struct from userspace + * @dst: Destination address, in kernel space. This buffer must be @ksize + * bytes long. + * @ksize: Size of @dst struct. + * @src: Source address, in userspace. + * @usize: (Alleged) size of @src struct. + * + * Copies a struct from userspace to kernel space, in a way that guarantees + * backwards-compatibility for struct syscall arguments (as long as future + * struct extensions are made such that all new fields are *appended* to the + * old struct, and zeroed-out new fields have the same meaning as the old + * struct). + * + * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. + * The recommended usage is something like the following: + * + * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) + * { + * int err; + * struct foo karg = {}; + * + * if (usize > PAGE_SIZE) + * return -E2BIG; + * if (usize < FOO_SIZE_VER0) + * return -EINVAL; + * + * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); + * if (err) + * return err; + * + * // ... + * } + * + * There are three cases to consider: + * * If @usize == @ksize, then it's copied verbatim. + * * If @usize < @ksize, then the userspace has passed an old struct to a + * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize) + * are to be zero-filled. + * * If @usize > @ksize, then the userspace has passed a new struct to an + * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) + * are checked to ensure they are zeroed, otherwise -E2BIG is returned. + * + * Returns (in all cases, some data may have been copied): + * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. + * * -EFAULT: access to userspace failed. + */ +static __always_inline __must_check int +copy_struct_from_user(void *dst, size_t ksize, const void __user *src, + size_t usize) +{ + size_t size = min(ksize, usize); + size_t rest = max(ksize, usize) - size; + + /* Deal with trailing bytes. */ + if (usize < ksize) { + memset(dst + size, 0, rest); + } else if (usize > ksize) { + int ret = check_zeroed_user(src + size, rest); + if (ret <= 0) + return ret ?: -E2BIG; + } + /* Copy the interoperable parts of the struct. */ + if (copy_from_user(dst, src, size)) + return -EFAULT; + return 0; +} + /* * probe_kernel_read(): safely attempt to read from a location * @dst: pointer to the buffer that shall take the data -- cgit From f14c234b4bc5184fd40d9a47830e5b32c3b36d49 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:53 +1000 Subject: clone3: switch to copy_struct_from_user() Switch clone3() syscall from it's own copying struct clone_args from userspace to the new dedicated copy_struct_from_user() helper. The change is very straightforward, and helps unify the syscall interface for struct-from-userspace syscalls. Additionally, explicitly define CLONE_ARGS_SIZE_VER0 to match the other users of the struct-extension pattern. Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner [christian.brauner@ubuntu.com: improve commit message] Link: https://lore.kernel.org/r/20191001011055.19283-3-cyphar@cyphar.com Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index b3105ac1381a..0945805982b4 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -47,6 +47,8 @@ struct clone_args { __aligned_u64 tls; }; +#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ + /* * Scheduling policies */ -- cgit From 895b5c9f206eb7d25dc1360a8ccfc5958895eb89 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 29 Sep 2019 20:54:03 +0200 Subject: netfilter: drop bridge nf reset from nf_reset commit 174e23810cd31 ("sk_buff: drop all skb extensions on free and skb scrubbing") made napi recycle always drop skb extensions. The additional skb_ext_del() that is performed via nf_reset on napi skb recycle is not needed anymore. Most nf_reset() calls in the stack are there so queued skb won't block 'rmmod nf_conntrack' indefinitely. This removes the skb_ext_del from nf_reset, and renames it to a more fitting nf_reset_ct(). In a few selected places, add a call to skb_ext_reset to make sure that no active extensions remain. I am submitting this for "net", because we're still early in the release cycle. The patch applies to net-next too, but I think the rename causes needless divergence between those trees. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e7d3b1a513ef..4351577b14d7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4160,15 +4160,12 @@ static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} #endif /* CONFIG_SKB_EXTENSIONS */ -static inline void nf_reset(struct sk_buff *skb) +static inline void nf_reset_ct(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(skb)); skb->_nfct = 0; #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - skb_ext_del(skb, SKB_EXT_BRIDGE_NF); -#endif } static inline void nf_reset_trace(struct sk_buff *skb) -- cgit From 30945d31e5761436d9eba6b8cff468a5f7c9c266 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Tue, 24 Sep 2019 14:49:43 +0200 Subject: hwmon: Fix HWMON_P_MIN_ALARM mask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both HWMON_P_MIN_ALARM and HWMON_P_MAX_ALARM were using BIT(hwmon_power_max_alarm). Fixes: aa7f29b07c870 ("hwmon: Add support for power min, lcrit, min_alarm and lcrit_alarm") CC: Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20190924124945.491326-2-nuno.sa@analog.com Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 04c36b7a61dd..72579168189d 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -235,7 +235,7 @@ enum hwmon_power_attributes { #define HWMON_P_LABEL BIT(hwmon_power_label) #define HWMON_P_ALARM BIT(hwmon_power_alarm) #define HWMON_P_CAP_ALARM BIT(hwmon_power_cap_alarm) -#define HWMON_P_MIN_ALARM BIT(hwmon_power_max_alarm) +#define HWMON_P_MIN_ALARM BIT(hwmon_power_min_alarm) #define HWMON_P_MAX_ALARM BIT(hwmon_power_max_alarm) #define HWMON_P_LCRIT_ALARM BIT(hwmon_power_lcrit_alarm) #define HWMON_P_CRIT_ALARM BIT(hwmon_power_crit_alarm) -- cgit From 09515706857a7d5a2ffb5ce6a44c0bc7859a745b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 1 Oct 2019 10:25:34 +0200 Subject: xen/efi: have a common runtime setup function Today the EFI runtime functions are setup in architecture specific code (x86 and arm), with the functions themselves living in drivers/xen as they are not architecture dependent. As the setup is exactly the same for arm and x86 move the setup to drivers/xen, too. This at once removes the need to make the single functions global visible. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich [boris: "Dropped EXPORT_SYMBOL_GPL(xen_efi_runtime_setup)"] Signed-off-by: Boris Ostrovsky --- include/xen/xen-ops.h | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) (limited to 'include') diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 98b30c1613b2..d89969aa9942 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -212,30 +212,7 @@ int xen_xlate_map_ballooned_pages(xen_pfn_t **pfns, void **vaddr, bool xen_running_on_version_or_later(unsigned int major, unsigned int minor); -efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc); -efi_status_t xen_efi_set_time(efi_time_t *tm); -efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, - efi_time_t *tm); -efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm); -efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 *attr, unsigned long *data_size, - void *data); -efi_status_t xen_efi_get_next_variable(unsigned long *name_size, - efi_char16_t *name, efi_guid_t *vendor); -efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 attr, unsigned long data_size, - void *data); -efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, - u64 *remaining_space, - u64 *max_variable_size); -efi_status_t xen_efi_get_next_high_mono_count(u32 *count); -efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, - unsigned long count, unsigned long sg_list); -efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, - unsigned long count, u64 *max_size, - int *reset_type); -void xen_efi_reset_system(int reset_type, efi_status_t status, - unsigned long data_size, efi_char16_t *data); +void xen_efi_runtime_setup(void); #ifdef CONFIG_PREEMPT -- cgit From 3e8db7e56082156a37b71d7334860c10fcea8025 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 1 Oct 2019 21:58:19 +0300 Subject: net: dsa: sja1105: Fix sleeping while atomic in .port_hwtstamp_set Currently this stack trace can be seen with CONFIG_DEBUG_ATOMIC_SLEEP=y: [ 41.568348] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 41.576757] in_atomic(): 1, irqs_disabled(): 0, pid: 208, name: ptp4l [ 41.583212] INFO: lockdep is turned off. [ 41.587123] CPU: 1 PID: 208 Comm: ptp4l Not tainted 5.3.0-rc6-01445-ge950f2d4bc7f-dirty #1827 [ 41.599873] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 41.607584] [] (show_stack) from [] (dump_stack+0xd4/0x100) [ 41.614863] [] (dump_stack) from [] (___might_sleep+0x1c8/0x2b4) [ 41.622574] [] (___might_sleep) from [] (__mutex_lock+0x48/0xab8) [ 41.630368] [] (__mutex_lock) from [] (mutex_lock_nested+0x1c/0x24) [ 41.638340] [] (mutex_lock_nested) from [] (sja1105_static_config_reload+0x30/0x27c) [ 41.647779] [] (sja1105_static_config_reload) from [] (sja1105_hwtstamp_set+0x108/0x1cc) [ 41.657562] [] (sja1105_hwtstamp_set) from [] (dev_ifsioc+0x18c/0x330) [ 41.665788] [] (dev_ifsioc) from [] (dev_ioctl+0x320/0x6e8) [ 41.673064] [] (dev_ioctl) from [] (sock_ioctl+0x334/0x5e8) [ 41.680340] [] (sock_ioctl) from [] (do_vfs_ioctl+0xb0/0xa10) [ 41.687789] [] (do_vfs_ioctl) from [] (ksys_ioctl+0x34/0x58) [ 41.695151] [] (ksys_ioctl) from [] (ret_fast_syscall+0x0/0x28) [ 41.702768] Exception stack(0xe8495fa8 to 0xe8495ff0) [ 41.707796] 5fa0: beff4a8c 00000001 00000011 000089b0 beff4a8c beff4a80 [ 41.715933] 5fc0: beff4a8c 00000001 0000000c 00000036 b6fa98c8 004e19c1 00000001 00000000 [ 41.724069] 5fe0: 004dcedc beff4a6c 004c0738 b6e7af4c [ 41.729860] BUG: scheduling while atomic: ptp4l/208/0x00000002 [ 41.735682] INFO: lockdep is turned off. Enabling RX timestamping will logically disturb the fastpath (processing of meta frames). Replace bool hwts_rx_en with a bit that is checked atomically from the fastpath and temporarily unset from the sleepable context during a change of the RX timestamping process (a destructive operation anyways, requires switch reset). If found unset, the fastpath (net/dsa/tag_sja1105.c) will just drop any received meta frame and not take the meta_lock at all. Fixes: a602afd200f5 ("net: dsa: sja1105: Expose PTP timestamping ioctls to userspace") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 79435cfc20eb..897e799dbcb9 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -31,6 +31,8 @@ #define SJA1105_META_SMAC 0x222222222222ull #define SJA1105_META_DMAC 0x0180C200000Eull +#define SJA1105_HWTS_RX_EN 0 + /* Global tagger data: each struct sja1105_port has a reference to * the structure defined in struct sja1105_private. */ @@ -42,7 +44,7 @@ struct sja1105_tagger_data { * from taggers running on multiple ports on SMP systems */ spinlock_t meta_lock; - bool hwts_rx_en; + unsigned long state; }; struct sja1105_skb_cb { -- cgit From 815fb4c9d7da862a47c9d2a9765a4759826c5783 Mon Sep 17 00:00:00 2001 From: Marek Olšák Date: Tue, 24 Sep 2019 17:53:25 -0400 Subject: drm/amdgpu: return tcc_disabled_mask to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UMDs need this for correct programming of harvested chips. Signed-off-by: Marek Olšák Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- include/uapi/drm/amdgpu_drm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index c99b4f2482c6..4fe35d600ab8 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -1003,6 +1003,8 @@ struct drm_amdgpu_info_device { __u64 high_va_max; /* gfx10 pa_sc_tile_steering_override */ __u32 pa_sc_tile_steering_override; + /* disabled TCCs */ + __u64 tcc_disabled_mask; }; struct drm_amdgpu_info_hw_ip { -- cgit From 3a4b46c3bc73240b31cd324b3551fc4231d9f1d5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 20 Jul 2019 21:05:25 +0900 Subject: block: pg: add header include guard Add a header include guard just in case. Signed-off-by: Masahiro Yamada Signed-off-by: Jens Axboe --- include/uapi/linux/pg.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/pg.h b/include/uapi/linux/pg.h index 364c350e85cd..62b6f69bd9fb 100644 --- a/include/uapi/linux/pg.h +++ b/include/uapi/linux/pg.h @@ -35,6 +35,9 @@ */ +#ifndef _UAPI_LINUX_PG_H +#define _UAPI_LINUX_PG_H + #define PG_MAGIC 'P' #define PG_RESET 'Z' #define PG_COMMAND 'C' @@ -61,4 +64,4 @@ struct pg_read_hdr { }; -/* end of pg.h */ +#endif /* _UAPI_LINUX_PG_H */ -- cgit From 78f6face5af344f12f4bd48b32faa6f499a06f36 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 27 Sep 2019 17:29:05 +0200 Subject: sched: add kernel-doc for struct clone_args Add kernel-doc for struct clone_args for the clone3() syscall. Link: https://lore.kernel.org/r/20191001114701.24661-3-christian.brauner@ubuntu.com Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 851ff1feadd5..3d8f03bfd428 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -34,8 +34,30 @@ #define CLONE_IO 0x80000000 /* Clone io context */ #ifndef __ASSEMBLY__ -/* - * Arguments for the clone3 syscall +/** + * struct clone_args - arguments for the clone3 syscall + * @flags: Flags for the new process as listed above. + * All flags are valid except for CSIGNAL and + * CLONE_DETACHED. + * @pidfd: If CLONE_PIDFD is set, a pidfd will be + * returned in this argument. + * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the + * child process will be returned in the child's + * memory. + * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of + * the child process will be returned in the + * parent's memory. + * @exit_signal: The exit_signal the parent process will be + * sent when the child exits. + * @stack: Specify the location of the stack for the + * child process. + * @stack_size: The size of the stack for the child process. + * @tls: If CLONE_SETTLS is set, the tls descriptor + * is set to tls. + * + * The structure is versioned by size and thus extensible. + * New struct members must go at the end of the struct and + * must be properly 64bit aligned. */ struct clone_args { __aligned_u64 flags; -- cgit From 47934ef7f1883209120781b59d78eaf8b83e2fb7 Mon Sep 17 00:00:00 2001 From: Stefan-gabriel Mirea Date: Fri, 4 Oct 2019 13:51:13 +0000 Subject: tty: serial: Fix PORT_LINFLEXUART definition The port type macros should have different values for different devices. Currently, PORT_LINFLEXUART conflicts with PORT_SUNIX. Fixes: 09864c1cdf5c ("tty: serial: Add linflexuart driver for S32V234") Signed-off-by: Stefan-Gabriel Mirea Link: https://lore.kernel.org/r/20191004135058.18007-1-stefan-gabriel.mirea@nxp.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 0f4f87a6fd54..e7fe550b6038 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -291,6 +291,6 @@ #define PORT_SUNIX 121 /* Freescale Linflex UART */ -#define PORT_LINFLEXUART 121 +#define PORT_LINFLEXUART 122 #endif /* _UAPILINUX_SERIAL_CORE_H */ -- cgit From db9b2e0af605e7c994784527abfd9276cabd718a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Oct 2019 17:44:44 +0100 Subject: rxrpc: Fix rxrpc_recvmsg tracepoint Fix the rxrpc_recvmsg tracepoint to handle being called with a NULL call parameter. Fixes: a25e21f0bcd2 ("rxrpc, afs: Use debug_ids rather than pointers in traces") Signed-off-by: David Howells Signed-off-by: David S. Miller --- include/trace/events/rxrpc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index a13a62db3565..edc5c887a44c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1068,7 +1068,7 @@ TRACE_EVENT(rxrpc_recvmsg, ), TP_fast_assign( - __entry->call = call->debug_id; + __entry->call = call ? call->debug_id : 0; __entry->why = why; __entry->seq = seq; __entry->offset = offset; -- cgit From 4cf6c57e61fee954f7b7685de31b80ec26843d27 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:05:58 +0100 Subject: net: phy: fix write to mii-ctrl1000 register When userspace writes to the MII_ADVERTISE register, we update phylib's advertising mask and trigger a renegotiation. However, writing to the MII_CTRL1000 register, which contains the gigabit advertisement, does neither. This can lead to phylib's copy of the advertisement becoming de-synced with the values in the PHY register set, which can result in incorrect negotiation resolution. Fixes: 5502b218e001 ("net: phy: use phy_resolve_aneg_linkmode in genphy_read_status") Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: David S. Miller --- include/linux/mii.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/mii.h b/include/linux/mii.h index 5cd824c1c0ca..4ce8901a1af6 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -455,6 +455,15 @@ static inline void mii_lpa_mod_linkmode_lpa_t(unsigned long *lp_advertising, lp_advertising, lpa & LPA_LPACK); } +static inline void mii_ctrl1000_mod_linkmode_adv_t(unsigned long *advertising, + u32 ctrl1000) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, + ctrl1000 & ADVERTISE_1000HALF); + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, + ctrl1000 & ADVERTISE_1000FULL); +} + /** * linkmode_adv_to_lcl_adv_t * @advertising:pointer to linkmode advertising -- cgit From 8d3dc3ac9dd6801c732a72ca6979698c38451b4f Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:06:04 +0100 Subject: net: phy: extract link partner advertisement reading Move reading the link partner advertisement out of genphy_read_status() into its own separate function. This will allow re-use of this code by PHY drivers that are able to read the resolved status from the PHY. Tested-by: tinywrkb Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index a7ecbe0e55aa..7abee820d05c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1076,6 +1076,7 @@ int genphy_config_eee_advert(struct phy_device *phydev); int __genphy_config_aneg(struct phy_device *phydev, bool changed); int genphy_aneg_done(struct phy_device *phydev); int genphy_update_link(struct phy_device *phydev); +int genphy_read_lpa(struct phy_device *phydev); int genphy_read_status(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); -- cgit From 2d880b8709c013d47472f85a9d42ea1aca3bce47 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:06:09 +0100 Subject: net: phy: extract pause mode Extract the update of phylib's software pause mode state from genphy_read_status(), so that we can re-use this functionality with PHYs that have alternative ways to read the negotiation results. Tested-by: tinywrkb Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7abee820d05c..9a0e981df502 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -678,6 +678,7 @@ static inline bool phy_is_started(struct phy_device *phydev) return phydev->state >= PHY_UP; } +void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); /** -- cgit From f1da567f1dc1b55d178b8f2d0cfe8353858aac19 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 5 Oct 2019 23:04:47 +0200 Subject: driver core: platform: Add platform_get_irq_byname_optional() Some drivers (e.g dwc3) first try to get an IRQ byname and then fall back to the one at index 0. In this case we do not want the error(s) printed by platform_get_irq_byname(). This commit adds a new platform_get_irq_byname_optional(), which does not print errors, for this. While at it also improve the kdoc text for platform_get_irq_byname() a bit. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=205037 Signed-off-by: Hans de Goede Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20191005210449.3926-2-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_device.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 1b5cec067533..f2688404d1cd 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -64,6 +64,8 @@ extern struct resource *platform_get_resource_byname(struct platform_device *, unsigned int, const char *); extern int platform_get_irq_byname(struct platform_device *, const char *); +extern int platform_get_irq_byname_optional(struct platform_device *dev, + const char *name); extern int platform_add_devices(struct platform_device **, int); struct platform_device_info { -- cgit From 047d50aee341d940350897c85799e56ae57c3849 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Wed, 2 Oct 2019 18:59:00 +0200 Subject: efi/tpm: Don't access event->count when it isn't mapped Some machines generate a lot of event log entries. When we're iterating over them, the code removes the old mapping and adds a new one, so once we cross the page boundary we're unmapping the page with the count on it. Hilarity ensues. This patch keeps the info from the header in local variables so we don't need to access that page again or keep track of if it's mapped. Tested-by: Lyude Paul Signed-off-by: Peter Jones Signed-off-by: Jarkko Sakkinen Signed-off-by: Ard Biesheuvel Reviewed-by: Jarkko Sakkinen Acked-by: Matthew Garrett Acked-by: Ard Biesheuvel Cc: Ben Dooks Cc: Dave Young Cc: Jerry Snitselaar Cc: Linus Torvalds Cc: Lukas Wunner Cc: Octavian Purdila Cc: Peter Zijlstra Cc: Scott Talbert Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux-integrity@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 44038bc514a2 ("tpm: Abstract crypto agile event size calculations") Link: https://lkml.kernel.org/r/20191002165904.8819-4-ard.biesheuvel@linaro.org [ Minor edits. ] Signed-off-by: Ingo Molnar --- include/linux/tpm_eventlog.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index 63238c84dc0b..b50cc3adca18 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -170,6 +170,7 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, u16 halg; int i; int j; + u32 count, event_type; marker = event; marker_start = marker; @@ -190,16 +191,22 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, } event = (struct tcg_pcr_event2_head *)mapping; + /* + * The loop below will unmap these fields if the log is larger than + * one page, so save them here for reference: + */ + count = READ_ONCE(event->count); + event_type = READ_ONCE(event->event_type); efispecid = (struct tcg_efi_specid_event_head *)event_header->event; /* Check if event is malformed. */ - if (event->count > efispecid->num_algs) { + if (count > efispecid->num_algs) { size = 0; goto out; } - for (i = 0; i < event->count; i++) { + for (i = 0; i < count; i++) { halg_size = sizeof(event->digests[i].alg_id); /* Map the digest's algorithm identifier */ @@ -256,8 +263,9 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, + event_field->event_size; size = marker - marker_start; - if ((event->event_type == 0) && (event_field->event_size == 0)) + if (event_type == 0 && event_field->event_size == 0) size = 0; + out: if (do_mapping) TPM_MEMUNMAP(mapping, mapping_size); -- cgit From e658c82be5561412c5e83b5e74e9da4830593f3e Mon Sep 17 00:00:00 2001 From: Jerry Snitselaar Date: Wed, 2 Oct 2019 18:59:02 +0200 Subject: efi/tpm: Only set 'efi_tpm_final_log_size' after successful event log parsing If __calc_tpm2_event_size() fails to parse an event it will return 0, resulting tpm2_calc_event_log_size() returning -1. Currently there is no check of this return value, and 'efi_tpm_final_log_size' can end up being set to this negative value resulting in a crash like this one: BUG: unable to handle page fault for address: ffffbc8fc00866ad #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page RIP: 0010:memcpy_erms+0x6/0x10 Call Trace: tpm_read_log_efi() tpm_bios_log_setup() tpm_chip_register() tpm_tis_core_init.cold.9+0x28c/0x466 tpm_tis_plat_probe() platform_drv_probe() ... Also __calc_tpm2_event_size() returns a size of 0 when it fails to parse an event, so update function documentation to reflect this. The root cause of the issue that caused the failure of event parsing in this case is resolved by Peter Jone's patchset dealing with large event logs where crossing over a page boundary causes the page with the event count to be unmapped. Signed-off-by: Jerry Snitselaar Signed-off-by: Ard Biesheuvel Cc: Ben Dooks Cc: Dave Young Cc: Jarkko Sakkinen Cc: Linus Torvalds Cc: Lukas Wunner Cc: Lyude Paul Cc: Matthew Garrett Cc: Octavian Purdila Cc: Peter Jones Cc: Peter Zijlstra Cc: Scott Talbert Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux-integrity@vger.kernel.org Cc: stable@vger.kernel.org Fixes: c46f3405692de ("tpm: Reserve the TPM final events table") Link: https://lkml.kernel.org/r/20191002165904.8819-6-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- include/linux/tpm_eventlog.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index b50cc3adca18..131ea1bad458 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -152,7 +152,7 @@ struct tcg_algorithm_info { * total. Once we've done this we know the offset of the data length field, * and can calculate the total size of the event. * - * Return: size of the event on success, <0 on failure + * Return: size of the event on success, 0 on failure */ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, -- cgit From bf70b0503abd19194dba25fe383d143d0229dc6a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Oct 2019 16:58:21 +0900 Subject: module: swap the order of symbol.namespace Currently, EXPORT_SYMBOL_NS(_GPL) constructs the kernel symbol as follows: __ksymtab_SYMBOL.NAMESPACE The sym_extract_namespace() in modpost allocates memory for the part SYMBOL.NAMESPACE when '.' is contained. One problem is that the pointer returned by strdup() is lost because the symbol name will be copied to malloc'ed memory by alloc_symbol(). No one will keep track of the pointer of strdup'ed memory. sym->namespace still points to the NAMESPACE part. So, you can free it with complicated code like this: free(sym->namespace - strlen(sym->name) - 1); It complicates memory free. To fix it elegantly, I swapped the order of the symbol and the namespace as follows: __ksymtab_NAMESPACE.SYMBOL then, simplified sym_extract_namespace() so that it allocates memory only for the NAMESPACE part. I prefer this order because it is intuitive and also matches to major languages. For example, NAMESPACE::NAME in C++, MODULE.NAME in Python. Reviewed-by: Matthias Maennich Signed-off-by: Masahiro Yamada Signed-off-by: Jessica Yu --- include/linux/export.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/export.h b/include/linux/export.h index 95f55b7f83a0..0695d4e847d9 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -52,7 +52,7 @@ extern struct module __this_module; __ADDRESSABLE(sym) \ asm(" .section \"___ksymtab" sec "+" #sym "\", \"a\" \n" \ " .balign 4 \n" \ - "__ksymtab_" #sym NS_SEPARATOR #ns ": \n" \ + "__ksymtab_" #ns NS_SEPARATOR #sym ": \n" \ " .long " #sym "- . \n" \ " .long __kstrtab_" #sym "- . \n" \ " .long __kstrtab_ns_" #sym "- . \n" \ @@ -76,7 +76,7 @@ struct kernel_symbol { #else #define __KSYMTAB_ENTRY_NS(sym, sec, ns) \ static const struct kernel_symbol __ksymtab_##sym##__##ns \ - asm("__ksymtab_" #sym NS_SEPARATOR #ns) \ + asm("__ksymtab_" #ns NS_SEPARATOR #sym) \ __attribute__((section("___ksymtab" sec "+" #sym), used)) \ __aligned(sizeof(void *)) \ = { (unsigned long)&sym, __kstrtab_##sym, __kstrtab_ns_##sym } -- cgit From fa6643cdc5cd726b10d30eec45ff8dca267de735 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Oct 2019 16:58:23 +0900 Subject: module: rename __kstrtab_ns_* to __kstrtabns_* to avoid symbol conflict The module namespace produces __strtab_ns_ symbols to store namespace strings, but it does not guarantee the name uniqueness. This is a potential problem because we have exported symbols starting with "ns_". For example, kernel/capability.c exports the following symbols: EXPORT_SYMBOL(ns_capable); EXPORT_SYMBOL(capable); Assume a situation where those are converted as follows: EXPORT_SYMBOL_NS(ns_capable, some_namespace); EXPORT_SYMBOL_NS(capable, some_namespace); The former expands to "__kstrtab_ns_capable" and "__kstrtab_ns_ns_capable", and the latter to "__kstrtab_capable" and "__kstrtab_ns_capable". Then, we have the duplicated "__kstrtab_ns_capable". To ensure the uniqueness, rename "__kstrtab_ns_*" to "__kstrtabns_*". Reviewed-by: Matthias Maennich Signed-off-by: Masahiro Yamada Signed-off-by: Jessica Yu --- include/linux/export.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/export.h b/include/linux/export.h index 0695d4e847d9..621158ecd2e2 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -55,7 +55,7 @@ extern struct module __this_module; "__ksymtab_" #ns NS_SEPARATOR #sym ": \n" \ " .long " #sym "- . \n" \ " .long __kstrtab_" #sym "- . \n" \ - " .long __kstrtab_ns_" #sym "- . \n" \ + " .long __kstrtabns_" #sym "- . \n" \ " .previous \n") #define __KSYMTAB_ENTRY(sym, sec) \ @@ -79,7 +79,7 @@ struct kernel_symbol { asm("__ksymtab_" #ns NS_SEPARATOR #sym) \ __attribute__((section("___ksymtab" sec "+" #sym), used)) \ __aligned(sizeof(void *)) \ - = { (unsigned long)&sym, __kstrtab_##sym, __kstrtab_ns_##sym } + = { (unsigned long)&sym, __kstrtab_##sym, __kstrtabns_##sym } #define __KSYMTAB_ENTRY(sym, sec) \ static const struct kernel_symbol __ksymtab_##sym \ @@ -112,7 +112,7 @@ struct kernel_symbol { /* For every exported symbol, place a struct in the __ksymtab section */ #define ___EXPORT_SYMBOL_NS(sym, sec, ns) \ ___export_symbol_common(sym, sec); \ - static const char __kstrtab_ns_##sym[] \ + static const char __kstrtabns_##sym[] \ __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ = #ns; \ __KSYMTAB_ENTRY_NS(sym, sec, ns) -- cgit From c512c69187197fe08026cb5bbe7b9709f4f89b73 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 7 Oct 2019 12:56:48 -0700 Subject: uaccess: implement a proper unsafe_copy_to_user() and switch filldir over to it In commit 9f79b78ef744 ("Convert filldir[64]() from __put_user() to unsafe_put_user()") I made filldir() use unsafe_put_user(), which improves code generation on x86 enormously. But because we didn't have a "unsafe_copy_to_user()", the dirent name copy was also done by hand with unsafe_put_user() in a loop, and it turns out that a lot of other architectures didn't like that, because unlike x86, they have various alignment issues. Most non-x86 architectures trap and fix it up, and some (like xtensa) will just fail unaligned put_user() accesses unconditionally. Which makes that "copy using put_user() in a loop" not work for them at all. I could make that code do explicit alignment etc, but the architectures that don't like unaligned accesses also don't really use the fancy "user_access_begin/end()" model, so they might just use the regular old __copy_to_user() interface. So this commit takes that looping implementation, turns it into the x86 version of "unsafe_copy_to_user()", and makes other architectures implement the unsafe copy version as __copy_to_user() (the same way they do for the other unsafe_xyz() accessor functions). Note that it only does this for the copying _to_ user space, and we still don't have a unsafe version of copy_from_user(). That's partly because we have no current users of it, but also partly because the copy_from_user() case is slightly different and cannot efficiently be implemented in terms of a unsafe_get_user() loop (because gcc can't do asm goto with outputs). It would be trivial to do this using "rep movsb", which would work really nicely on newer x86 cores, but really badly on some older ones. Al Viro is looking at cleaning up all our user copy routines to make this all a non-issue, but for now we have this simple-but-stupid version for x86 that works fine for the dirent name copy case because those names are short strings and we simply don't need anything fancier. Fixes: 9f79b78ef744 ("Convert filldir[64]() from __put_user() to unsafe_put_user()") Reported-by: Guenter Roeck Reported-and-tested-by: Tony Luck Cc: Al Viro Cc: Max Filippov Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index e47d0522a1f4..d4ee6e942562 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -355,8 +355,10 @@ extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) -#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) +#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) +#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) +#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) +#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif -- cgit From 08d1d0e6d0a00c6e687201774f3bf61177741e80 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 6 Oct 2019 17:58:15 -0700 Subject: memcg: only record foreign writebacks with dirty pages when memcg is not disabled In kdump kernel, memcg usually is disabled with 'cgroup_disable=memory' for saving memory. Now kdump kernel will always panic when dump vmcore to local disk: BUG: kernel NULL pointer dereference, address: 0000000000000ab8 Oops: 0000 [#1] SMP NOPTI CPU: 0 PID: 598 Comm: makedumpfile Not tainted 5.3.0+ #26 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 10/02/2018 RIP: 0010:mem_cgroup_track_foreign_dirty_slowpath+0x38/0x140 Call Trace: __set_page_dirty+0x52/0xc0 iomap_set_page_dirty+0x50/0x90 iomap_write_end+0x6e/0x270 iomap_write_actor+0xce/0x170 iomap_apply+0xba/0x11e iomap_file_buffered_write+0x62/0x90 xfs_file_buffered_aio_write+0xca/0x320 [xfs] new_sync_write+0x12d/0x1d0 vfs_write+0xa5/0x1a0 ksys_write+0x59/0xd0 do_syscall_64+0x59/0x1e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 And this will corrupt the 1st kernel too with 'cgroup_disable=memory'. Via the trace and with debugging, it is pointing to commit 97b27821b485 ("writeback, memcg: Implement foreign dirty flushing") which introduced this regression. Disabling memcg causes the null pointer dereference at uninitialized data in function mem_cgroup_track_foreign_dirty_slowpath(). Fix it by returning directly if memcg is disabled, but not trying to record the foreign writebacks with dirty pages. Link: http://lkml.kernel.org/r/20190924141928.GD31919@MiWiFi-R3L-srv Fixes: 97b27821b485 ("writeback, memcg: Implement foreign dirty flushing") Signed-off-by: Baoquan He Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jan Kara Cc: Tejun Heo Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9b60863429cc..98380779f6d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1264,6 +1264,9 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, static inline void mem_cgroup_track_foreign_dirty(struct page *page, struct bdi_writeback *wb) { + if (mem_cgroup_disabled()) + return; + if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } -- cgit From 9783aa9917f8ae24759e67bf882f1aba32fe4ea1 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:32 -0700 Subject: mm, memcg: proportional memory.{low,min} reclaim cgroup v2 introduces two memory protection thresholds: memory.low (best-effort) and memory.min (hard protection). While they generally do what they say on the tin, there is a limitation in their implementation that makes them difficult to use effectively: that cliff behaviour often manifests when they become eligible for reclaim. This patch implements more intuitive and usable behaviour, where we gradually mount more reclaim pressure as cgroups further and further exceed their protection thresholds. This cliff edge behaviour happens because we only choose whether or not to reclaim based on whether the memcg is within its protection limits (see the use of mem_cgroup_protected in shrink_node), but we don't vary our reclaim behaviour based on this information. Imagine the following timeline, with the numbers the lruvec size in this zone: 1. memory.low=1000000, memory.current=999999. 0 pages may be scanned. 2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned. 3. memory.low=1000000, memory.current=1000001. 1000001* pages may be scanned. (?!) * Of course, we won't usually scan all available pages in the zone even without this patch because of scan control priority, over-reclaim protection, etc. However, as shown by the tests at the end, these techniques don't sufficiently throttle such an extreme change in input, so cliff-like behaviour isn't really averted by their existence alone. Here's an example of how this plays out in practice. At Facebook, we are trying to protect various workloads from "system" software, like configuration management tools, metric collectors, etc (see this[0] case study). In order to find a suitable memory.low value, we start by determining the expected memory range within which the workload will be comfortable operating. This isn't an exact science -- memory usage deemed "comfortable" will vary over time due to user behaviour, differences in composition of work, etc, etc. As such we need to ballpark memory.low, but doing this is currently problematic: 1. If we end up setting it too low for the workload, it won't have *any* effect (see discussion above). The group will receive the full weight of reclaim and won't have any priority while competing with the less important system software, as if we had no memory.low configured at all. 2. Because of this behaviour, we end up erring on the side of setting it too high, such that the comfort range is reliably covered. However, protected memory is completely unavailable to the rest of the system, so we might cause undue memory and IO pressure there when we *know* we have some elasticity in the workload. 3. Even if we get the value totally right, smack in the middle of the comfort zone, we get extreme jumps between no pressure and full pressure that cause unpredictable pressure spikes in the workload due to the current binary reclaim behaviour. With this patch, we can set it to our ballpark estimation without too much worry. Any undesirable behaviour, such as too much or too little reclaim pressure on the workload or system will be proportional to how far our estimation is off. This means we can set memory.low much more conservatively and thus waste less resources *without* the risk of the workload falling off a cliff if we overshoot. As a more abstract technical description, this unintuitive behaviour results in having to give high-priority workloads a large protection buffer on top of their expected usage to function reliably, as otherwise we have abrupt periods of dramatically increased memory pressure which hamper performance. Having to set these thresholds so high wastes resources and generally works against the principle of work conservation. In addition, having proportional memory reclaim behaviour has other benefits. Most notably, before this patch it's basically mandatory to set memory.low to a higher than desirable value because otherwise as soon as you exceed memory.low, all protection is lost, and all pages are eligible to scan again. By contrast, having a gradual ramp in reclaim pressure means that you now still get some protection when thresholds are exceeded, which means that one can now be more comfortable setting memory.low to lower values without worrying that all protection will be lost. This is important because workingset size is really hard to know exactly, especially with variable workloads, so at least getting *some* protection if your workingset size grows larger than you expect increases user confidence in setting memory.low without a huge buffer on top being needed. Thanks a lot to Johannes Weiner and Tejun Heo for their advice and assistance in thinking about how to make this work better. In testing these changes, I intended to verify that: 1. Changes in page scanning become gradual and proportional instead of binary. To test this, I experimented stepping further and further down memory.low protection on a workload that floats around 19G workingset when under memory.low protection, watching page scan rates for the workload cgroup: +------------+-----------------+--------------------+--------------+ | memory.low | test (pgscan/s) | control (pgscan/s) | % of control | +------------+-----------------+--------------------+--------------+ | 21G | 0 | 0 | N/A | | 17G | 867 | 3799 | 23% | | 12G | 1203 | 3543 | 34% | | 8G | 2534 | 3979 | 64% | | 4G | 3980 | 4147 | 96% | | 0 | 3799 | 3980 | 95% | +------------+-----------------+--------------------+--------------+ As you can see, the test kernel (with a kernel containing this patch) ramps up page scanning significantly more gradually than the control kernel (without this patch). 2. More gradual ramp up in reclaim aggression doesn't result in premature OOMs. To test this, I wrote a script that slowly increments the number of pages held by stress(1)'s --vm-keep mode until a production system entered severe overall memory contention. This script runs in a highly protected slice taking up the majority of available system memory. Watching vmstat revealed that page scanning continued essentially nominally between test and control, without causing forward reclaim progress to become arrested. [0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project [akpm@linux-foundation.org: reflow block comments to fit in 80 cols] [chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection] Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 98380779f6d5..fa9ba2edf7e0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,6 +356,14 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return 0; + + return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow)); +} + enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg); @@ -537,6 +545,8 @@ void mem_cgroup_handle_over_high(void); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); +unsigned long mem_cgroup_size(struct mem_cgroup *memcg); + void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); @@ -829,6 +839,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +{ + return 0; +} + static inline enum mem_cgroup_protection mem_cgroup_protected( struct mem_cgroup *root, struct mem_cgroup *memcg) { @@ -968,6 +983,11 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return 0; } +static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return 0; +} + static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { -- cgit From 9de7ca46ad2688bd51e80f7119fefa301ad7f3fa Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:35 -0700 Subject: mm, memcg: make memory.emin the baseline for utilisation determination Roman points out that when when we do the low reclaim pass, we scale the reclaim pressure relative to position between 0 and the maximum protection threshold. However, if the maximum protection is based on memory.elow, and memory.emin is above zero, this means we still may get binary behaviour on second-pass low reclaim. This is because we scale starting at 0, not starting at memory.emin, and since we don't scan at all below emin, we end up with cliff behaviour. This should be a fairly uncommon case since usually we don't go into the second pass, but it makes sense to scale our low reclaim pressure starting at emin. You can test this by catting two large sparse files, one in a cgroup with emin set to some moderate size compared to physical RAM, and another cgroup without any emin. In both cgroups, set an elow larger than 50% of physical RAM. The one with emin will have less page scanning, as reclaim pressure is lower. Rebase on top of and apply the same idea as what was applied to handle cgroup_memory=disable properly for the original proportional patch http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name ("mm, memcg: Handle cgroup_disable=memory when getting memcg protection"). Link: http://lkml.kernel.org/r/20190201051810.GA18895@chrisdown.name Signed-off-by: Chris Down Suggested-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fa9ba2edf7e0..1cbad1248e5a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,12 +356,17 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +static inline void mem_cgroup_protection(struct mem_cgroup *memcg, + unsigned long *min, unsigned long *low) { - if (mem_cgroup_disabled()) - return 0; + if (mem_cgroup_disabled()) { + *min = 0; + *low = 0; + return; + } - return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow)); + *min = READ_ONCE(memcg->memory.emin); + *low = READ_ONCE(memcg->memory.elow); } enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, @@ -839,9 +844,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +static inline void mem_cgroup_protection(struct mem_cgroup *memcg, + unsigned long *min, unsigned long *low) { - return 0; + *min = 0; + *low = 0; } static inline enum mem_cgroup_protection mem_cgroup_protected( -- cgit From 1bc63fb1272be0773e925f78c0fbd06c89701d55 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:38 -0700 Subject: mm, memcg: make scan aggression always exclude protection This patch is an incremental improvement on the existing memory.{low,min} relative reclaim work to base its scan pressure calculations on how much protection is available compared to the current usage, rather than how much the current usage is over some protection threshold. This change doesn't change the experience for the user in the normal case too much. One benefit is that it replaces the (somewhat arbitrary) 100% cutoff with an indefinite slope, which makes it easier to ballpark a memory.low value. As well as this, the old methodology doesn't quite apply generically to machines with varying amounts of physical memory. Let's say we have a top level cgroup, workload.slice, and another top level cgroup, system-management.slice. We want to roughly give 12G to system-management.slice, so on a 32GB machine we set memory.low to 20GB in workload.slice, and on a 64GB machine we set memory.low to 52GB. However, because these are relative amounts to the total machine size, while the amount of memory we want to generally be willing to yield to system.slice is absolute (12G), we end up putting more pressure on system.slice just because we have a larger machine and a larger workload to fill it, which seems fairly unintuitive. With this new behaviour, we don't end up with this unintended side effect. Previously the way that memory.low protection works is that if you are 50% over a certain baseline, you get 50% of your normal scan pressure. This is certainly better than the previous cliff-edge behaviour, but it can be improved even further by always considering memory under the currently enforced protection threshold to be out of bounds. This means that we can set relatively low memory.low thresholds for variable or bursty workloads while still getting a reasonable level of protection, whereas with the previous version we may still trivially hit the 100% clamp. The previous 100% clamp is also somewhat arbitrary, whereas this one is more concretely based on the currently enforced protection threshold, which is likely easier to reason about. There is also a subtle issue with the way that proportional reclaim worked previously -- it promotes having no memory.low, since it makes pressure higher during low reclaim. This happens because we base our scan pressure modulation on how far memory.current is between memory.min and memory.low, but if memory.low is unset, we only use the overage method. In most cromulent configurations, this then means that we end up with *more* pressure than with no memory.low at all when we're in low reclaim, which is not really very usable or expected. With this patch, memory.low and memory.min affect reclaim pressure in a more understandable and composable way. For example, from a user standpoint, "protected" memory now remains untouchable from a reclaim aggression standpoint, and users can also have more confidence that bursty workloads will still receive some amount of guaranteed protection. Link: http://lkml.kernel.org/r/20190322160307.GA3316@chrisdown.name Signed-off-by: Chris Down Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1cbad1248e5a..ae703ea3ef48 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,17 +356,17 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline void mem_cgroup_protection(struct mem_cgroup *memcg, - unsigned long *min, unsigned long *low) +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, + bool in_low_reclaim) { - if (mem_cgroup_disabled()) { - *min = 0; - *low = 0; - return; - } + if (mem_cgroup_disabled()) + return 0; + + if (in_low_reclaim) + return READ_ONCE(memcg->memory.emin); - *min = READ_ONCE(memcg->memory.emin); - *low = READ_ONCE(memcg->memory.elow); + return max(READ_ONCE(memcg->memory.emin), + READ_ONCE(memcg->memory.elow)); } enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, @@ -844,11 +844,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } -static inline void mem_cgroup_protection(struct mem_cgroup *memcg, - unsigned long *min, unsigned long *low) +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, + bool in_low_reclaim) { - *min = 0; - *low = 0; + return 0; } static inline enum mem_cgroup_protection mem_cgroup_protected( -- cgit From 59bb47985c1db229ccff8c5deebecd54fc77d2a9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 6 Oct 2019 17:58:45 -0700 Subject: mm, sl[aou]b: guarantee natural alignment for kmalloc(power-of-two) In most configurations, kmalloc() happens to return naturally aligned (i.e. aligned to the block size itself) blocks for power of two sizes. That means some kmalloc() users might unknowingly rely on that alignment, until stuff breaks when the kernel is built with e.g. CONFIG_SLUB_DEBUG or CONFIG_SLOB, and blocks stop being aligned. Then developers have to devise workaround such as own kmem caches with specified alignment [1], which is not always practical, as recently evidenced in [2]. The topic has been discussed at LSF/MM 2019 [3]. Adding a 'kmalloc_aligned()' variant would not help with code unknowingly relying on the implicit alignment. For slab implementations it would either require creating more kmalloc caches, or allocate a larger size and only give back part of it. That would be wasteful, especially with a generic alignment parameter (in contrast with a fixed alignment to size). Ideally we should provide to mm users what they need without difficult workarounds or own reimplementations, so let's make the kmalloc() alignment to size explicitly guaranteed for power-of-two sizes under all configurations. What this means for the three available allocators? * SLAB object layout happens to be mostly unchanged by the patch. The implicitly provided alignment could be compromised with CONFIG_DEBUG_SLAB due to redzoning, however SLAB disables redzoning for caches with alignment larger than unsigned long long. Practically on at least x86 this includes kmalloc caches as they use cache line alignment, which is larger than that. Still, this patch ensures alignment on all arches and cache sizes. * SLUB layout is also unchanged unless redzoning is enabled through CONFIG_SLUB_DEBUG and boot parameter for the particular kmalloc cache. With this patch, explicit alignment is guaranteed with redzoning as well. This will result in more memory being wasted, but that should be acceptable in a debugging scenario. * SLOB has no implicit alignment so this patch adds it explicitly for kmalloc(). The potential downside is increased fragmentation. While pathological allocation scenarios are certainly possible, in my testing, after booting a x86_64 kernel+userspace with virtme, around 16MB memory was consumed by slab pages both before and after the patch, with difference in the noise. [1] https://lore.kernel.org/linux-btrfs/c3157c8e8e0e7588312b40c853f65c02fe6c957a.1566399731.git.christophe.leroy@c-s.fr/ [2] https://lore.kernel.org/linux-fsdevel/20190225040904.5557-1-ming.lei@redhat.com/ [3] https://lwn.net/Articles/787740/ [akpm@linux-foundation.org: documentation fixlet, per Matthew] Link: http://lkml.kernel.org/r/20190826111627.7505-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Michal Hocko Acked-by: Kirill A. Shutemov Acked-by: Christoph Hellwig Cc: David Sterba Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Ming Lei Cc: Dave Chinner Cc: "Darrick J . Wong" Cc: Christoph Hellwig Cc: James Bottomley Cc: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index ab2b98ad76e1..4d2a2fa55ed5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -493,6 +493,10 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * kmalloc is the normal method of allocating memory * for objects smaller than page size in the kernel. * + * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN + * bytes. For @size of power of two bytes, the alignment is also guaranteed + * to be at least to the size. + * * The @flags argument may be one of the GFP flags defined at * include/linux/gfp.h and described at * :ref:`Documentation/core-api/mm-api.rst ` -- cgit From bec500777089b3c96c53681fc0aa6fee59711d4a Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Mon, 7 Oct 2019 18:00:02 -0400 Subject: lib/string: Make memzero_explicit() inline instead of external With the use of the barrier implied by barrier_data(), there is no need for memzero_explicit() to be extern. Making it inline saves the overhead of a function call, and allows the code to be reused in arch/*/purgatory without having to duplicate the implementation. Tested-by: Hans de Goede Signed-off-by: Arvind Sankar Reviewed-by: Hans de Goede Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: H . Peter Anvin Cc: Herbert Xu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephan Mueller Cc: Thomas Gleixner Cc: linux-crypto@vger.kernel.org Cc: linux-s390@vger.kernel.org Fixes: 906a4bb97f5d ("crypto: sha256 - Use get/put_unaligned_be32 to get input, memzero_explicit") Link: https://lkml.kernel.org/r/20191007220000.GA408752@rani.riverdale.lan Signed-off-by: Ingo Molnar --- include/linux/string.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index b2f9df7f0761..b6ccdc2c7f02 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -227,7 +227,26 @@ static inline bool strstarts(const char *str, const char *prefix) } size_t memweight(const void *ptr, size_t bytes); -void memzero_explicit(void *s, size_t count); + +/** + * memzero_explicit - Fill a region of memory (e.g. sensitive + * keying data) with 0s. + * @s: Pointer to the start of the area. + * @count: The size of the area. + * + * Note: usually using memset() is just fine (!), but in cases + * where clearing out _local_ data at the end of a scope is + * necessary, memzero_explicit() should be used instead in + * order to prevent the compiler from optimising away zeroing. + * + * memzero_explicit() doesn't need an arch-specific version as + * it just invokes the one of memset() implicitly. + */ +static inline void memzero_explicit(void *s, size_t count) +{ + memset(s, 0, count); + barrier_data(s); +} /** * kbasename - return the last part of a pathname. -- cgit From e3f1271474182682638654021123b94e6ec1626b Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 2 Oct 2019 07:40:42 -0500 Subject: leds: core: Fix leds.h structure documentation Update the leds.h structure documentation to define the correct arguments. Signed-off-by: Dan Murphy Signed-off-by: Jacek Anaszewski --- include/linux/leds.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/leds.h b/include/linux/leds.h index b8df71193329..efb309dba914 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -247,7 +247,7 @@ extern void led_set_brightness(struct led_classdev *led_cdev, /** * led_set_brightness_sync - set LED brightness synchronously * @led_cdev: the LED to set - * @brightness: the brightness to set it to + * @value: the brightness to set it to * * Set an LED's brightness immediately. This function will block * the caller for the time required for accessing device registers, @@ -301,8 +301,7 @@ extern void led_sysfs_enable(struct led_classdev *led_cdev); /** * led_compose_name - compose LED class device name * @dev: LED controller device object - * @child: child fwnode_handle describing a LED or a group of synchronized LEDs; - * it must be provided only for fwnode based LEDs + * @init_data: the LED class device initialization data * @led_classdev_name: composed LED class device name * * Create LED class device name basing on the provided init_data argument. -- cgit From af84537dbd1b39505d1f3d8023029b4a59666513 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 2 Oct 2019 10:40:55 -0400 Subject: SUNRPC: fix race to sk_err after xs_error_report Since commit 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") there has been a race to the value of the sk_err if both XPRT_SOCK_WAKE_ERROR and XPRT_SOCK_WAKE_DISCONNECT are set. In that case, we may end up losing the sk_err value that existed when xs_error_report was called. Fix this by reverting to the previous behavior: instead of using SO_ERROR to retrieve the value at a later time (which might also return sk_err_soft), copy the sk_err value onto struct sock_xprt, and use that value to wake pending tasks. Signed-off-by: Benjamin Coddington Fixes: 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 7638dbe7bc50..a940de03808d 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -61,6 +61,7 @@ struct sock_xprt { struct mutex recv_mutex; struct sockaddr_storage srcaddr; unsigned short srcport; + int xprt_err; /* * UDP socket buffer size parameters -- cgit From 294f69e662d1570703e9b56e95be37a9fd3afba5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 5 Oct 2019 09:46:42 -0700 Subject: compiler_attributes.h: Add 'fallthrough' pseudo keyword for switch/case use Reserve the pseudo keyword 'fallthrough' for the ability to convert the various case block /* fallthrough */ style comments to appear to be an actual reserved word with the same gcc case block missing fallthrough warning capability. All switch/case blocks now should end in one of: break; fallthrough; goto