From 65e68edce0db433aa0c2b26d7dc14fbbbeb89fbb Mon Sep 17 00:00:00 2001
From: Marta Rybczynska <mrybczyn@kalray.eu>
Date: Tue, 24 Sep 2019 15:14:52 +0200
Subject: nvme: allow 64-bit results in passthru commands

It is not possible to get 64-bit results from the passthru commands,
what prevents from getting for the Capabilities (CAP) property value.

As a result, it is not possible to implement IOL's NVMe Conformance
test 4.3 Case 1 for Fabrics targets [1] (page 123).

This issue has been already discussed [2], but without a solution.

This patch solves the problem by adding new ioctls with a new
passthru structure, including 64-bit results. The older ioctls stay
unchanged.

[1] https://www.iol.unh.edu/sites/default/files/testsuites/nvme/UNH-IOL_NVMe_Conformance_Test_Suite_v11.0.pdf
[2] http://lists.infradead.org/pipermail/linux-nvme/2018-June/018791.html

Signed-off-by: Marta Rybczynska <marta.rybczynska@kalray.eu>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 include/uapi/linux/nvme_ioctl.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h
index 1c215ea1798e..e168dc59e9a0 100644
--- a/include/uapi/linux/nvme_ioctl.h
+++ b/include/uapi/linux/nvme_ioctl.h
@@ -45,6 +45,27 @@ struct nvme_passthru_cmd {
 	__u32	result;
 };
 
+struct nvme_passthru_cmd64 {
+	__u8	opcode;
+	__u8	flags;
+	__u16	rsvd1;
+	__u32	nsid;
+	__u32	cdw2;
+	__u32	cdw3;
+	__u64	metadata;
+	__u64	addr;
+	__u32	metadata_len;
+	__u32	data_len;
+	__u32	cdw10;
+	__u32	cdw11;
+	__u32	cdw12;
+	__u32	cdw13;
+	__u32	cdw14;
+	__u32	cdw15;
+	__u32	timeout_ms;
+	__u64	result;
+};
+
 #define nvme_admin_cmd nvme_passthru_cmd
 
 #define NVME_IOCTL_ID		_IO('N', 0x40)
@@ -54,5 +75,7 @@ struct nvme_passthru_cmd {
 #define NVME_IOCTL_RESET	_IO('N', 0x44)
 #define NVME_IOCTL_SUBSYS_RESET	_IO('N', 0x45)
 #define NVME_IOCTL_RESCAN	_IO('N', 0x46)
+#define NVME_IOCTL_ADMIN64_CMD	_IOWR('N', 0x47, struct nvme_passthru_cmd64)
+#define NVME_IOCTL_IO64_CMD	_IOWR('N', 0x48, struct nvme_passthru_cmd64)
 
 #endif /* _UAPI_LINUX_NVME_IOCTL_H */
-- 
cgit 


From 833b45de69a6016c4b0cebe6765d526a31a81580 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 30 Sep 2019 18:48:44 +0200
Subject: kvm: x86, powerpc: do not allow clearing largepages debugfs entry

The largepages debugfs entry is incremented/decremented as shadow
pages are created or destroyed.  Clearing it will result in an
underflow, which is harmless to KVM but ugly (and could be
misinterpreted by tools that use debugfs information), so make
this particular statistic read-only.

Cc: kvm-ppc@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fcb46b3374c6..719fc3e15ea4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1090,6 +1090,7 @@ enum kvm_stat_kind {
 
 struct kvm_stat_data {
 	int offset;
+	int mode;
 	struct kvm *kvm;
 };
 
@@ -1097,6 +1098,7 @@ struct kvm_stats_debugfs_item {
 	const char *name;
 	int offset;
 	enum kvm_stat_kind kind;
+	int mode;
 };
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
-- 
cgit 


From 61129dd29f7962f278b618a2a3e8fdb986a66dc8 Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 17 Sep 2019 09:18:53 +0200
Subject: sched: Add __ASSEMBLY__ guards around struct clone_args

The addition of struct clone_args to uapi/linux/sched.h is not protected
by __ASSEMBLY__ guards, causing a failure to build from source for glibc
on RISC-V. Add the guards to fix this.

Fixes: 7f192e3cd316 ("fork: add clone3")
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Cc: <stable@vger.kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20190917071853.12385-1-seth.forshee@canonical.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/uapi/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index b3105ac1381a..851ff1feadd5 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -33,6 +33,7 @@
 #define CLONE_NEWNET		0x40000000	/* New network namespace */
 #define CLONE_IO		0x80000000	/* Clone io context */
 
+#ifndef __ASSEMBLY__
 /*
  * Arguments for the clone3 syscall
  */
@@ -46,6 +47,7 @@ struct clone_args {
 	__aligned_u64 stack_size;
 	__aligned_u64 tls;
 };
+#endif
 
 /*
  * Scheduling policies
-- 
cgit 


From f5a1a536fa14895ccff4e94e6a5af90901ce86aa Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 1 Oct 2019 11:10:52 +1000
Subject: lib: introduce copy_struct_from_user() helper

A common pattern for syscall extensions is increasing the size of a
struct passed from userspace, such that the zero-value of the new fields
result in the old kernel behaviour (allowing for a mix of userspace and
kernel vintages to operate on one another in most cases).

While this interface exists for communication in both directions, only
one interface is straightforward to have reasonable semantics for
(userspace passing a struct to the kernel). For kernel returns to
userspace, what the correct semantics are (whether there should be an
error if userspace is unaware of a new extension) is very
syscall-dependent and thus probably cannot be unified between syscalls
(a good example of this problem is [1]).

Previously there was no common lib/ function that implemented
the necessary extension-checking semantics (and different syscalls
implemented them slightly differently or incompletely[2]). Future
patches replace common uses of this pattern to make use of
copy_struct_from_user().

Some in-kernel selftests that insure that the handling of alignment and
various byte patterns are all handled identically to memchr_inv() usage.

[1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and
     robustify sched_read_attr() ABI logic and code")

[2]: For instance {sched_setattr,perf_event_open,clone3}(2) all do do
     similar checks to copy_struct_from_user() while rt_sigprocmask(2)
     always rejects differently-sized struct arguments.

Suggested-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Link: https://lore.kernel.org/r/20191001011055.19283-2-cyphar@cyphar.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/bitops.h  |  7 +++++
 include/linux/uaccess.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

(limited to 'include')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf074bce3eb3..c94a9ff9f082 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -4,6 +4,13 @@
 #include <asm/types.h>
 #include <linux/bits.h>
 
+/* Set bits in the first 'n' bytes when loaded from memory */
+#ifdef __LITTLE_ENDIAN
+#  define aligned_byte_mask(n) ((1UL << 8*(n))-1)
+#else
+#  define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n)))
+#endif
+
 #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
 #define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
 
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 70bbdc38dc37..e47d0522a1f4 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -231,6 +231,76 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
 
 #endif		/* ARCH_HAS_NOCACHE_UACCESS */
 
+extern __must_check int check_zeroed_user(const void __user *from, size_t size);
+
+/**
+ * copy_struct_from_user: copy a struct from userspace
+ * @dst:   Destination address, in kernel space. This buffer must be @ksize
+ *         bytes long.
+ * @ksize: Size of @dst struct.
+ * @src:   Source address, in userspace.
+ * @usize: (Alleged) size of @src struct.
+ *
+ * Copies a struct from userspace to kernel space, in a way that guarantees
+ * backwards-compatibility for struct syscall arguments (as long as future
+ * struct extensions are made such that all new fields are *appended* to the
+ * old struct, and zeroed-out new fields have the same meaning as the old
+ * struct).
+ *
+ * @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
+ * The recommended usage is something like the following:
+ *
+ *   SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
+ *   {
+ *      int err;
+ *      struct foo karg = {};
+ *
+ *      if (usize > PAGE_SIZE)
+ *        return -E2BIG;
+ *      if (usize < FOO_SIZE_VER0)
+ *        return -EINVAL;
+ *
+ *      err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
+ *      if (err)
+ *        return err;
+ *
+ *      // ...
+ *   }
+ *
+ * There are three cases to consider:
+ *  * If @usize == @ksize, then it's copied verbatim.
+ *  * If @usize < @ksize, then the userspace has passed an old struct to a
+ *    newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
+ *    are to be zero-filled.
+ *  * If @usize > @ksize, then the userspace has passed a new struct to an
+ *    older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
+ *    are checked to ensure they are zeroed, otherwise -E2BIG is returned.
+ *
+ * Returns (in all cases, some data may have been copied):
+ *  * -E2BIG:  (@usize > @ksize) and there are non-zero trailing bytes in @src.
+ *  * -EFAULT: access to userspace failed.
+ */
+static __always_inline __must_check int
+copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
+		      size_t usize)
+{
+	size_t size = min(ksize, usize);
+	size_t rest = max(ksize, usize) - size;
+
+	/* Deal with trailing bytes. */
+	if (usize < ksize) {
+		memset(dst + size, 0, rest);
+	} else if (usize > ksize) {
+		int ret = check_zeroed_user(src + size, rest);
+		if (ret <= 0)
+			return ret ?: -E2BIG;
+	}
+	/* Copy the interoperable parts of the struct. */
+	if (copy_from_user(dst, src, size))
+		return -EFAULT;
+	return 0;
+}
+
 /*
  * probe_kernel_read(): safely attempt to read from a location
  * @dst: pointer to the buffer that shall take the data
-- 
cgit 


From f14c234b4bc5184fd40d9a47830e5b32c3b36d49 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 1 Oct 2019 11:10:53 +1000
Subject: clone3: switch to copy_struct_from_user()

Switch clone3() syscall from it's own copying struct clone_args from
userspace to the new dedicated copy_struct_from_user() helper.

The change is very straightforward, and helps unify the syscall
interface for struct-from-userspace syscalls. Additionally, explicitly
define CLONE_ARGS_SIZE_VER0 to match the other users of the
struct-extension pattern.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
[christian.brauner@ubuntu.com: improve commit message]
Link: https://lore.kernel.org/r/20191001011055.19283-3-cyphar@cyphar.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/uapi/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index b3105ac1381a..0945805982b4 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -47,6 +47,8 @@ struct clone_args {
 	__aligned_u64 tls;
 };
 
+#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
+
 /*
  * Scheduling policies
  */
-- 
cgit 


From 895b5c9f206eb7d25dc1360a8ccfc5958895eb89 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 29 Sep 2019 20:54:03 +0200
Subject: netfilter: drop bridge nf reset from nf_reset

commit 174e23810cd31
("sk_buff: drop all skb extensions on free and skb scrubbing") made napi
recycle always drop skb extensions.  The additional skb_ext_del() that is
performed via nf_reset on napi skb recycle is not needed anymore.

Most nf_reset() calls in the stack are there so queued skb won't block
'rmmod nf_conntrack' indefinitely.

This removes the skb_ext_del from nf_reset, and renames it to a more
fitting nf_reset_ct().

In a few selected places, add a call to skb_ext_reset to make sure that
no active extensions remain.

I am submitting this for "net", because we're still early in the release
cycle.  The patch applies to net-next too, but I think the rename causes
needless divergence between those trees.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e7d3b1a513ef..4351577b14d7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4160,15 +4160,12 @@ static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
 static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
 #endif /* CONFIG_SKB_EXTENSIONS */
 
-static inline void nf_reset(struct sk_buff *skb)
+static inline void nf_reset_ct(struct sk_buff *skb)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	nf_conntrack_put(skb_nfct(skb));
 	skb->_nfct = 0;
 #endif
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	skb_ext_del(skb, SKB_EXT_BRIDGE_NF);
-#endif
 }
 
 static inline void nf_reset_trace(struct sk_buff *skb)
-- 
cgit 


From 30945d31e5761436d9eba6b8cff468a5f7c9c266 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 24 Sep 2019 14:49:43 +0200
Subject: hwmon: Fix HWMON_P_MIN_ALARM mask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both HWMON_P_MIN_ALARM and HWMON_P_MAX_ALARM were using
BIT(hwmon_power_max_alarm).

Fixes: aa7f29b07c870 ("hwmon: Add support for power min, lcrit, min_alarm and lcrit_alarm")
CC: <stable@vger.kernel.org>
Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20190924124945.491326-2-nuno.sa@analog.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 include/linux/hwmon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index 04c36b7a61dd..72579168189d 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -235,7 +235,7 @@ enum hwmon_power_attributes {
 #define HWMON_P_LABEL			BIT(hwmon_power_label)
 #define HWMON_P_ALARM			BIT(hwmon_power_alarm)
 #define HWMON_P_CAP_ALARM		BIT(hwmon_power_cap_alarm)
-#define HWMON_P_MIN_ALARM		BIT(hwmon_power_max_alarm)
+#define HWMON_P_MIN_ALARM		BIT(hwmon_power_min_alarm)
 #define HWMON_P_MAX_ALARM		BIT(hwmon_power_max_alarm)
 #define HWMON_P_LCRIT_ALARM		BIT(hwmon_power_lcrit_alarm)
 #define HWMON_P_CRIT_ALARM		BIT(hwmon_power_crit_alarm)
-- 
cgit 


From 09515706857a7d5a2ffb5ce6a44c0bc7859a745b Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Tue, 1 Oct 2019 10:25:34 +0200
Subject: xen/efi: have a common runtime setup function

Today the EFI runtime functions are setup in architecture specific
code (x86 and arm), with the functions themselves living in drivers/xen
as they are not architecture dependent.

As the setup is exactly the same for arm and x86 move the setup to
drivers/xen, too. This at once removes the need to make the single
functions global visible.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
[boris: "Dropped EXPORT_SYMBOL_GPL(xen_efi_runtime_setup)"]
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 include/xen/xen-ops.h | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 98b30c1613b2..d89969aa9942 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -212,30 +212,7 @@ int xen_xlate_map_ballooned_pages(xen_pfn_t **pfns, void **vaddr,
 
 bool xen_running_on_version_or_later(unsigned int major, unsigned int minor);
 
-efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc);
-efi_status_t xen_efi_set_time(efi_time_t *tm);
-efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending,
-				     efi_time_t *tm);
-efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm);
-efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor,
-				  u32 *attr, unsigned long *data_size,
-				  void *data);
-efi_status_t xen_efi_get_next_variable(unsigned long *name_size,
-				       efi_char16_t *name, efi_guid_t *vendor);
-efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor,
-				  u32 attr, unsigned long data_size,
-				  void *data);
-efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space,
-					 u64 *remaining_space,
-					 u64 *max_variable_size);
-efi_status_t xen_efi_get_next_high_mono_count(u32 *count);
-efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules,
-				    unsigned long count, unsigned long sg_list);
-efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules,
-					unsigned long count, u64 *max_size,
-					int *reset_type);
-void xen_efi_reset_system(int reset_type, efi_status_t status,
-			  unsigned long data_size, efi_char16_t *data);
+void xen_efi_runtime_setup(void);
 
 
 #ifdef CONFIG_PREEMPT
-- 
cgit 


From 3e8db7e56082156a37b71d7334860c10fcea8025 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Tue, 1 Oct 2019 21:58:19 +0300
Subject: net: dsa: sja1105: Fix sleeping while atomic in .port_hwtstamp_set

Currently this stack trace can be seen with CONFIG_DEBUG_ATOMIC_SLEEP=y:

[   41.568348] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909
[   41.576757] in_atomic(): 1, irqs_disabled(): 0, pid: 208, name: ptp4l
[   41.583212] INFO: lockdep is turned off.
[   41.587123] CPU: 1 PID: 208 Comm: ptp4l Not tainted 5.3.0-rc6-01445-ge950f2d4bc7f-dirty #1827
[   41.599873] [<c0313d7c>] (unwind_backtrace) from [<c030e13c>] (show_stack+0x10/0x14)
[   41.607584] [<c030e13c>] (show_stack) from [<c1212d50>] (dump_stack+0xd4/0x100)
[   41.614863] [<c1212d50>] (dump_stack) from [<c037dfc8>] (___might_sleep+0x1c8/0x2b4)
[   41.622574] [<c037dfc8>] (___might_sleep) from [<c122ea90>] (__mutex_lock+0x48/0xab8)
[   41.630368] [<c122ea90>] (__mutex_lock) from [<c122f51c>] (mutex_lock_nested+0x1c/0x24)
[   41.638340] [<c122f51c>] (mutex_lock_nested) from [<c0c6fe08>] (sja1105_static_config_reload+0x30/0x27c)
[   41.647779] [<c0c6fe08>] (sja1105_static_config_reload) from [<c0c7015c>] (sja1105_hwtstamp_set+0x108/0x1cc)
[   41.657562] [<c0c7015c>] (sja1105_hwtstamp_set) from [<c0feb650>] (dev_ifsioc+0x18c/0x330)
[   41.665788] [<c0feb650>] (dev_ifsioc) from [<c0febbd8>] (dev_ioctl+0x320/0x6e8)
[   41.673064] [<c0febbd8>] (dev_ioctl) from [<c0f8b1f4>] (sock_ioctl+0x334/0x5e8)
[   41.680340] [<c0f8b1f4>] (sock_ioctl) from [<c05404a8>] (do_vfs_ioctl+0xb0/0xa10)
[   41.687789] [<c05404a8>] (do_vfs_ioctl) from [<c0540e3c>] (ksys_ioctl+0x34/0x58)
[   41.695151] [<c0540e3c>] (ksys_ioctl) from [<c0301000>] (ret_fast_syscall+0x0/0x28)
[   41.702768] Exception stack(0xe8495fa8 to 0xe8495ff0)
[   41.707796] 5fa0:                   beff4a8c 00000001 00000011 000089b0 beff4a8c beff4a80
[   41.715933] 5fc0: beff4a8c 00000001 0000000c 00000036 b6fa98c8 004e19c1 00000001 00000000
[   41.724069] 5fe0: 004dcedc beff4a6c 004c0738 b6e7af4c
[   41.729860] BUG: scheduling while atomic: ptp4l/208/0x00000002
[   41.735682] INFO: lockdep is turned off.

Enabling RX timestamping will logically disturb the fastpath (processing
of meta frames). Replace bool hwts_rx_en with a bit that is checked
atomically from the fastpath and temporarily unset from the sleepable
context during a change of the RX timestamping process (a destructive
operation anyways, requires switch reset).
If found unset, the fastpath (net/dsa/tag_sja1105.c) will just drop any
received meta frame and not take the meta_lock at all.

Fixes: a602afd200f5 ("net: dsa: sja1105: Expose PTP timestamping ioctls to userspace")
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 79435cfc20eb..897e799dbcb9 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -31,6 +31,8 @@
 #define SJA1105_META_SMAC			0x222222222222ull
 #define SJA1105_META_DMAC			0x0180C200000Eull
 
+#define SJA1105_HWTS_RX_EN			0
+
 /* Global tagger data: each struct sja1105_port has a reference to
  * the structure defined in struct sja1105_private.
  */
@@ -42,7 +44,7 @@ struct sja1105_tagger_data {
 	 * from taggers running on multiple ports on SMP systems
 	 */
 	spinlock_t meta_lock;
-	bool hwts_rx_en;
+	unsigned long state;
 };
 
 struct sja1105_skb_cb {
-- 
cgit 


From 815fb4c9d7da862a47c9d2a9765a4759826c5783 Mon Sep 17 00:00:00 2001
From: Marek Olšák <marek.olsak@amd.com>
Date: Tue, 24 Sep 2019 17:53:25 -0400
Subject: drm/amdgpu: return tcc_disabled_mask to userspace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

UMDs need this for correct programming of harvested chips.

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/uapi/drm/amdgpu_drm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index c99b4f2482c6..4fe35d600ab8 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -1003,6 +1003,8 @@ struct drm_amdgpu_info_device {
 	__u64 high_va_max;
 	/* gfx10 pa_sc_tile_steering_override */
 	__u32 pa_sc_tile_steering_override;
+	/* disabled TCCs */
+	__u64 tcc_disabled_mask;
 };
 
 struct drm_amdgpu_info_hw_ip {
-- 
cgit 


From 3a4b46c3bc73240b31cd324b3551fc4231d9f1d5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Sat, 20 Jul 2019 21:05:25 +0900
Subject: block: pg: add header include guard

Add a header include guard just in case.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/pg.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/pg.h b/include/uapi/linux/pg.h
index 364c350e85cd..62b6f69bd9fb 100644
--- a/include/uapi/linux/pg.h
+++ b/include/uapi/linux/pg.h
@@ -35,6 +35,9 @@
 
 */
 
+#ifndef _UAPI_LINUX_PG_H
+#define _UAPI_LINUX_PG_H
+
 #define PG_MAGIC	'P'
 #define PG_RESET	'Z'
 #define PG_COMMAND	'C'
@@ -61,4 +64,4 @@ struct pg_read_hdr {
 
 };
 
-/* end of pg.h */
+#endif /* _UAPI_LINUX_PG_H */
-- 
cgit 


From 78f6face5af344f12f4bd48b32faa6f499a06f36 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Fri, 27 Sep 2019 17:29:05 +0200
Subject: sched: add kernel-doc for struct clone_args

Add kernel-doc for struct clone_args for the clone3() syscall.

Link: https://lore.kernel.org/r/20191001114701.24661-3-christian.brauner@ubuntu.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/uapi/linux/sched.h | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 851ff1feadd5..3d8f03bfd428 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -34,8 +34,30 @@
 #define CLONE_IO		0x80000000	/* Clone io context */
 
 #ifndef __ASSEMBLY__
-/*
- * Arguments for the clone3 syscall
+/**
+ * struct clone_args - arguments for the clone3 syscall
+ * @flags:       Flags for the new process as listed above.
+ *               All flags are valid except for CSIGNAL and
+ *               CLONE_DETACHED.
+ * @pidfd:       If CLONE_PIDFD is set, a pidfd will be
+ *               returned in this argument.
+ * @child_tid:   If CLONE_CHILD_SETTID is set, the TID of the
+ *               child process will be returned in the child's
+ *               memory.
+ * @parent_tid:  If CLONE_PARENT_SETTID is set, the TID of
+ *               the child process will be returned in the
+ *               parent's memory.
+ * @exit_signal: The exit_signal the parent process will be
+ *               sent when the child exits.
+ * @stack:       Specify the location of the stack for the
+ *               child process.
+ * @stack_size:  The size of the stack for the child process.
+ * @tls:         If CLONE_SETTLS is set, the tls descriptor
+ *               is set to tls.
+ *
+ * The structure is versioned by size and thus extensible.
+ * New struct members must go at the end of the struct and
+ * must be properly 64bit aligned.
  */
 struct clone_args {
 	__aligned_u64 flags;
-- 
cgit 


From 47934ef7f1883209120781b59d78eaf8b83e2fb7 Mon Sep 17 00:00:00 2001
From: Stefan-gabriel Mirea <stefan-gabriel.mirea@nxp.com>
Date: Fri, 4 Oct 2019 13:51:13 +0000
Subject: tty: serial: Fix PORT_LINFLEXUART definition

The port type macros should have different values for different devices.
Currently, PORT_LINFLEXUART conflicts with PORT_SUNIX.

Fixes: 09864c1cdf5c ("tty: serial: Add linflexuart driver for S32V234")
Signed-off-by: Stefan-Gabriel Mirea <stefan-gabriel.mirea@nxp.com>
Link: https://lore.kernel.org/r/20191004135058.18007-1-stefan-gabriel.mirea@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/serial_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index 0f4f87a6fd54..e7fe550b6038 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -291,6 +291,6 @@
 #define PORT_SUNIX	121
 
 /* Freescale Linflex UART */
-#define PORT_LINFLEXUART	121
+#define PORT_LINFLEXUART	122
 
 #endif /* _UAPILINUX_SERIAL_CORE_H */
-- 
cgit 


From db9b2e0af605e7c994784527abfd9276cabd718a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 3 Oct 2019 17:44:44 +0100
Subject: rxrpc: Fix rxrpc_recvmsg tracepoint

Fix the rxrpc_recvmsg tracepoint to handle being called with a NULL call
parameter.

Fixes: a25e21f0bcd2 ("rxrpc, afs: Use debug_ids rather than pointers in traces")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/rxrpc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index a13a62db3565..edc5c887a44c 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -1068,7 +1068,7 @@ TRACE_EVENT(rxrpc_recvmsg,
 			     ),
 
 	    TP_fast_assign(
-		    __entry->call = call->debug_id;
+		    __entry->call = call ? call->debug_id : 0;
 		    __entry->why = why;
 		    __entry->seq = seq;
 		    __entry->offset = offset;
-- 
cgit 


From 4cf6c57e61fee954f7b7685de31b80ec26843d27 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 4 Oct 2019 17:05:58 +0100
Subject: net: phy: fix write to mii-ctrl1000 register

When userspace writes to the MII_ADVERTISE register, we update phylib's
advertising mask and trigger a renegotiation.  However, writing to the
MII_CTRL1000 register, which contains the gigabit advertisement, does
neither.  This can lead to phylib's copy of the advertisement becoming
de-synced with the values in the PHY register set, which can result in
incorrect negotiation resolution.

Fixes: 5502b218e001 ("net: phy: use phy_resolve_aneg_linkmode in genphy_read_status")
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mii.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/mii.h b/include/linux/mii.h
index 5cd824c1c0ca..4ce8901a1af6 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -455,6 +455,15 @@ static inline void mii_lpa_mod_linkmode_lpa_t(unsigned long *lp_advertising,
 			 lp_advertising, lpa & LPA_LPACK);
 }
 
+static inline void mii_ctrl1000_mod_linkmode_adv_t(unsigned long *advertising,
+						   u32 ctrl1000)
+{
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising,
+			 ctrl1000 & ADVERTISE_1000HALF);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising,
+			 ctrl1000 & ADVERTISE_1000FULL);
+}
+
 /**
  * linkmode_adv_to_lcl_adv_t
  * @advertising:pointer to linkmode advertising
-- 
cgit 


From 8d3dc3ac9dd6801c732a72ca6979698c38451b4f Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 4 Oct 2019 17:06:04 +0100
Subject: net: phy: extract link partner advertisement reading

Move reading the link partner advertisement out of genphy_read_status()
into its own separate function.  This will allow re-use of this code by
PHY drivers that are able to read the resolved status from the PHY.

Tested-by: tinywrkb <tinywrkb@gmail.com>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index a7ecbe0e55aa..7abee820d05c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1076,6 +1076,7 @@ int genphy_config_eee_advert(struct phy_device *phydev);
 int __genphy_config_aneg(struct phy_device *phydev, bool changed);
 int genphy_aneg_done(struct phy_device *phydev);
 int genphy_update_link(struct phy_device *phydev);
+int genphy_read_lpa(struct phy_device *phydev);
 int genphy_read_status(struct phy_device *phydev);
 int genphy_suspend(struct phy_device *phydev);
 int genphy_resume(struct phy_device *phydev);
-- 
cgit 


From 2d880b8709c013d47472f85a9d42ea1aca3bce47 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 4 Oct 2019 17:06:09 +0100
Subject: net: phy: extract pause mode

Extract the update of phylib's software pause mode state from
genphy_read_status(), so that we can re-use this functionality with
PHYs that have alternative ways to read the negotiation results.

Tested-by: tinywrkb <tinywrkb@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 7abee820d05c..9a0e981df502 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -678,6 +678,7 @@ static inline bool phy_is_started(struct phy_device *phydev)
 	return phydev->state >= PHY_UP;
 }
 
+void phy_resolve_aneg_pause(struct phy_device *phydev);
 void phy_resolve_aneg_linkmode(struct phy_device *phydev);
 
 /**
-- 
cgit 


From f1da567f1dc1b55d178b8f2d0cfe8353858aac19 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sat, 5 Oct 2019 23:04:47 +0200
Subject: driver core: platform: Add platform_get_irq_byname_optional()

Some drivers (e.g dwc3) first try to get an IRQ byname and then fall
back to the one at index 0. In this case we do not want the error(s)
printed by platform_get_irq_byname(). This commit adds a new
platform_get_irq_byname_optional(), which does not print errors, for this.

While at it also improve the kdoc text for platform_get_irq_byname() a bit.

BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=205037
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20191005210449.3926-2-hdegoede@redhat.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/platform_device.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 1b5cec067533..f2688404d1cd 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -64,6 +64,8 @@ extern struct resource *platform_get_resource_byname(struct platform_device *,
 						     unsigned int,
 						     const char *);
 extern int platform_get_irq_byname(struct platform_device *, const char *);
+extern int platform_get_irq_byname_optional(struct platform_device *dev,
+					    const char *name);
 extern int platform_add_devices(struct platform_device **, int);
 
 struct platform_device_info {
-- 
cgit 


From 047d50aee341d940350897c85799e56ae57c3849 Mon Sep 17 00:00:00 2001
From: Peter Jones <pjones@redhat.com>
Date: Wed, 2 Oct 2019 18:59:00 +0200
Subject: efi/tpm: Don't access event->count when it isn't mapped

Some machines generate a lot of event log entries.  When we're
iterating over them, the code removes the old mapping and adds a
new one, so once we cross the page boundary we're unmapping the page
with the count on it.  Hilarity ensues.

This patch keeps the info from the header in local variables so we don't
need to access that page again or keep track of if it's mapped.

Tested-by: Lyude Paul <lyude@redhat.com>
Signed-off-by: Peter Jones <pjones@redhat.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Acked-by: Matthew Garrett <mjg59@google.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Ben Dooks <ben.dooks@codethink.co.uk>
Cc: Dave Young <dyoung@redhat.com>
Cc: Jerry Snitselaar <jsnitsel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Octavian Purdila <octavian.purdila@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Talbert <swt@techie.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Cc: linux-integrity@vger.kernel.org
Cc: stable@vger.kernel.org
Fixes: 44038bc514a2 ("tpm: Abstract crypto agile event size calculations")
Link: https://lkml.kernel.org/r/20191002165904.8819-4-ard.biesheuvel@linaro.org
[ Minor edits. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/tpm_eventlog.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index 63238c84dc0b..b50cc3adca18 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -170,6 +170,7 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
 	u16 halg;
 	int i;
 	int j;
+	u32 count, event_type;
 
 	marker = event;
 	marker_start = marker;
@@ -190,16 +191,22 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
 	}
 
 	event = (struct tcg_pcr_event2_head *)mapping;
+	/*
+	 * The loop below will unmap these fields if the log is larger than
+	 * one page, so save them here for reference:
+	 */
+	count = READ_ONCE(event->count);
+	event_type = READ_ONCE(event->event_type);
 
 	efispecid = (struct tcg_efi_specid_event_head *)event_header->event;
 
 	/* Check if event is malformed. */
-	if (event->count > efispecid->num_algs) {
+	if (count > efispecid->num_algs) {
 		size = 0;
 		goto out;
 	}
 
-	for (i = 0; i < event->count; i++) {
+	for (i = 0; i < count; i++) {
 		halg_size = sizeof(event->digests[i].alg_id);
 
 		/* Map the digest's algorithm identifier */
@@ -256,8 +263,9 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
 		+ event_field->event_size;
 	size = marker - marker_start;
 
-	if ((event->event_type == 0) && (event_field->event_size == 0))
+	if (event_type == 0 && event_field->event_size == 0)
 		size = 0;
+
 out:
 	if (do_mapping)
 		TPM_MEMUNMAP(mapping, mapping_size);
-- 
cgit 


From e658c82be5561412c5e83b5e74e9da4830593f3e Mon Sep 17 00:00:00 2001
From: Jerry Snitselaar <jsnitsel@redhat.com>
Date: Wed, 2 Oct 2019 18:59:02 +0200
Subject: efi/tpm: Only set 'efi_tpm_final_log_size' after successful event log
 parsing

If __calc_tpm2_event_size() fails to parse an event it will return 0,
resulting tpm2_calc_event_log_size() returning -1. Currently there is
no check of this return value, and 'efi_tpm_final_log_size' can end up
being set to this negative value resulting in a crash like this one:

  BUG: unable to handle page fault for address: ffffbc8fc00866ad
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page

  RIP: 0010:memcpy_erms+0x6/0x10
  Call Trace:
   tpm_read_log_efi()
   tpm_bios_log_setup()
   tpm_chip_register()
   tpm_tis_core_init.cold.9+0x28c/0x466
   tpm_tis_plat_probe()
   platform_drv_probe()
   ...

Also __calc_tpm2_event_size() returns a size of 0 when it fails
to parse an event, so update function documentation to reflect this.

The root cause of the issue that caused the failure of event parsing
in this case is resolved by Peter Jone's patchset dealing with large
event logs where crossing over a page boundary causes the page with
the event count to be unmapped.

Signed-off-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Ben Dooks <ben.dooks@codethink.co.uk>
Cc: Dave Young <dyoung@redhat.com>
Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Matthew Garrett <mjg59@google.com>
Cc: Octavian Purdila <octavian.purdila@intel.com>
Cc: Peter Jones <pjones@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Talbert <swt@techie.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Cc: linux-integrity@vger.kernel.org
Cc: stable@vger.kernel.org
Fixes: c46f3405692de ("tpm: Reserve the TPM final events table")
Link: https://lkml.kernel.org/r/20191002165904.8819-6-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/tpm_eventlog.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index b50cc3adca18..131ea1bad458 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -152,7 +152,7 @@ struct tcg_algorithm_info {
  * total. Once we've done this we know the offset of the data length field,
  * and can calculate the total size of the event.
  *
- * Return: size of the event on success, <0 on failure
+ * Return: size of the event on success, 0 on failure
  */
 
 static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
-- 
cgit 


From bf70b0503abd19194dba25fe383d143d0229dc6a Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 3 Oct 2019 16:58:21 +0900
Subject: module: swap the order of symbol.namespace

Currently, EXPORT_SYMBOL_NS(_GPL) constructs the kernel symbol as
follows:

  __ksymtab_SYMBOL.NAMESPACE

The sym_extract_namespace() in modpost allocates memory for the part
SYMBOL.NAMESPACE when '.' is contained. One problem is that the pointer
returned by strdup() is lost because the symbol name will be copied to
malloc'ed memory by alloc_symbol(). No one will keep track of the
pointer of strdup'ed memory.

sym->namespace still points to the NAMESPACE part. So, you can free it
with complicated code like this:

   free(sym->namespace - strlen(sym->name) - 1);

It complicates memory free.

To fix it elegantly, I swapped the order of the symbol and the
namespace as follows:

  __ksymtab_NAMESPACE.SYMBOL

then, simplified sym_extract_namespace() so that it allocates memory
only for the NAMESPACE part.

I prefer this order because it is intuitive and also matches to major
languages. For example, NAMESPACE::NAME in C++, MODULE.NAME in Python.

Reviewed-by: Matthias Maennich <maennich@google.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/export.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/export.h b/include/linux/export.h
index 95f55b7f83a0..0695d4e847d9 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -52,7 +52,7 @@ extern struct module __this_module;
 	__ADDRESSABLE(sym)						\
 	asm("	.section \"___ksymtab" sec "+" #sym "\", \"a\"	\n"	\
 	    "	.balign	4					\n"	\
-	    "__ksymtab_" #sym NS_SEPARATOR #ns ":		\n"	\
+	    "__ksymtab_" #ns NS_SEPARATOR #sym ":		\n"	\
 	    "	.long	" #sym "- .				\n"	\
 	    "	.long	__kstrtab_" #sym "- .			\n"	\
 	    "	.long	__kstrtab_ns_" #sym "- .		\n"	\
@@ -76,7 +76,7 @@ struct kernel_symbol {
 #else
 #define __KSYMTAB_ENTRY_NS(sym, sec, ns)				\
 	static const struct kernel_symbol __ksymtab_##sym##__##ns	\
-	asm("__ksymtab_" #sym NS_SEPARATOR #ns)				\
+	asm("__ksymtab_" #ns NS_SEPARATOR #sym)				\
 	__attribute__((section("___ksymtab" sec "+" #sym), used))	\
 	__aligned(sizeof(void *))					\
 	= { (unsigned long)&sym, __kstrtab_##sym, __kstrtab_ns_##sym }
-- 
cgit 


From fa6643cdc5cd726b10d30eec45ff8dca267de735 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 3 Oct 2019 16:58:23 +0900
Subject: module: rename __kstrtab_ns_* to __kstrtabns_* to avoid symbol
 conflict

The module namespace produces __strtab_ns_<sym> symbols to store
namespace strings, but it does not guarantee the name uniqueness.
This is a potential problem because we have exported symbols starting
with "ns_".

For example, kernel/capability.c exports the following symbols:

  EXPORT_SYMBOL(ns_capable);
  EXPORT_SYMBOL(capable);

Assume a situation where those are converted as follows:

  EXPORT_SYMBOL_NS(ns_capable, some_namespace);
  EXPORT_SYMBOL_NS(capable, some_namespace);

The former expands to "__kstrtab_ns_capable" and "__kstrtab_ns_ns_capable",
and the latter to "__kstrtab_capable" and "__kstrtab_ns_capable".
Then, we have the duplicated "__kstrtab_ns_capable".

To ensure the uniqueness, rename "__kstrtab_ns_*" to "__kstrtabns_*".

Reviewed-by: Matthias Maennich <maennich@google.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/export.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/export.h b/include/linux/export.h
index 0695d4e847d9..621158ecd2e2 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -55,7 +55,7 @@ extern struct module __this_module;
 	    "__ksymtab_" #ns NS_SEPARATOR #sym ":		\n"	\
 	    "	.long	" #sym "- .				\n"	\
 	    "	.long	__kstrtab_" #sym "- .			\n"	\
-	    "	.long	__kstrtab_ns_" #sym "- .		\n"	\
+	    "	.long	__kstrtabns_" #sym "- .			\n"	\
 	    "	.previous					\n")
 
 #define __KSYMTAB_ENTRY(sym, sec)					\
@@ -79,7 +79,7 @@ struct kernel_symbol {
 	asm("__ksymtab_" #ns NS_SEPARATOR #sym)				\
 	__attribute__((section("___ksymtab" sec "+" #sym), used))	\
 	__aligned(sizeof(void *))					\
-	= { (unsigned long)&sym, __kstrtab_##sym, __kstrtab_ns_##sym }
+	= { (unsigned long)&sym, __kstrtab_##sym, __kstrtabns_##sym }
 
 #define __KSYMTAB_ENTRY(sym, sec)					\
 	static const struct kernel_symbol __ksymtab_##sym		\
@@ -112,7 +112,7 @@ struct kernel_symbol {
 /* For every exported symbol, place a struct in the __ksymtab section */
 #define ___EXPORT_SYMBOL_NS(sym, sec, ns)				\
 	___export_symbol_common(sym, sec);				\
-	static const char __kstrtab_ns_##sym[]				\
+	static const char __kstrtabns_##sym[]				\
 	__attribute__((section("__ksymtab_strings"), used, aligned(1)))	\
 	= #ns;								\
 	__KSYMTAB_ENTRY_NS(sym, sec, ns)
-- 
cgit 


From c512c69187197fe08026cb5bbe7b9709f4f89b73 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 7 Oct 2019 12:56:48 -0700
Subject: uaccess: implement a proper unsafe_copy_to_user() and switch filldir
 over to it

In commit 9f79b78ef744 ("Convert filldir[64]() from __put_user() to
unsafe_put_user()") I made filldir() use unsafe_put_user(), which
improves code generation on x86 enormously.

But because we didn't have a "unsafe_copy_to_user()", the dirent name
copy was also done by hand with unsafe_put_user() in a loop, and it
turns out that a lot of other architectures didn't like that, because
unlike x86, they have various alignment issues.

Most non-x86 architectures trap and fix it up, and some (like xtensa)
will just fail unaligned put_user() accesses unconditionally.  Which
makes that "copy using put_user() in a loop" not work for them at all.

I could make that code do explicit alignment etc, but the architectures
that don't like unaligned accesses also don't really use the fancy
"user_access_begin/end()" model, so they might just use the regular old
__copy_to_user() interface.

So this commit takes that looping implementation, turns it into the x86
version of "unsafe_copy_to_user()", and makes other architectures
implement the unsafe copy version as __copy_to_user() (the same way they
do for the other unsafe_xyz() accessor functions).

Note that it only does this for the copying _to_ user space, and we
still don't have a unsafe version of copy_from_user().

That's partly because we have no current users of it, but also partly
because the copy_from_user() case is slightly different and cannot
efficiently be implemented in terms of a unsafe_get_user() loop (because
gcc can't do asm goto with outputs).

It would be trivial to do this using "rep movsb", which would work
really nicely on newer x86 cores, but really badly on some older ones.

Al Viro is looking at cleaning up all our user copy routines to make
this all a non-issue, but for now we have this simple-but-stupid version
for x86 that works fine for the dirent name copy case because those
names are short strings and we simply don't need anything fancier.

Fixes: 9f79b78ef744 ("Convert filldir[64]() from __put_user() to unsafe_put_user()")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Reported-and-tested-by: Tony Luck <tony.luck@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/uaccess.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index e47d0522a1f4..d4ee6e942562 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -355,8 +355,10 @@ extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count);
 #ifndef user_access_begin
 #define user_access_begin(ptr,len) access_ok(ptr, len)
 #define user_access_end() do { } while (0)
-#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0)
-#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0)
+#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
+#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e)
+#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
+#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
 static inline unsigned long user_access_save(void) { return 0UL; }
 static inline void user_access_restore(unsigned long flags) { }
 #endif
-- 
cgit 


From 08d1d0e6d0a00c6e687201774f3bf61177741e80 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Sun, 6 Oct 2019 17:58:15 -0700
Subject: memcg: only record foreign writebacks with dirty pages when memcg is
 not disabled

In kdump kernel, memcg usually is disabled with 'cgroup_disable=memory'
for saving memory.  Now kdump kernel will always panic when dump vmcore
to local disk:

  BUG: kernel NULL pointer dereference, address: 0000000000000ab8
  Oops: 0000 [#1] SMP NOPTI
  CPU: 0 PID: 598 Comm: makedumpfile Not tainted 5.3.0+ #26
  Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 10/02/2018
  RIP: 0010:mem_cgroup_track_foreign_dirty_slowpath+0x38/0x140
  Call Trace:
   __set_page_dirty+0x52/0xc0
   iomap_set_page_dirty+0x50/0x90
   iomap_write_end+0x6e/0x270
   iomap_write_actor+0xce/0x170
   iomap_apply+0xba/0x11e
   iomap_file_buffered_write+0x62/0x90
   xfs_file_buffered_aio_write+0xca/0x320 [xfs]
   new_sync_write+0x12d/0x1d0
   vfs_write+0xa5/0x1a0
   ksys_write+0x59/0xd0
   do_syscall_64+0x59/0x1e0
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

And this will corrupt the 1st kernel too with 'cgroup_disable=memory'.

Via the trace and with debugging, it is pointing to commit 97b27821b485
("writeback, memcg: Implement foreign dirty flushing") which introduced
this regression.  Disabling memcg causes the null pointer dereference at
uninitialized data in function mem_cgroup_track_foreign_dirty_slowpath().

Fix it by returning directly if memcg is disabled, but not trying to
record the foreign writebacks with dirty pages.

Link: http://lkml.kernel.org/r/20190924141928.GD31919@MiWiFi-R3L-srv
Fixes: 97b27821b485 ("writeback, memcg: Implement foreign dirty flushing")
Signed-off-by: Baoquan He <bhe@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9b60863429cc..98380779f6d5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1264,6 +1264,9 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
 static inline void mem_cgroup_track_foreign_dirty(struct page *page,
 						  struct bdi_writeback *wb)
 {
+	if (mem_cgroup_disabled())
+		return;
+
 	if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
 		mem_cgroup_track_foreign_dirty_slowpath(page, wb);
 }
-- 
cgit 


From 9783aa9917f8ae24759e67bf882f1aba32fe4ea1 Mon Sep 17 00:00:00 2001
From: Chris Down <chris@chrisdown.name>
Date: Sun, 6 Oct 2019 17:58:32 -0700
Subject: mm, memcg: proportional memory.{low,min} reclaim

cgroup v2 introduces two memory protection thresholds: memory.low
(best-effort) and memory.min (hard protection).  While they generally do
what they say on the tin, there is a limitation in their implementation
that makes them difficult to use effectively: that cliff behaviour often
manifests when they become eligible for reclaim.  This patch implements
more intuitive and usable behaviour, where we gradually mount more
reclaim pressure as cgroups further and further exceed their protection
thresholds.

This cliff edge behaviour happens because we only choose whether or not
to reclaim based on whether the memcg is within its protection limits
(see the use of mem_cgroup_protected in shrink_node), but we don't vary
our reclaim behaviour based on this information.  Imagine the following
timeline, with the numbers the lruvec size in this zone:

1. memory.low=1000000, memory.current=999999. 0 pages may be scanned.
2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned.
3. memory.low=1000000, memory.current=1000001. 1000001* pages may be
   scanned. (?!)

* Of course, we won't usually scan all available pages in the zone even
  without this patch because of scan control priority, over-reclaim
  protection, etc.  However, as shown by the tests at the end, these
  techniques don't sufficiently throttle such an extreme change in input,
  so cliff-like behaviour isn't really averted by their existence alone.

Here's an example of how this plays out in practice.  At Facebook, we are
trying to protect various workloads from "system" software, like
configuration management tools, metric collectors, etc (see this[0] case
study).  In order to find a suitable memory.low value, we start by
determining the expected memory range within which the workload will be
comfortable operating.  This isn't an exact science -- memory usage deemed
"comfortable" will vary over time due to user behaviour, differences in
composition of work, etc, etc.  As such we need to ballpark memory.low,
but doing this is currently problematic:

1. If we end up setting it too low for the workload, it won't have
   *any* effect (see discussion above).  The group will receive the full
   weight of reclaim and won't have any priority while competing with the
   less important system software, as if we had no memory.low configured
   at all.

2. Because of this behaviour, we end up erring on the side of setting
   it too high, such that the comfort range is reliably covered.  However,
   protected memory is completely unavailable to the rest of the system,
   so we might cause undue memory and IO pressure there when we *know* we
   have some elasticity in the workload.

3. Even if we get the value totally right, smack in the middle of the
   comfort zone, we get extreme jumps between no pressure and full
   pressure that cause unpredictable pressure spikes in the workload due
   to the current binary reclaim behaviour.

With this patch, we can set it to our ballpark estimation without too much
worry.  Any undesirable behaviour, such as too much or too little reclaim
pressure on the workload or system will be proportional to how far our
estimation is off.  This means we can set memory.low much more
conservatively and thus waste less resources *without* the risk of the
workload falling off a cliff if we overshoot.

As a more abstract technical description, this unintuitive behaviour
results in having to give high-priority workloads a large protection
buffer on top of their expected usage to function reliably, as otherwise
we have abrupt periods of dramatically increased memory pressure which
hamper performance.  Having to set these thresholds so high wastes
resources and generally works against the principle of work conservation.
In addition, having proportional memory reclaim behaviour has other
benefits.  Most notably, before this patch it's basically mandatory to set
memory.low to a higher than desirable value because otherwise as soon as
you exceed memory.low, all protection is lost, and all pages are eligible
to scan again.  By contrast, having a gradual ramp in reclaim pressure
means that you now still get some protection when thresholds are exceeded,
which means that one can now be more comfortable setting memory.low to
lower values without worrying that all protection will be lost.  This is
important because workingset size is really hard to know exactly,
especially with variable workloads, so at least getting *some* protection
if your workingset size grows larger than you expect increases user
confidence in setting memory.low without a huge buffer on top being
needed.

Thanks a lot to Johannes Weiner and Tejun Heo for their advice and
assistance in thinking about how to make this work better.

In testing these changes, I intended to verify that:

1. Changes in page scanning become gradual and proportional instead of
   binary.

   To test this, I experimented stepping further and further down
   memory.low protection on a workload that floats around 19G workingset
   when under memory.low protection, watching page scan rates for the
   workload cgroup:

   +------------+-----------------+--------------------+--------------+
   | memory.low | test (pgscan/s) | control (pgscan/s) | % of control |
   +------------+-----------------+--------------------+--------------+
   |        21G |               0 |                  0 | N/A          |
   |        17G |             867 |               3799 | 23%          |
   |        12G |            1203 |               3543 | 34%          |
   |         8G |            2534 |               3979 | 64%          |
   |         4G |            3980 |               4147 | 96%          |
   |          0 |            3799 |               3980 | 95%          |
   +------------+-----------------+--------------------+--------------+

   As you can see, the test kernel (with a kernel containing this
   patch) ramps up page scanning significantly more gradually than the
   control kernel (without this patch).

2. More gradual ramp up in reclaim aggression doesn't result in
   premature OOMs.

   To test this, I wrote a script that slowly increments the number of
   pages held by stress(1)'s --vm-keep mode until a production system
   entered severe overall memory contention.  This script runs in a highly
   protected slice taking up the majority of available system memory.
   Watching vmstat revealed that page scanning continued essentially
   nominally between test and control, without causing forward reclaim
   progress to become arrested.

[0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project

[akpm@linux-foundation.org: reflow block comments to fit in 80 cols]
[chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection]
  Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name
Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 98380779f6d5..fa9ba2edf7e0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -356,6 +356,14 @@ static inline bool mem_cgroup_disabled(void)
 	return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled())
+		return 0;
+
+	return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow));
+}
+
 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 						struct mem_cgroup *memcg);
 
@@ -537,6 +545,8 @@ void mem_cgroup_handle_over_high(void);
 
 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
 
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg);
+
 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
 				struct task_struct *p);
 
@@ -829,6 +839,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 {
 }
 
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
 static inline enum mem_cgroup_protection mem_cgroup_protected(
 	struct mem_cgroup *root, struct mem_cgroup *memcg)
 {
@@ -968,6 +983,11 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
 	return 0;
 }
 
+static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
 static inline void
 mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
 {
-- 
cgit 


From 9de7ca46ad2688bd51e80f7119fefa301ad7f3fa Mon Sep 17 00:00:00 2001
From: Chris Down <chris@chrisdown.name>
Date: Sun, 6 Oct 2019 17:58:35 -0700
Subject: mm, memcg: make memory.emin the baseline for utilisation
 determination

Roman points out that when when we do the low reclaim pass, we scale the
reclaim pressure relative to position between 0 and the maximum
protection threshold.

However, if the maximum protection is based on memory.elow, and
memory.emin is above zero, this means we still may get binary behaviour
on second-pass low reclaim.  This is because we scale starting at 0, not
starting at memory.emin, and since we don't scan at all below emin, we
end up with cliff behaviour.

This should be a fairly uncommon case since usually we don't go into the
second pass, but it makes sense to scale our low reclaim pressure
starting at emin.

You can test this by catting two large sparse files, one in a cgroup
with emin set to some moderate size compared to physical RAM, and
another cgroup without any emin.  In both cgroups, set an elow larger
than 50% of physical RAM.  The one with emin will have less page
scanning, as reclaim pressure is lower.

Rebase on top of and apply the same idea as what was applied to handle
cgroup_memory=disable properly for the original proportional patch
http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name ("mm,
memcg: Handle cgroup_disable=memory when getting memcg protection").

Link: http://lkml.kernel.org/r/20190201051810.GA18895@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Suggested-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fa9ba2edf7e0..1cbad1248e5a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -356,12 +356,17 @@ static inline bool mem_cgroup_disabled(void)
 	return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg)
+static inline void mem_cgroup_protection(struct mem_cgroup *memcg,
+					 unsigned long *min, unsigned long *low)
 {
-	if (mem_cgroup_disabled())
-		return 0;
+	if (mem_cgroup_disabled()) {
+		*min = 0;
+		*low = 0;
+		return;
+	}
 
-	return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow));
+	*min = READ_ONCE(memcg->memory.emin);
+	*low = READ_ONCE(memcg->memory.elow);
 }
 
 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
@@ -839,9 +844,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 {
 }
 
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg)
+static inline void mem_cgroup_protection(struct mem_cgroup *memcg,
+					 unsigned long *min, unsigned long *low)
 {
-	return 0;
+	*min = 0;
+	*low = 0;
 }
 
 static inline enum mem_cgroup_protection mem_cgroup_protected(
-- 
cgit 


From 1bc63fb1272be0773e925f78c0fbd06c89701d55 Mon Sep 17 00:00:00 2001
From: Chris Down <chris@chrisdown.name>
Date: Sun, 6 Oct 2019 17:58:38 -0700
Subject: mm, memcg: make scan aggression always exclude protection

This patch is an incremental improvement on the existing
memory.{low,min} relative reclaim work to base its scan pressure
calculations on how much protection is available compared to the current
usage, rather than how much the current usage is over some protection
threshold.

This change doesn't change the experience for the user in the normal
case too much.  One benefit is that it replaces the (somewhat arbitrary)
100% cutoff with an indefinite slope, which makes it easier to ballpark
a memory.low value.

As well as this, the old methodology doesn't quite apply generically to
machines with varying amounts of physical memory.  Let's say we have a
top level cgroup, workload.slice, and another top level cgroup,
system-management.slice.  We want to roughly give 12G to
system-management.slice, so on a 32GB machine we set memory.low to 20GB
in workload.slice, and on a 64GB machine we set memory.low to 52GB.
However, because these are relative amounts to the total machine size,
while the amount of memory we want to generally be willing to yield to
system.slice is absolute (12G), we end up putting more pressure on
system.slice just because we have a larger machine and a larger workload
to fill it, which seems fairly unintuitive.  With this new behaviour, we
don't end up with this unintended side effect.

Previously the way that memory.low protection works is that if you are
50% over a certain baseline, you get 50% of your normal scan pressure.
This is certainly better than the previous cliff-edge behaviour, but it
can be improved even further by always considering memory under the
currently enforced protection threshold to be out of bounds.  This means
that we can set relatively low memory.low thresholds for variable or
bursty workloads while still getting a reasonable level of protection,
whereas with the previous version we may still trivially hit the 100%
clamp.  The previous 100% clamp is also somewhat arbitrary, whereas this
one is more concretely based on the currently enforced protection
threshold, which is likely easier to reason about.

There is also a subtle issue with the way that proportional reclaim
worked previously -- it promotes having no memory.low, since it makes
pressure higher during low reclaim.  This happens because we base our
scan pressure modulation on how far memory.current is between memory.min
and memory.low, but if memory.low is unset, we only use the overage
method.  In most cromulent configurations, this then means that we end
up with *more* pressure than with no memory.low at all when we're in low
reclaim, which is not really very usable or expected.

With this patch, memory.low and memory.min affect reclaim pressure in a
more understandable and composable way.  For example, from a user
standpoint, "protected" memory now remains untouchable from a reclaim
aggression standpoint, and users can also have more confidence that
bursty workloads will still receive some amount of guaranteed
protection.

Link: http://lkml.kernel.org/r/20190322160307.GA3316@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1cbad1248e5a..ae703ea3ef48 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -356,17 +356,17 @@ static inline bool mem_cgroup_disabled(void)
 	return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
-static inline void mem_cgroup_protection(struct mem_cgroup *memcg,
-					 unsigned long *min, unsigned long *low)
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+						  bool in_low_reclaim)
 {
-	if (mem_cgroup_disabled()) {
-		*min = 0;
-		*low = 0;
-		return;
-	}
+	if (mem_cgroup_disabled())
+		return 0;
+
+	if (in_low_reclaim)
+		return READ_ONCE(memcg->memory.emin);
 
-	*min = READ_ONCE(memcg->memory.emin);
-	*low = READ_ONCE(memcg->memory.elow);
+	return max(READ_ONCE(memcg->memory.emin),
+		   READ_ONCE(memcg->memory.elow));
 }
 
 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
@@ -844,11 +844,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 {
 }
 
-static inline void mem_cgroup_protection(struct mem_cgroup *memcg,
-					 unsigned long *min, unsigned long *low)
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+						  bool in_low_reclaim)
 {
-	*min = 0;
-	*low = 0;
+	return 0;
 }
 
 static inline enum mem_cgroup_protection mem_cgroup_protected(
-- 
cgit 


From 59bb47985c1db229ccff8c5deebecd54fc77d2a9 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Sun, 6 Oct 2019 17:58:45 -0700
Subject: mm, sl[aou]b: guarantee natural alignment for kmalloc(power-of-two)

In most configurations, kmalloc() happens to return naturally aligned
(i.e.  aligned to the block size itself) blocks for power of two sizes.

That means some kmalloc() users might unknowingly rely on that
alignment, until stuff breaks when the kernel is built with e.g.
CONFIG_SLUB_DEBUG or CONFIG_SLOB, and blocks stop being aligned.  Then
developers have to devise workaround such as own kmem caches with
specified alignment [1], which is not always practical, as recently
evidenced in [2].

The topic has been discussed at LSF/MM 2019 [3].  Adding a
'kmalloc_aligned()' variant would not help with code unknowingly relying
on the implicit alignment.  For slab implementations it would either
require creating more kmalloc caches, or allocate a larger size and only
give back part of it.  That would be wasteful, especially with a generic
alignment parameter (in contrast with a fixed alignment to size).

Ideally we should provide to mm users what they need without difficult
workarounds or own reimplementations, so let's make the kmalloc()
alignment to size explicitly guaranteed for power-of-two sizes under all
configurations.  What this means for the three available allocators?

* SLAB object layout happens to be mostly unchanged by the patch.  The
  implicitly provided alignment could be compromised with
  CONFIG_DEBUG_SLAB due to redzoning, however SLAB disables redzoning for
  caches with alignment larger than unsigned long long.  Practically on at
  least x86 this includes kmalloc caches as they use cache line alignment,
  which is larger than that.  Still, this patch ensures alignment on all
  arches and cache sizes.

* SLUB layout is also unchanged unless redzoning is enabled through
  CONFIG_SLUB_DEBUG and boot parameter for the particular kmalloc cache.
  With this patch, explicit alignment is guaranteed with redzoning as
  well.  This will result in more memory being wasted, but that should be
  acceptable in a debugging scenario.

* SLOB has no implicit alignment so this patch adds it explicitly for
  kmalloc().  The potential downside is increased fragmentation.  While
  pathological allocation scenarios are certainly possible, in my testing,
  after booting a x86_64 kernel+userspace with virtme, around 16MB memory
  was consumed by slab pages both before and after the patch, with
  difference in the noise.

[1] https://lore.kernel.org/linux-btrfs/c3157c8e8e0e7588312b40c853f65c02fe6c957a.1566399731.git.christophe.leroy@c-s.fr/
[2] https://lore.kernel.org/linux-fsdevel/20190225040904.5557-1-ming.lei@redhat.com/
[3] https://lwn.net/Articles/787740/

[akpm@linux-foundation.org: documentation fixlet, per Matthew]
Link: http://lkml.kernel.org/r/20190826111627.7505-3-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Cc: David Sterba <dsterba@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: "Darrick J . Wong" <darrick.wong@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index ab2b98ad76e1..4d2a2fa55ed5 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -493,6 +493,10 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
  * kmalloc is the normal method of allocating memory
  * for objects smaller than page size in the kernel.
  *
+ * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
+ * bytes. For @size of power of two bytes, the alignment is also guaranteed
+ * to be at least to the size.
+ *
  * The @flags argument may be one of the GFP flags defined at
  * include/linux/gfp.h and described at
  * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
-- 
cgit 


From bec500777089b3c96c53681fc0aa6fee59711d4a Mon Sep 17 00:00:00 2001
From: Arvind Sankar <nivedita@alum.mit.edu>
Date: Mon, 7 Oct 2019 18:00:02 -0400
Subject: lib/string: Make memzero_explicit() inline instead of external

With the use of the barrier implied by barrier_data(), there is no need
for memzero_explicit() to be extern. Making it inline saves the overhead
of a function call, and allows the code to be reused in arch/*/purgatory
without having to duplicate the implementation.

Tested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Arvind Sankar <nivedita@alum.mit.edu>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H . Peter Anvin <hpa@zytor.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephan Mueller <smueller@chronox.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-crypto@vger.kernel.org
Cc: linux-s390@vger.kernel.org
Fixes: 906a4bb97f5d ("crypto: sha256 - Use get/put_unaligned_be32 to get input, memzero_explicit")
Link: https://lkml.kernel.org/r/20191007220000.GA408752@rani.riverdale.lan
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/string.h | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/string.h b/include/linux/string.h
index b2f9df7f0761..b6ccdc2c7f02 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -227,7 +227,26 @@ static inline bool strstarts(const char *str, const char *prefix)
 }
 
 size_t memweight(const void *ptr, size_t bytes);
-void memzero_explicit(void *s, size_t count);
+
+/**
+ * memzero_explicit - Fill a region of memory (e.g. sensitive
+ *		      keying data) with 0s.
+ * @s: Pointer to the start of the area.
+ * @count: The size of the area.
+ *
+ * Note: usually using memset() is just fine (!), but in cases
+ * where clearing out _local_ data at the end of a scope is
+ * necessary, memzero_explicit() should be used instead in
+ * order to prevent the compiler from optimising away zeroing.
+ *
+ * memzero_explicit() doesn't need an arch-specific version as
+ * it just invokes the one of memset() implicitly.
+ */
+static inline void memzero_explicit(void *s, size_t count)
+{
+	memset(s, 0, count);
+	barrier_data(s);
+}
 
 /**
  * kbasename - return the last part of a pathname.
-- 
cgit 


From e3f1271474182682638654021123b94e6ec1626b Mon Sep 17 00:00:00 2001
From: Dan Murphy <dmurphy@ti.com>
Date: Wed, 2 Oct 2019 07:40:42 -0500
Subject: leds: core: Fix leds.h structure documentation

Update the leds.h structure documentation to define the
correct arguments.

Signed-off-by: Dan Murphy <dmurphy@ti.com>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 include/linux/leds.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index b8df71193329..efb309dba914 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -247,7 +247,7 @@ extern void led_set_brightness(struct led_classdev *led_cdev,
 /**
  * led_set_brightness_sync - set LED brightness synchronously
  * @led_cdev: the LED to set
- * @brightness: the brightness to set it to
+ * @value: the brightness to set it to
  *
  * Set an LED's brightness immediately. This function will block
  * the caller for the time required for accessing device registers,
@@ -301,8 +301,7 @@ extern void led_sysfs_enable(struct led_classdev *led_cdev);
 /**
  * led_compose_name - compose LED class device name
  * @dev: LED controller device object
- * @child: child fwnode_handle describing a LED or a group of synchronized LEDs;
- *	   it must be provided only for fwnode based LEDs
+ * @init_data: the LED class device initialization data
  * @led_classdev_name: composed LED class device name
  *
  * Create LED class device name basing on the provided init_data argument.
-- 
cgit 


From af84537dbd1b39505d1f3d8023029b4a59666513 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Wed, 2 Oct 2019 10:40:55 -0400
Subject: SUNRPC: fix race to sk_err after xs_error_report

Since commit 4f8943f80883 ("SUNRPC: Replace direct task wakeups from
softirq context") there has been a race to the value of the sk_err if both
XPRT_SOCK_WAKE_ERROR and XPRT_SOCK_WAKE_DISCONNECT are set.  In that case,
we may end up losing the sk_err value that existed when xs_error_report was
called.

Fix this by reverting to the previous behavior: instead of using SO_ERROR
to retrieve the value at a later time (which might also return sk_err_soft),
copy the sk_err value onto struct sock_xprt, and use that value to wake
pending tasks.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Fixes: 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context")
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xprtsock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index 7638dbe7bc50..a940de03808d 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -61,6 +61,7 @@ struct sock_xprt {
 	struct mutex		recv_mutex;
 	struct sockaddr_storage	srcaddr;
 	unsigned short		srcport;
+	int			xprt_err;
 
 	/*
 	 * UDP socket buffer size parameters
-- 
cgit 


From 294f69e662d1570703e9b56e95be37a9fd3afba5 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sat, 5 Oct 2019 09:46:42 -0700
Subject: compiler_attributes.h: Add 'fallthrough' pseudo keyword for
 switch/case use

Reserve the pseudo keyword 'fallthrough' for the ability to convert the
various case block /* fallthrough */ style comments to appear to be an
actual reserved word with the same gcc case block missing fallthrough
warning capability.

All switch/case blocks now should end in one of:

	break;
	fallthrough;
	goto <label>;
	return [expression];
	continue;

In C mode, GCC supports the __fallthrough__ attribute since 7.1,
the same time the warning and the comment parsing were introduced.

fallthrough devolves to an empty "do {} while (0)" if the compiler
version (any version less than gcc 7) does not support the attribute.

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Suggested-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler_attributes.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 6b318efd8a74..cdf016596659 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -40,6 +40,7 @@
 # define __GCC4_has_attribute___noclone__             1
 # define __GCC4_has_attribute___nonstring__           0
 # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8)
+# define __GCC4_has_attribute___fallthrough__         0
 #endif
 
 /*
@@ -185,6 +186,22 @@
 # define __noclone
 #endif
 
+/*
+ * Add the pseudo keyword 'fallthrough' so case statement blocks
+ * must end with any of these keywords:
+ *   break;
+ *   fallthrough;
+ *   goto <label>;
+ *   return [expression];
+ *
+ *  gcc: https://gcc.gnu.org/onlinedocs/gcc/Statement-Attributes.html#Statement-Attributes
+ */
+#if __has_attribute(__fallthrough__)
+# define fallthrough                    __attribute__((__fallthrough__))
+#else
+# define fallthrough                    do {} while (0)  /* fallthrough */
+#endif
+
 /*
  * Note the missing underscores.
  *
-- 
cgit