From 67f4b5dc49913abcdb5cc736e73674e2f352f81d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 13 Aug 2022 08:22:25 -0400
Subject: NFS: Fix another fsync() issue after a server reboot

Currently, when the writeback code detects a server reboot, it redirties
any pages that were not committed to disk, and it sets the flag
NFS_CONTEXT_RESEND_WRITES in the nfs_open_context of the file descriptor
that dirtied the file. While this allows the file descriptor in question
to redrive its own writes, it violates the fsync() requirement that we
should be synchronising all writes to disk.
While the problem is infrequent, we do see corner cases where an
untimely server reboot causes the fsync() call to abandon its attempt to
sync data to disk and causing data corruption issues due to missed error
conditions or similar.

In order to tighted up the client's ability to deal with this situation
without introducing livelocks, add a counter that records the number of
times pages are redirtied due to a server reboot-like condition, and use
that in fsync() to redrive the sync to disk.

Fixes: 2197e9b06c22 ("NFS: Fix up fsync() when the server rebooted")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b32ed68e7dc4..f08e581f0161 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -182,6 +182,7 @@ struct nfs_inode {
 		/* Regular file */
 		struct {
 			atomic_long_t	nrequests;
+			atomic_long_t	redirtied_pages;
 			struct nfs_mds_commit_info commit_info;
 			struct mutex	commit_mutex;
 		};
-- 
cgit 


From 5f6277a0c15e1ea54b6fd3d78c9fff7bfe42556c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 13 Aug 2022 08:51:45 -0400
Subject: NFS: Cleanup to remove unused flag NFS_CONTEXT_RESEND_WRITES

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f08e581f0161..7931fa472561 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -83,7 +83,6 @@ struct nfs_open_context {
 	fmode_t mode;
 
 	unsigned long flags;
-#define NFS_CONTEXT_RESEND_WRITES	(1)
 #define NFS_CONTEXT_BAD			(2)
 #define NFS_CONTEXT_UNLOCK	(3)
 #define NFS_CONTEXT_FILE_OPEN		(4)
-- 
cgit 


From 0a90ed8d0cfa29735a221eba14d9cb6c735d35b6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 1 Aug 2022 14:37:31 +0300
Subject: platform/x86: pmc_atom: Fix SLP_TYPx bitfield mask

On Intel hardware the SLP_TYPx bitfield occupies bits 10-12 as per ACPI
specification (see Table 4.13 "PM1 Control Registers Fixed Hardware
Feature Control Bits" for the details).

Fix the mask and other related definitions accordingly.

Fixes: 93e5eadd1f6e ("x86/platform: New Intel Atom SOC power management controller driver")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20220801113734.36131-1-andriy.shevchenko@linux.intel.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/x86/pmc_atom.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h
index 3edfb6d4e67a..dd81f510e4cf 100644
--- a/include/linux/platform_data/x86/pmc_atom.h
+++ b/include/linux/platform_data/x86/pmc_atom.h
@@ -7,6 +7,8 @@
 #ifndef PMC_ATOM_H
 #define PMC_ATOM_H
 
+#include <linux/bits.h>
+
 /* ValleyView Power Control Unit PCI Device ID */
 #define	PCI_DEVICE_ID_VLV_PMC	0x0F1C
 /* CherryTrail Power Control Unit PCI Device ID */
@@ -139,9 +141,9 @@
 #define	ACPI_MMIO_REG_LEN	0x100
 
 #define	PM1_CNT			0x4
-#define	SLEEP_TYPE_MASK		0xFFFFECFF
+#define	SLEEP_TYPE_MASK		GENMASK(12, 10)
 #define	SLEEP_TYPE_S5		0x1C00
-#define	SLEEP_ENABLE		0x2000
+#define	SLEEP_ENABLE		BIT(13)
 
 extern int pmc_atom_read(int offset, u32 *value);
 
-- 
cgit 


From be599244865cb788abe2c6f59ccb05d7240448e4 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Tue, 9 Aug 2022 19:36:33 +0200
Subject: cpumask: align signatures of UP implementations

Between the generic version, and their uniprocessor optimised
implementations, the return types of cpumask_any_and_distribute() and
cpumask_any_distribute() are not identical.  Change the UP versions to
'unsigned int', to match the generic versions.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 0d435d0edbcb..d8c2a40f8beb 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -202,12 +202,13 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
 	return 0;
 }
 
-static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
-					     const struct cpumask *src2p) {
+static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
+						      const struct cpumask *src2p)
+{
 	return cpumask_first_and(src1p, src2p);
 }
 
-static inline int cpumask_any_distribute(const struct cpumask *srcp)
+static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp)
 {
 	return cpumask_first(srcp);
 }
-- 
cgit 


From 2248ccd80124e61c2c84a22b22409bab452e1f0c Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Tue, 9 Aug 2022 19:36:34 +0200
Subject: lib/cpumask: add inline cpumask_next_wrap() for UP

In the uniprocessor case, cpumask_next_wrap() can be simplified, as the
number of valid argument combinations is limited:
    - 'start' can only be 0
    - 'n' can only be -1 or 0

The only valid CPU that can then be returned, if any, will be the first
one set in the provided 'mask'.

For NR_CPUS == 1, include/linux/cpumask.h now provides an inline
definition of cpumask_next_wrap(), which will conflict with the one
provided by lib/cpumask.c.  Make building of lib/cpumask.o again depend
on CONFIG_SMP=y (i.e. NR_CPUS > 1) to avoid the re-definition.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index d8c2a40f8beb..bd047864c7ac 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -262,7 +262,26 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
 		(cpu) = cpumask_next_zero((cpu), (mask)),	\
 		(cpu) < nr_cpu_ids;)
 
+#if NR_CPUS == 1
+static inline
+unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+{
+	cpumask_check(start);
+	if (n != -1)
+		cpumask_check(n);
+
+	/*
+	 * Return the first available CPU when wrapping, or when starting before cpu0,
+	 * since there is only one valid option.
+	 */
+	if (wrap && n >= 0)
+		return nr_cpumask_bits;
+
+	return cpumask_first(mask);
+}
+#else
 unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+#endif
 
 /**
  * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
-- 
cgit 


From 76b079ef4cc954fc2c2e0333a01855b0b2b6bdee Mon Sep 17 00:00:00 2001
From: Hao Jia <jiahao.os@bytedance.com>
Date: Sat, 6 Aug 2022 20:05:09 +0800
Subject: sched/psi: Remove unused parameter nbytes of psi_trigger_create()

psi_trigger_create()'s 'nbytes' parameter is not used, so we can remove it.

Signed-off-by: Hao Jia <jiahao.os@bytedance.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/psi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 89784763d19e..dd74411ac21d 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -27,7 +27,7 @@ void psi_memstall_leave(unsigned long *flags);
 
 int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
-			char *buf, size_t nbytes, enum psi_res res);
+			char *buf, enum psi_res res);
 void psi_trigger_destroy(struct psi_trigger *t);
 
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
-- 
cgit 


From d7ae5818c3fa3007dee13f9d99832e7f26b8bc44 Mon Sep 17 00:00:00 2001
From: Hao Jia <jiahao.os@bytedance.com>
Date: Sat, 6 Aug 2022 20:05:10 +0800
Subject: sched/psi: Remove redundant cgroup_psi() when !CONFIG_CGROUPS

cgroup_psi() is only called under CONFIG_CGROUPS.
We don't need cgroup_psi() when !CONFIG_CGROUPS,
so we can remove it in this case.

Signed-off-by: Hao Jia <jiahao.os@bytedance.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ed53bfe7c46c..ac5d0515680e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -734,11 +734,6 @@ static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
 	return NULL;
 }
 
-static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
-{
-	return NULL;
-}
-
 static inline bool cgroup_psi_enabled(void)
 {
 	return false;
-- 
cgit 


From 484b9fa4886bd9377969aad5e9ea17efda4ecda6 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 16 Aug 2022 01:36:31 -0400
Subject: virtio: Revert "virtio: add helper virtio_find_vqs_ctx_size()"

This reverts commit fe3dc04e31aa51f91dc7f741a5f76cc4817eb5b4: the
API is now unused and in fact can't be implemented on top of a legacy
device.

Fixes: fe3dc04e31aa ("virtio: add helper virtio_find_vqs_ctx_size()")
Cc: "Xuan Zhuo" <xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20220816053602.173815-3-mst@redhat.com>
---
 include/linux/virtio_config.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 6adff09f7170..888f7e96f0c7 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -241,18 +241,6 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 				      ctx, desc);
 }
 
-static inline
-int virtio_find_vqs_ctx_size(struct virtio_device *vdev, u32 nvqs,
-			     struct virtqueue *vqs[],
-			     vq_callback_t *callbacks[],
-			     const char * const names[],
-			     u32 sizes[],
-			     const bool *ctx, struct irq_affinity *desc)
-{
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, sizes,
-				      ctx, desc);
-}
-
 /**
  * virtio_synchronize_cbs - synchronize with virtqueue callbacks
  * @vdev: the device
-- 
cgit 


From 9993a4f989c7ca5e227329b2878f65d05c9fc20f Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 16 Aug 2022 01:36:58 -0400
Subject: virtio: Revert "virtio: find_vqs() add arg sizes"

This reverts commit a10fba0377145fccefea4dc4dd5915b7ed87e546: the
proposed API isn't supported on all transports but no
effort was made to address this.

It might not be hard to fix if we want to: maybe just
rename size to size_hint and make sure legacy
transports ignore the hint.

But it's not sure what the benefit is in any case, so
let's drop it.

Fixes: a10fba037714 ("virtio: find_vqs() add arg sizes")
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20220816053602.173815-8-mst@redhat.com>
---
 include/linux/virtio_config.h | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 888f7e96f0c7..36ec7be1f480 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -55,7 +55,6 @@ struct virtio_shm_region {
  *		include a NULL entry for vqs that do not need a callback
  *	names: array of virtqueue names (mainly for debugging)
  *		include a NULL entry for vqs unused by driver
- *	sizes: array of virtqueue sizes
  *	Returns 0 on success or error status
  * @del_vqs: free virtqueues found by find_vqs().
  * @synchronize_cbs: synchronize with the virtqueue callbacks (optional)
@@ -104,9 +103,7 @@ struct virtio_config_ops {
 	void (*reset)(struct virtio_device *vdev);
 	int (*find_vqs)(struct virtio_device *, unsigned nvqs,
 			struct virtqueue *vqs[], vq_callback_t *callbacks[],
-			const char * const names[],
-			u32 sizes[],
-			const bool *ctx,
+			const char * const names[], const bool *ctx,
 			struct irq_affinity *desc);
 	void (*del_vqs)(struct virtio_device *);
 	void (*synchronize_cbs)(struct virtio_device *);
@@ -215,7 +212,7 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
 	const char *names[] = { n };
 	struct virtqueue *vq;
 	int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL,
-					 NULL, NULL);
+					 NULL);
 	if (err < 0)
 		return ERR_PTR(err);
 	return vq;
@@ -227,8 +224,7 @@ int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			const char * const names[],
 			struct irq_affinity *desc)
 {
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL,
-				      NULL, desc);
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, desc);
 }
 
 static inline
@@ -237,8 +233,8 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 			const char * const names[], const bool *ctx,
 			struct irq_affinity *desc)
 {
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL,
-				      ctx, desc);
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx,
+				      desc);
 }
 
 /**
-- 
cgit 


From 5c669c4a4c6aa0489848093c93b8029f5c5c75ec Mon Sep 17 00:00:00 2001
From: Ricardo Cañuelo <ricardo.canuelo@collabora.com>
Date: Wed, 10 Aug 2022 11:40:03 +0200
Subject: virtio: kerneldocs fixes and enhancements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix variable names in some kerneldocs, naming in others.
Add kerneldocs for struct vring_desc and vring_interrupt.

Signed-off-by: Ricardo Cañuelo <ricardo.canuelo@collabora.com>
Message-Id: <20220810094004.1250-2-ricardo.canuelo@collabora.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
---
 include/linux/virtio.h        | 6 +++---
 include/linux/virtio_config.h | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index a3f73bb6733e..dcab9c7e8784 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -11,7 +11,7 @@
 #include <linux/gfp.h>
 
 /**
- * virtqueue - a queue to register buffers for sending or receiving.
+ * struct virtqueue - a queue to register buffers for sending or receiving.
  * @list: the chain of virtqueues for this device
  * @callback: the function to call when buffers are consumed (can be NULL).
  * @name: the name of this virtqueue (mainly for debugging)
@@ -97,7 +97,7 @@ int virtqueue_resize(struct virtqueue *vq, u32 num,
 		     void (*recycle)(struct virtqueue *vq, void *buf));
 
 /**
- * virtio_device - representation of a device using virtio
+ * struct virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus
  * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
  * @config_enabled: configuration change reporting enabled
@@ -156,7 +156,7 @@ size_t virtio_max_dma_size(struct virtio_device *vdev);
 	list_for_each_entry(vq, &vdev->vqs, list)
 
 /**
- * virtio_driver - operations for a virtio I/O driver
+ * struct virtio_driver - operations for a virtio I/O driver
  * @driver: underlying device driver (populate name and owner).
  * @id_table: the ids serviced by this driver.
  * @feature_table: an array of feature numbers supported by this driver.
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 36ec7be1f480..4b517649cfe8 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -239,7 +239,7 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 
 /**
  * virtio_synchronize_cbs - synchronize with virtqueue callbacks
- * @vdev: the device
+ * @dev: the virtio device
  */
 static inline
 void virtio_synchronize_cbs(struct virtio_device *dev)
@@ -258,7 +258,7 @@ void virtio_synchronize_cbs(struct virtio_device *dev)
 
 /**
  * virtio_device_ready - enable vq use in probe function
- * @vdev: the device
+ * @dev: the virtio device
  *
  * Driver must call this to use vqs in the probe function.
  *
@@ -306,7 +306,7 @@ const char *virtio_bus_name(struct virtio_device *vdev)
 /**
  * virtqueue_set_affinity - setting affinity for a virtqueue
  * @vq: the virtqueue
- * @cpu: the cpu no.
+ * @cpu_mask: the cpu mask
  *
  * Pay attention the function are best-effort: the affinity hint may not be set
  * due to config support, irq type and sharing.
-- 
cgit 


From a8239f0342bae5a51acca967ba95b9a8ad56dd62 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 18 Aug 2022 14:35:55 +0800
Subject: blk-mq: remove unused function blk_mq_queue_stopped()

blk_mq_queue_stopped() doesn't have any caller, which was found by
code coverage test, thus remove it.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20220818063555.3741222-1-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index effee1dc715a..92294a5fb083 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -857,7 +857,6 @@ void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
 void blk_mq_complete_request(struct request *rq);
 bool blk_mq_complete_request_remote(struct request *rq);
-bool blk_mq_queue_stopped(struct request_queue *q);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
-- 
cgit 


From b5a5b9d5f28d23b84f06b45c61dcad95b07d41bc Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@kernel.org>
Date: Thu, 18 Aug 2022 15:38:58 +0200
Subject: serial: document start_rx member at struct uart_ops

Fix this doc build warning:
	./include/linux/serial_core.h:397: warning: Function parameter or member 'start_rx' not described in 'uart_ops'

Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Link: https://lore.kernel.org/r/5d07ae2eec8fbad87e623160f9926b178bef2744.1660829433.git.mchehab@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_core.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index aef3145f2032..6e4f4765d209 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -141,6 +141,14 @@ struct gpio_desc;
  *	Locking: none.
  *	Interrupts: caller dependent.
  *
+ * @start_rx: ``void ()(struct uart_port *port)``
+ *
+ *	Start receiving characters.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
  * @stop_rx: ``void ()(struct uart_port *port)``
  *
  *	Stop receiving characters; the @port is in the process of being closed.
-- 
cgit 


From c1e5c2f0cb8a22ec2e14af92afc7006491bebabb Mon Sep 17 00:00:00 2001
From: Pablo Sun <pablo.sun@mediatek.com>
Date: Thu, 4 Aug 2022 11:48:03 +0800
Subject: usb: typec: altmodes/displayport: correct pin assignment for UFP
 receptacles

Fix incorrect pin assignment values when connecting to a monitor with
Type-C receptacle instead of a plug.

According to specification, an UFP_D receptacle's pin assignment
should came from the UFP_D pin assignments field (bit 23:16), while
an UFP_D plug's assignments are described in the DFP_D pin assignments
(bit 15:8) during Mode Discovery.

For example the LG 27 UL850-W is a monitor with Type-C receptacle.
The monitor responds to MODE DISCOVERY command with following
DisplayPort Capability flag:

        dp->alt->vdo=0x140045

The existing logic only take cares of UPF_D plug case,
and would take the bit 15:8 for this 0x140045 case.

This results in an non-existing pin assignment 0x0 in
dp_altmode_configure.

To fix this problem a new set of macros are introduced
to take plug/receptacle differences into consideration.

Fixes: 0e3bb7d6894d ("usb: typec: Add driver for DisplayPort alternate mode")
Cc: stable@vger.kernel.org
Co-developed-by: Pablo Sun <pablo.sun@mediatek.com>
Co-developed-by: Macpaul Lin <macpaul.lin@mediatek.com>
Reviewed-by: Guillaume Ranquet <granquet@baylibre.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Pablo Sun <pablo.sun@mediatek.com>
Signed-off-by: Macpaul Lin <macpaul.lin@mediatek.com>
Link: https://lore.kernel.org/r/20220804034803.19486-1-macpaul.lin@mediatek.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/typec_dp.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/typec_dp.h b/include/linux/usb/typec_dp.h
index cfb916cccd31..8d09c2f0a9b8 100644
--- a/include/linux/usb/typec_dp.h
+++ b/include/linux/usb/typec_dp.h
@@ -73,6 +73,11 @@ enum {
 #define DP_CAP_USB			BIT(7)
 #define DP_CAP_DFP_D_PIN_ASSIGN(_cap_)	(((_cap_) & GENMASK(15, 8)) >> 8)
 #define DP_CAP_UFP_D_PIN_ASSIGN(_cap_)	(((_cap_) & GENMASK(23, 16)) >> 16)
+/* Get pin assignment taking plug & receptacle into consideration */
+#define DP_CAP_PIN_ASSIGN_UFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \
+			DP_CAP_UFP_D_PIN_ASSIGN(_cap_) : DP_CAP_DFP_D_PIN_ASSIGN(_cap_))
+#define DP_CAP_PIN_ASSIGN_DFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \
+			DP_CAP_DFP_D_PIN_ASSIGN(_cap_) : DP_CAP_UFP_D_PIN_ASSIGN(_cap_))
 
 /* DisplayPort Status Update VDO bits */
 #define DP_STATUS_CONNECTION(_status_)	((_status_) & 3)
-- 
cgit 


From bdd1c37a315bc50ab14066c4852bc8dcf070451e Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Tue, 16 Aug 2022 20:53:21 +0800
Subject: KVM: Rename KVM_PRIVATE_MEM_SLOTS to KVM_INTERNAL_MEM_SLOTS

KVM_INTERNAL_MEM_SLOTS better reflects the fact those slots are KVM
internally used (invisible to userspace) and avoids confusion to future
private slots that can have different meaning.

Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Message-Id: <20220816125322.1110439-2-chao.p.peng@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c480b1821e1..fd5c3b4715f8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -656,12 +656,12 @@ struct kvm_irq_routing_table {
 };
 #endif
 
-#ifndef KVM_PRIVATE_MEM_SLOTS
-#define KVM_PRIVATE_MEM_SLOTS 0
+#ifndef KVM_INTERNAL_MEM_SLOTS
+#define KVM_INTERNAL_MEM_SLOTS 0
 #endif
 
 #define KVM_MEM_SLOTS_NUM SHRT_MAX
-#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_PRIVATE_MEM_SLOTS)
+#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS)
 
 #ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
-- 
cgit 


From 20ec3ebd707c77fb9b11b37193449193d4649f33 Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Tue, 16 Aug 2022 20:53:22 +0800
Subject: KVM: Rename mmu_notifier_* to mmu_invalidate_*

The motivation of this renaming is to make these variables and related
helper functions less mmu_notifier bound and can also be used for non
mmu_notifier based page invalidation. mmu_invalidate_* was chosen to
better describe the purpose of 'invalidating' a page that those
variables are used for.

  - mmu_notifier_seq/range_start/range_end are renamed to
    mmu_invalidate_seq/range_start/range_end.

  - mmu_notifier_retry{_hva} helper functions are renamed to
    mmu_invalidate_retry{_hva}.

  - mmu_notifier_count is renamed to mmu_invalidate_in_progress to
    avoid confusion with mn_active_invalidate_count.

  - While here, also update kvm_inc/dec_notifier_count() to
    kvm_mmu_invalidate_begin/end() to match the change for
    mmu_notifier_count.

No functional change intended.

Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Message-Id: <20220816125322.1110439-3-chao.p.peng@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 60 +++++++++++++++++++++++++-----------------------
 1 file changed, 31 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fd5c3b4715f8..f4519d3689e1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -765,10 +765,10 @@ struct kvm {
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	struct mmu_notifier mmu_notifier;
-	unsigned long mmu_notifier_seq;
-	long mmu_notifier_count;
-	unsigned long mmu_notifier_range_start;
-	unsigned long mmu_notifier_range_end;
+	unsigned long mmu_invalidate_seq;
+	long mmu_invalidate_in_progress;
+	unsigned long mmu_invalidate_range_start;
+	unsigned long mmu_invalidate_range_end;
 #endif
 	struct list_head devices;
 	u64 manual_dirty_log_protect;
@@ -1357,10 +1357,10 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 #endif
 
-void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
-				   unsigned long end);
-void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
-				   unsigned long end);
+void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
+			      unsigned long end);
+void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
+			    unsigned long end);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
@@ -1907,42 +1907,44 @@ extern const struct kvm_stats_header kvm_vcpu_stats_header;
 extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[];
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
-static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
+static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq)
 {
-	if (unlikely(kvm->mmu_notifier_count))
+	if (unlikely(kvm->mmu_invalidate_in_progress))
 		return 1;
 	/*
-	 * Ensure the read of mmu_notifier_count happens before the read
-	 * of mmu_notifier_seq.  This interacts with the smp_wmb() in
-	 * mmu_notifier_invalidate_range_end to make sure that the caller
-	 * either sees the old (non-zero) value of mmu_notifier_count or
-	 * the new (incremented) value of mmu_notifier_seq.
-	 * PowerPC Book3s HV KVM calls this under a per-page lock
-	 * rather than under kvm->mmu_lock, for scalability, so
-	 * can't rely on kvm->mmu_lock to keep things ordered.
+	 * Ensure the read of mmu_invalidate_in_progress happens before
+	 * the read of mmu_invalidate_seq.  This interacts with the
+	 * smp_wmb() in mmu_notifier_invalidate_range_end to make sure
+	 * that the caller either sees the old (non-zero) value of
+	 * mmu_invalidate_in_progress or the new (incremented) value of
+	 * mmu_invalidate_seq.
+	 *
+	 * PowerPC Book3s HV KVM calls this under a per-page lock rather
+	 * than under kvm->mmu_lock, for scalability, so can't rely on
+	 * kvm->mmu_lock to keep things ordered.
 	 */
 	smp_rmb();
-	if (kvm->mmu_notifier_seq != mmu_seq)
+	if (kvm->mmu_invalidate_seq != mmu_seq)
 		return 1;
 	return 0;
 }
 
-static inline int mmu_notifier_retry_hva(struct kvm *kvm,
-					 unsigned long mmu_seq,
-					 unsigned long hva)
+static inline int mmu_invalidate_retry_hva(struct kvm *kvm,
+					   unsigned long mmu_seq,
+					   unsigned long hva)
 {
 	lockdep_assert_held(&kvm->mmu_lock);
 	/*
-	 * If mmu_notifier_count is non-zero, then the range maintained by
-	 * kvm_mmu_notifier_invalidate_range_start contains all addresses that
-	 * might be being invalidated. Note that it may include some false
+	 * If mmu_invalidate_in_progress is non-zero, then the range maintained
+	 * by kvm_mmu_notifier_invalidate_range_start contains all addresses
+	 * that might be being invalidated. Note that it may include some false
 	 * positives, due to shortcuts when handing concurrent invalidations.
 	 */
-	if (unlikely(kvm->mmu_notifier_count) &&
-	    hva >= kvm->mmu_notifier_range_start &&
-	    hva < kvm->mmu_notifier_range_end)
+	if (unlikely(kvm->mmu_invalidate_in_progress) &&
+	    hva >= kvm->mmu_invalidate_range_start &&
+	    hva < kvm->mmu_invalidate_range_end)
 		return 1;
-	if (kvm->mmu_notifier_seq != mmu_seq)
+	if (kvm->mmu_invalidate_seq != mmu_seq)
 		return 1;
 	return 0;
 }
-- 
cgit 


From a357f7b4583ebf81d19c95aef57497ae81c5f63c Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 17 Aug 2022 23:20:08 +0800
Subject: ata: libata: Set __ATA_BASE_SHT max_sectors

Commit 0568e6122574 ("ata: libata-scsi: cap ata_device->max_sectors
according to shost->max_sectors") inadvertently capped the max_sectors
value for some SATA disks to a value which is lower than we would want.

For a device which supports LBA48, we would previously have request queue
max_sectors_kb and max_hw_sectors_kb values of 1280 and 32767 respectively.

For AHCI controllers, the value chosen for shost max sectors comes from
the minimum of the SCSI host default max sectors in
SCSI_DEFAULT_MAX_SECTORS (1024) and the shost DMA device mapping limit.

This means that we would now set the max_sectors_kb and max_hw_sectors_kb
values for a disk which supports LBA48 at 512, ignoring DMA mapping limit.

As report by Oliver at [0], this caused a performance regression.

Fix by picking a large enough max sectors value for ATA host controllers
such that we don't needlessly reduce max_sectors_kb for LBA48 disks.

[0] https://lore.kernel.org/linux-ide/YvsGbidf3na5FpGb@xsang-OptiPlex-9020/T/#m22d9fc5ad15af66066dd9fecf3d50f1b1ef11da3

Fixes: 0568e6122574 ("ata: libata-scsi: cap ata_device->max_sectors according to shost->max_sectors")
Reported-by: Oliver Sang <oliver.sang@intel.com>
Signed-off-by: John Garry <john.garry@huawei.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0269ff114f5a..698032e5ef2d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1382,7 +1382,8 @@ extern const struct attribute_group *ata_common_sdev_groups[];
 	.proc_name		= drv_name,			\
 	.slave_destroy		= ata_scsi_slave_destroy,	\
 	.bios_param		= ata_std_bios_param,		\
-	.unlock_native_capacity	= ata_scsi_unlock_native_capacity
+	.unlock_native_capacity	= ata_scsi_unlock_native_capacity,\
+	.max_sectors		= ATA_MAX_SECTORS_LBA48
 
 #define ATA_SUBBASE_SHT(drv_name)				\
 	__ATA_BASE_SHT(drv_name),				\
-- 
cgit 


From 5535be3099717646781ce1540cf725965d680e7b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 9 Aug 2022 22:56:40 +0200
Subject: mm/gup: fix FOLL_FORCE COW security issue and remove FOLL_COW

Ever since the Dirty COW (CVE-2016-5195) security issue happened, we know
that FOLL_FORCE can be possibly dangerous, especially if there are races
that can be exploited by user space.

Right now, it would be sufficient to have some code that sets a PTE of a
R/O-mapped shared page dirty, in order for it to erroneously become
writable by FOLL_FORCE.  The implications of setting a write-protected PTE
dirty might not be immediately obvious to everyone.

And in fact ever since commit 9ae0f87d009c ("mm/shmem: unconditionally set
pte dirty in mfill_atomic_install_pte"), we can use UFFDIO_CONTINUE to map
a shmem page R/O while marking the pte dirty.  This can be used by
unprivileged user space to modify tmpfs/shmem file content even if the
user does not have write permissions to the file, and to bypass memfd
write sealing -- Dirty COW restricted to tmpfs/shmem (CVE-2022-2590).

To fix such security issues for good, the insight is that we really only
need that fancy retry logic (FOLL_COW) for COW mappings that are not
writable (!VM_WRITE).  And in a COW mapping, we really only broke COW if
we have an exclusive anonymous page mapped.  If we have something else
mapped, or the mapped anonymous page might be shared (!PageAnonExclusive),
we have to trigger a write fault to break COW.  If we don't find an
exclusive anonymous page when we retry, we have to trigger COW breaking
once again because something intervened.

Let's move away from this mandatory-retry + dirty handling and rely on our
PageAnonExclusive() flag for making a similar decision, to use the same
COW logic as in other kernel parts here as well.  In case we stumble over
a PTE in a COW mapping that does not map an exclusive anonymous page, COW
was not properly broken and we have to trigger a fake write-fault to break
COW.

Just like we do in can_change_pte_writable() added via commit 64fe24a3e05e
("mm/mprotect: try avoiding write faults for exclusive anonymous pages
when changing protection") and commit 76aefad628aa ("mm/mprotect: fix
soft-dirty check in can_change_pte_writable()"), take care of softdirty
and uffd-wp manually.

For example, a write() via /proc/self/mem to a uffd-wp-protected range has
to fail instead of silently granting write access and bypassing the
userspace fault handler.  Note that FOLL_FORCE is not only used for debug
access, but also triggered by applications without debug intentions, for
example, when pinning pages via RDMA.

This fixes CVE-2022-2590. Note that only x86_64 and aarch64 are
affected, because only those support CONFIG_HAVE_ARCH_USERFAULTFD_MINOR.

Fortunately, FOLL_COW is no longer required to handle FOLL_FORCE. So
let's just get rid of it.

Thanks to Nadav Amit for pointing out that the pte_dirty() check in
FOLL_FORCE code is problematic and might be exploitable.

Note 1: We don't check for the PTE being dirty because it doesn't matter
	for making a "was COWed" decision anymore, and whoever modifies the
	page has to set the page dirty either way.

Note 2: Kernels before extended uffd-wp support and before
	PageAnonExclusive (< 5.19) can simply revert the problematic
	commit instead and be safe regarding UFFDIO_CONTINUE. A backport to
	v5.19 requires minor adjustments due to lack of
	vma_soft_dirty_enabled().

Link: https://lkml.kernel.org/r/20220809205640.70916-1-david@redhat.com
Fixes: 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte")
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: <stable@vger.kernel.org>	[5.16]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3bedc449c14d..982f2607180b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2885,7 +2885,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
 #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
 #define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
-#define FOLL_COW	0x4000	/* internal GUP flag */
 #define FOLL_ANON	0x8000	/* don't do file mappings */
 #define FOLL_LONGTERM	0x10000	/* mapping lifetime is indefinite: see below */
 #define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */
-- 
cgit 


From a39c5d3ce03dd890ab6a9be44b21177cec32da55 Mon Sep 17 00:00:00 2001
From: Hao Lee <haolee.swjtu@gmail.com>
Date: Sun, 7 Aug 2022 15:44:42 +0000
Subject: mm: add DEVICE_ZONE to FOR_ALL_ZONES

FOR_ALL_ZONES should be consistent with enum zone_type.  Otherwise,
__count_zid_vm_events have the potential to add count to wrong item when
zid is ZONE_DEVICE.

Link: https://lkml.kernel.org/r/20220807154442.GA18167@haolee.io
Signed-off-by: Hao Lee <haolee.swjtu@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vm_event_item.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 404024486fa5..f3fc36cd2276 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -20,12 +20,19 @@
 #define HIGHMEM_ZONE(xx)
 #endif
 
-#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
+#ifdef CONFIG_ZONE_DEVICE
+#define DEVICE_ZONE(xx) xx##_DEVICE,
+#else
+#define DEVICE_ZONE(xx)
+#endif
+
+#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, \
+	HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx)
 
 enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
-		FOR_ALL_ZONES(PGALLOC),
-		FOR_ALL_ZONES(ALLOCSTALL),
-		FOR_ALL_ZONES(PGSCAN_SKIP),
+		FOR_ALL_ZONES(PGALLOC)
+		FOR_ALL_ZONES(ALLOCSTALL)
+		FOR_ALL_ZONES(PGSCAN_SKIP)
 		PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
 		PGFAULT, PGMAJFAULT,
 		PGLAZYFREED,
-- 
cgit 


From f369b07c861435bd812a9d14493f71b34132ed6f Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Thu, 11 Aug 2022 16:13:40 -0400
Subject: mm/uffd: reset write protection when unregister with wp-mode

The motivation of this patch comes from a recent report and patchfix from
David Hildenbrand on hugetlb shared handling of wr-protected page [1].

With the reproducer provided in commit message of [1], one can leverage
the uffd-wp lazy-reset of ptes to trigger a hugetlb issue which can affect
not only the attacker process, but also the whole system.

The lazy-reset mechanism of uffd-wp was used to make unregister faster,
meanwhile it has an assumption that any leftover pgtable entries should
only affect the process on its own, so not only the user should be aware
of anything it does, but also it should not affect outside of the process.

But it seems that this is not true, and it can also be utilized to make
some exploit easier.

So far there's no clue showing that the lazy-reset is important to any
userfaultfd users because normally the unregister will only happen once
for a specific range of memory of the lifecycle of the process.

Considering all above, what this patch proposes is to do explicit pte
resets when unregister an uffd region with wr-protect mode enabled.

It should be the same as calling ioctl(UFFDIO_WRITEPROTECT, wp=false)
right before ioctl(UFFDIO_UNREGISTER) for the user.  So potentially it'll
make the unregister slower.  From that pov it's a very slight abi change,
but hopefully nothing should break with this change either.

Regarding to the change itself - core of uffd write [un]protect operation
is moved into a separate function (uffd_wp_range()) and it is reused in
the unregister code path.

Note that the new function will not check for anything, e.g.  ranges or
memory types, because they should have been checked during the previous
UFFDIO_REGISTER or it should have failed already.  It also doesn't check
mmap_changing because we're with mmap write lock held anyway.

I added a Fixes upon introducing of uffd-wp shmem+hugetlbfs because that's
the only issue reported so far and that's the commit David's reproducer
will start working (v5.19+).  But the whole idea actually applies to not
only file memories but also anonymous.  It's just that we don't need to
fix anonymous prior to v5.19- because there's no known way to exploit.

IOW, this patch can also fix the issue reported in [1] as the patch 2 does.

[1] https://lore.kernel.org/all/20220811103435.188481-3-david@redhat.com/

Link: https://lkml.kernel.org/r/20220811201340.39342-1-peterx@redhat.com
Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/userfaultfd_k.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 732b522bacb7..e1b8a915e9e9 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -73,6 +73,8 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp, atomic_t *mmap_changing);
+extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
+			  unsigned long start, unsigned long len, bool enable_wp);
 
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
-- 
cgit 


From cb241339b9d020c758a6647c69f8e42538c5cf88 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 10 Aug 2022 21:51:09 -0700
Subject: mm/shmem: fix chattr fsflags support in tmpfs

ext[234] have always allowed unimplemented chattr flags to be set, but
other filesystems have tended to be stricter.  Follow the stricter
approach for tmpfs: I don't want to have to explain why csu attributes
don't actually work, and we won't need to update the chattr(1) manpage;
and it's never wrong to start off strict, relaxing later if persuaded.
Allow only a (append only) i (immutable) A (no atime) and d (no dump).

Although lsattr showed 'A' inherited, the NOATIME behavior was not being
inherited: because nothing sync'ed FS_NOATIME_FL to S_NOATIME.  Add
shmem_set_inode_flags() to sync the flags, using inode_set_flags() to
avoid that instant of lost immutablility during fileattr_set().

But that change switched generic/079 from passing to failing: because
FS_IMMUTABLE_FL and FS_APPEND_FL had been unconventionally included in the
INHERITED fsflags: remove them and generic/079 is back to passing.

Link: https://lkml.kernel.org/r/2961dcb0-ddf3-b9f0-3268-12a4ff996856@google.com
Fixes: e408e695f5f1 ("mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs")
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Radoslaw Burny <rburny@google.com>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 1b6c4013f691..ff0b990de83d 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -29,15 +29,10 @@ struct shmem_inode_info {
 	struct inode		vfs_inode;
 };
 
-#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE
-#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE
-#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE
-
-/* Flags that are appropriate for regular files (all but dir-specific ones). */
-#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
-
-/* Flags that are appropriate for non-directories/regular files. */
-#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define SHMEM_FL_USER_VISIBLE		FS_FL_USER_VISIBLE
+#define SHMEM_FL_USER_MODIFIABLE \
+	(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL)
+#define SHMEM_FL_INHERITED		(FS_NODUMP_FL | FS_NOATIME_FL)
 
 struct shmem_sb_info {
 	unsigned long max_blocks;   /* How many blocks are allowed */
-- 
cgit 


From d59b73a66e5e0682442b6d7b4965364e57078b80 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Wed, 3 Aug 2022 10:49:23 +0300
Subject: net/mlx5: Avoid false positive lockdep warning by adding
 lock_class_key

Add a lock_class_key per mlx5 device to avoid a false positive
"possible circular locking dependency" warning by lockdep, on flows
which lock more than one mlx5 device, such as adding SF.

kernel log:
 ======================================================
 WARNING: possible circular locking dependency detected
 5.19.0-rc8+ #2 Not tainted
 ------------------------------------------------------
 kworker/u20:0/8 is trying to acquire lock:
 ffff88812dfe0d98 (&dev->intf_state_mutex){+.+.}-{3:3}, at: mlx5_init_one+0x2e/0x490 [mlx5_core]

 but task is already holding lock:
 ffff888101aa7898 (&(&notifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130

 which lock already depends on the new lock.

 the existing dependency chain (in reverse order) is:

 -> #1 (&(&notifier->n_head)->rwsem){++++}-{3:3}:
        down_write+0x90/0x150
        blocking_notifier_chain_register+0x53/0xa0
        mlx5_sf_table_init+0x369/0x4a0 [mlx5_core]
        mlx5_init_one+0x261/0x490 [mlx5_core]
        probe_one+0x430/0x680 [mlx5_core]
        local_pci_probe+0xd6/0x170
        work_for_cpu_fn+0x4e/0xa0
        process_one_work+0x7c2/0x1340
        worker_thread+0x6f6/0xec0
        kthread+0x28f/0x330
        ret_from_fork+0x1f/0x30

 -> #0 (&dev->intf_state_mutex){+.+.}-{3:3}:
        __lock_acquire+0x2fc7/0x6720
        lock_acquire+0x1c1/0x550
        __mutex_lock+0x12c/0x14b0
        mlx5_init_one+0x2e/0x490 [mlx5_core]
        mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core]
        auxiliary_bus_probe+0x9d/0xe0
        really_probe+0x1e0/0xaa0
        __driver_probe_device+0x219/0x480
        driver_probe_device+0x49/0x130
        __device_attach_driver+0x1b8/0x280
        bus_for_each_drv+0x123/0x1a0
        __device_attach+0x1a3/0x460
        bus_probe_device+0x1a2/0x260
        device_add+0x9b1/0x1b40
        __auxiliary_device_add+0x88/0xc0
        mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core]
        blocking_notifier_call_chain+0xd5/0x130
        mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core]
        process_one_work+0x7c2/0x1340
        worker_thread+0x59d/0xec0
        kthread+0x28f/0x330
        ret_from_fork+0x1f/0x30

  other info that might help us debug this:

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&(&notifier->n_head)->rwsem);
                                lock(&dev->intf_state_mutex);
                                lock(&(&notifier->n_head)->rwsem);
   lock(&dev->intf_state_mutex);

  *** DEADLOCK ***

 4 locks held by kworker/u20:0/8:
  #0: ffff888150612938 ((wq_completion)mlx5_events){+.+.}-{0:0}, at: process_one_work+0x6e2/0x1340
  #1: ffff888100cafdb8 ((work_completion)(&work->work)#3){+.+.}-{0:0}, at: process_one_work+0x70f/0x1340
  #2: ffff888101aa7898 (&(&notifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130
  #3: ffff88813682d0e8 (&dev->mutex){....}-{3:3}, at:__device_attach+0x76/0x460

 stack backtrace:
 CPU: 6 PID: 8 Comm: kworker/u20:0 Not tainted 5.19.0-rc8+
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
 Workqueue: mlx5_events mlx5_vhca_state_work_handler [mlx5_core]
 Call Trace:
  <TASK>
  dump_stack_lvl+0x57/0x7d
  check_noncircular+0x278/0x300
  ? print_circular_bug+0x460/0x460
  ? lock_chain_count+0x20/0x20
  ? register_lock_class+0x1880/0x1880
  __lock_acquire+0x2fc7/0x6720
  ? register_lock_class+0x1880/0x1880
  ? register_lock_class+0x1880/0x1880
  lock_acquire+0x1c1/0x550
  ? mlx5_init_one+0x2e/0x490 [mlx5_core]
  ? lockdep_hardirqs_on_prepare+0x400/0x400
  __mutex_lock+0x12c/0x14b0
  ? mlx5_init_one+0x2e/0x490 [mlx5_core]
  ? mlx5_init_one+0x2e/0x490 [mlx5_core]
  ? _raw_read_unlock+0x1f/0x30
  ? mutex_lock_io_nested+0x1320/0x1320
  ? __ioremap_caller.constprop.0+0x306/0x490
  ? mlx5_sf_dev_probe+0x269/0x370 [mlx5_core]
  ? iounmap+0x160/0x160
  mlx5_init_one+0x2e/0x490 [mlx5_core]
  mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core]
  ? mlx5_sf_dev_remove+0x130/0x130 [mlx5_core]
  auxiliary_bus_probe+0x9d/0xe0
  really_probe+0x1e0/0xaa0
  __driver_probe_device+0x219/0x480
  ? auxiliary_match_id+0xe9/0x140
  driver_probe_device+0x49/0x130
  __device_attach_driver+0x1b8/0x280
  ? driver_allows_async_probing+0x140/0x140
  bus_for_each_drv+0x123/0x1a0
  ? bus_for_each_dev+0x1a0/0x1a0
  ? lockdep_hardirqs_on_prepare+0x286/0x400
  ? trace_hardirqs_on+0x2d/0x100
  __device_attach+0x1a3/0x460
  ? device_driver_attach+0x1e0/0x1e0
  ? kobject_uevent_env+0x22d/0xf10
  bus_probe_device+0x1a2/0x260
  device_add+0x9b1/0x1b40
  ? dev_set_name+0xab/0xe0
  ? __fw_devlink_link_to_suppliers+0x260/0x260
  ? memset+0x20/0x40
  ? lockdep_init_map_type+0x21a/0x7d0
  __auxiliary_device_add+0x88/0xc0
  ? auxiliary_device_init+0x86/0xa0
  mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core]
  blocking_notifier_call_chain+0xd5/0x130
  mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core]
  ? mlx5_vhca_event_arm+0x100/0x100 [mlx5_core]
  ? lock_downgrade+0x6e0/0x6e0
  ? lockdep_hardirqs_on_prepare+0x286/0x400
  process_one_work+0x7c2/0x1340
  ? lockdep_hardirqs_on_prepare+0x400/0x400
  ? pwq_dec_nr_in_flight+0x230/0x230
  ? rwlock_bug.part.0+0x90/0x90
  worker_thread+0x59d/0xec0
  ? process_one_work+0x1340/0x1340
  kthread+0x28f/0x330
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork+0x1f/0x30
  </TASK>

Fixes: 6a3273217469 ("net/mlx5: SF, Port function state change support")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 96b16fbe1aa4..7b7ce602c808 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -779,6 +779,7 @@ struct mlx5_core_dev {
 	enum mlx5_device_state	state;
 	/* sync interface state */
 	struct mutex		intf_state_mutex;
+	struct lock_class_key	lock_key;
 	unsigned long		intf_state;
 	struct mlx5_priv	priv;
 	struct mlx5_profile	profile;
-- 
cgit 


From 13a8e0f6b01b14b2e28ba144e112c883f03a3db2 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Fri, 19 Aug 2022 15:16:11 -0700
Subject: Revert "driver core: Delete driver_deferred_probe_check_state()"

This reverts commit 9cbffc7a59561be950ecc675d19a3d2b45202b2b.

There are a few more issues to fix that have been reported in the thread
for the original series [1]. We'll need to fix those before this will work.
So, revert it for now.

[1] - https://lore.kernel.org/lkml/20220601070707.3946847-1-saravanak@google.com/

Fixes: 9cbffc7a5956 ("driver core: Delete driver_deferred_probe_check_state()")
Tested-by: Tony Lindgren <tony@atomide.com>
Tested-by: Peng Fan <peng.fan@nxp.com>
Tested-by: Douglas Anderson <dianders@chromium.org>
Tested-by: Alexander Stein <alexander.stein@ew.tq-group.com>
Reviewed-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20220819221616.2107893-2-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index 7acaabde5396..2114d65b862f 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -242,6 +242,7 @@ driver_find_device_by_acpi_dev(struct device_driver *drv, const void *adev)
 
 extern int driver_deferred_probe_timeout;
 void driver_deferred_probe_add(struct device *dev);
+int driver_deferred_probe_check_state(struct device *dev);
 void driver_init(void);
 
 /**
-- 
cgit 


From 7997eff82828304b780dc0a39707e1946d6f1ebf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 20 Aug 2022 17:38:37 +0200
Subject: netfilter: ebtables: reject blobs that don't provide all entry points

Harshit Mogalapalli says:
 In ebt_do_table() function dereferencing 'private->hook_entry[hook]'
 can lead to NULL pointer dereference. [..] Kernel panic:

general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f]
[..]
RIP: 0010:ebt_do_table+0x1dc/0x1ce0
Code: 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 5c 16 00 00 48 b8 00 00 00 00 00 fc ff df 49 8b 6c df 08 48 8d 7d 2c 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 88
[..]
Call Trace:
 nf_hook_slow+0xb1/0x170
 __br_forward+0x289/0x730
 maybe_deliver+0x24b/0x380
 br_flood+0xc6/0x390
 br_dev_xmit+0xa2e/0x12c0

For some reason ebtables rejects blobs that provide entry points that are
not supported by the table, but what it should instead reject is the
opposite: blobs that DO NOT provide an entry point supported by the table.

t->valid_hooks is the bitmask of hooks (input, forward ...) that will see
packets.  Providing an entry point that is not support is harmless
(never called/used), but the inverse isn't: it results in a crash
because the ebtables traverser doesn't expect a NULL blob for a location
its receiving packets for.

Instead of fixing all the individual checks, do what iptables is doing and
reject all blobs that differ from the expected hooks.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com>
Reported-by: syzkaller <syzkaller@googlegroups.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter_bridge/ebtables.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index a13296d6c7ce..fd533552a062 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -94,10 +94,6 @@ struct ebt_table {
 	struct ebt_replace_kernel *table;
 	unsigned int valid_hooks;
 	rwlock_t lock;
-	/* e.g. could be the table explicitly only allows certain
-	 * matches, targets, ... 0 == let it in */
-	int (*check)(const struct ebt_table_info *info,
-	   unsigned int valid_hooks);
 	/* the data used by the kernel */
 	struct ebt_table_info *private;
 	struct nf_hook_ops *ops;
-- 
cgit 


From af67508ea6cbf0e4ea27f8120056fa2efce127dd Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 23 Aug 2022 10:46:56 -0700
Subject: net: Fix data-races around sysctl_fb_tunnels_only_for_init_net.

While reading sysctl_fb_tunnels_only_for_init_net, it can be changed
concurrently.  Thus, we need to add READ_ONCE() to its readers.

Fixes: 79134e6ce2c9 ("net: do not create fallback tunnels for non-default namespaces")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1a3cb93c3dcc..6d3a33fd0cdb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -640,9 +640,14 @@ extern int sysctl_devconf_inherit_init_net;
  */
 static inline bool net_has_fallback_tunnels(const struct net *net)
 {
-	return !IS_ENABLED(CONFIG_SYSCTL) ||
-	       !sysctl_fb_tunnels_only_for_init_net ||
-	       (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1);
+#if IS_ENABLED(CONFIG_SYSCTL)
+	int fb_tunnels_only_for_init_net = READ_ONCE(sysctl_fb_tunnels_only_for_init_net);
+
+	return !fb_tunnels_only_for_init_net ||
+		(net_eq(net, &init_net) && fb_tunnels_only_for_init_net == 1);
+#else
+	return true;
+#endif
 }
 
 static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
-- 
cgit 


From a5612ca10d1aa05624ebe72633e0c8c792970833 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 23 Aug 2022 10:46:57 -0700
Subject: net: Fix data-races around sysctl_devconf_inherit_init_net.

While reading sysctl_devconf_inherit_init_net, it can be changed
concurrently.  Thus, we need to add READ_ONCE() to its readers.

Fixes: 856c395cfa63 ("net: introduce a knob to control whether to inherit devconf config")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6d3a33fd0cdb..05d6f3facd5a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -650,6 +650,15 @@ static inline bool net_has_fallback_tunnels(const struct net *net)
 #endif
 }
 
+static inline int net_inherit_devconf(void)
+{
+#if IS_ENABLED(CONFIG_SYSCTL)
+	return READ_ONCE(sysctl_devconf_inherit_init_net);
+#else
+	return 0;
+#endif
+}
+
 static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
 {
 #if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
-- 
cgit 


From 2a5840124009f133bd09fd855963551fb2cefe22 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 15 Jul 2022 12:16:22 -0700
Subject: lsm,io_uring: add LSM hooks for the new uring_cmd file op

io-uring cmd support was added through ee692a21e9bf ("fs,io_uring:
add infrastructure for uring-cmd"), this extended the struct
file_operations to allow a new command which each subsystem can use
to enable command passthrough. Add an LSM specific for the command
passthrough which enables LSMs to inspect the command details.

This was discussed long ago without no clear pointer for something
conclusive, so this enables LSMs to at least reject this new file
operation.

[0] https://lkml.kernel.org/r/8adf55db-7bab-f59d-d612-ed906b948d19@schaufler-ca.com

Cc: stable@vger.kernel.org
Fixes: ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd")
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 1 +
 include/linux/lsm_hooks.h     | 3 +++
 include/linux/security.h      | 5 +++++
 3 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 806448173033..60fff133c0b1 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -407,4 +407,5 @@ LSM_HOOK(int, 0, perf_event_write, struct perf_event *event)
 #ifdef CONFIG_IO_URING
 LSM_HOOK(int, 0, uring_override_creds, const struct cred *new)
 LSM_HOOK(int, 0, uring_sqpoll, void)
+LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd)
 #endif /* CONFIG_IO_URING */
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 84a0d7e02176..3aa6030302f5 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1582,6 +1582,9 @@
  *      Check whether the current task is allowed to spawn a io_uring polling
  *      thread (IORING_SETUP_SQPOLL).
  *
+ * @uring_cmd:
+ *      Check whether the file_operations uring_cmd is allowed to run.
+ *
  */
 union security_list_options {
 	#define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__);
diff --git a/include/linux/security.h b/include/linux/security.h
index 1bc362cb413f..7bd0c490703d 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2060,6 +2060,7 @@ static inline int security_perf_event_write(struct perf_event *event)
 #ifdef CONFIG_SECURITY
 extern int security_uring_override_creds(const struct cred *new);
 extern int security_uring_sqpoll(void);
+extern int security_uring_cmd(struct io_uring_cmd *ioucmd);
 #else
 static inline int security_uring_override_creds(const struct cred *new)
 {
@@ -2069,6 +2070,10 @@ static inline int security_uring_sqpoll(void)
 {
 	return 0;
 }
+static inline int security_uring_cmd(struct io_uring_cmd *ioucmd)
+{
+	return 0;
+}
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_IO_URING */
 
-- 
cgit 


From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 26 Aug 2022 09:17:08 -0400
Subject: wait_on_bit: add an acquire memory barrier

There are several places in the kernel where wait_on_bit is not followed
by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read).

On architectures with weak memory ordering, it may happen that memory
accesses that follow wait_on_bit are reordered before wait_on_bit and
they may return invalid data.

Fix this class of bugs by introducing a new function "test_bit_acquire"
that works like test_bit, but has acquire memory ordering semantics.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Acked-by: Will Deacon <will@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitops.h      | 1 +
 include/linux/buffer_head.h | 2 +-
 include/linux/wait_bit.h    | 8 ++++----
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf9bf65039f2..3b89c64bcfd8 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w);
 #define __test_and_clear_bit(nr, addr)	bitop(___test_and_clear_bit, nr, addr)
 #define __test_and_change_bit(nr, addr)	bitop(___test_and_change_bit, nr, addr)
 #define test_bit(nr, addr)		bitop(_test_bit, nr, addr)
+#define test_bit_acquire(nr, addr)	bitop(_test_bit_acquire, nr, addr)
 
 /*
  * Include this here because some architectures need generic_ffs/fls in
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index def8b8d30ccc..089c9ade4325 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh)
 	 * make it consistent with folio_test_uptodate
 	 * pairs with smp_mb__before_atomic in set_buffer_uptodate
 	 */
-	return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0;
+	return test_bit_acquire(BH_Uptodate, &bh->b_state);
 }
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 7dec36aecbd9..7725b7579b78 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -71,7 +71,7 @@ static inline int
 wait_on_bit(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit(word, bit,
 				       bit_wait,
@@ -96,7 +96,7 @@ static inline int
 wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit(word, bit,
 				       bit_wait_io,
@@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
 		    unsigned long timeout)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit_timeout(word, bit,
 					       bit_wait_timeout,
@@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
 		   unsigned mode)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit(word, bit, action, mode);
 }
-- 
cgit 


From fcab34b433e2c13e333b2f53c4a8409eadc432c7 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 10 Aug 2022 10:53:59 -0600
Subject: mm: re-allow pinning of zero pfns (again)

The below referenced commit makes the same error as 1c563432588d ("mm: fix
is_pinnable_page against a cma page"), re-interpreting the logic to
exclude pinning of the zero page, which breaks device assignment with
vfio.

To avoid further subtle mistakes, split the logic into discrete tests.

[akpm@linux-foundation.org: simplify comment, per John]
Link: https://lkml.kernel.org/r/166015037385.760108.16881097713975517242.stgit@omen
Link: https://lore.kernel.org/all/165490039431.944052.12458624139225785964.stgit@omen
Fixes: f25cbb7a95a2 ("mm: add zone device coherent type memory support")
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Suggested-by: Felix Kuehling <felix.kuehling@amd.com>
Tested-by: Slawomir Laba <slawomirx.laba@intel.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 982f2607180b..21f8b27bd9fd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1544,9 +1544,16 @@ static inline bool is_longterm_pinnable_page(struct page *page)
 	if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
 		return false;
 #endif
-	return !(is_device_coherent_page(page) ||
-		 is_zone_movable_page(page) ||
-		 is_zero_pfn(page_to_pfn(page)));
+	/* The zero page may always be pinned */
+	if (is_zero_pfn(page_to_pfn(page)))
+		return true;
+
+	/* Coherent device memory must always allow eviction. */
+	if (is_device_coherent_page(page))
+		return false;
+
+	/* Otherwise, non-movable zone pages can be pinned. */
+	return !is_zone_movable_page(page);
 }
 #else
 static inline bool is_longterm_pinnable_page(struct page *page)
-- 
cgit 


From dbb16df6443c59e8a1ef21c2272fcf387d600ddf Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Wed, 17 Aug 2022 17:21:39 +0000
Subject: Revert "memcg: cleanup racy sum avoidance code"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 96e51ccf1af33e82f429a0d6baebba29c6448d0f.

Recently we started running the kernel with rstat infrastructure on
production traffic and begin to see negative memcg stats values.
Particularly the 'sock' stat is the one which we observed having negative
value.

$ grep "sock " /mnt/memory/job/memory.stat
sock 253952
total_sock 18446744073708724224

Re-run after couple of seconds

$ grep "sock " /mnt/memory/job/memory.stat
sock 253952
total_sock 53248

For now we are only seeing this issue on large machines (256 CPUs) and
only with 'sock' stat.  I think the networking stack increase the stat on
one cpu and decrease it on another cpu much more often.  So, this negative
sock is due to rstat flusher flushing the stats on the CPU that has seen
the decrement of sock but missed the CPU that has increments.  A typical
race condition.

For easy stable backport, revert is the most simple solution.  For long
term solution, I am thinking of two directions.  First is just reduce the
race window by optimizing the rstat flusher.  Second is if the reader sees
a negative stat value, force flush and restart the stat collection.
Basically retry but limited.

Link: https://lkml.kernel.org/r/20220817172139.3141101-1-shakeelb@google.com
Fixes: 96e51ccf1af33e8 ("memcg: cleanup racy sum avoidance code")
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: <stable@vger.kernel.org>	[5.15]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4d31ce55b1c0..6257867fbf95 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -987,19 +987,30 @@ static inline void mod_memcg_page_state(struct page *page,
 
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 {
-	return READ_ONCE(memcg->vmstats.state[idx]);
+	long x = READ_ONCE(memcg->vmstats.state[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
 }
 
 static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
 					      enum node_stat_item idx)
 {
 	struct mem_cgroup_per_node *pn;
+	long x;
 
 	if (mem_cgroup_disabled())
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	return READ_ONCE(pn->lruvec_stats.state[idx]);
+	x = READ_ONCE(pn->lruvec_stats.state[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
 }
 
 static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
-- 
cgit 


From dcf8e5633e2e69ad60b730ab5905608b756a032f Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 23 Aug 2022 12:59:25 -0700
Subject: tracing: Define the is_signed_type() macro once

There are two definitions of the is_signed_type() macro: one in
<linux/overflow.h> and a second definition in <linux/trace_events.h>.

As suggested by Linus, move the definition of the is_signed_type() macro
into the <linux/compiler.h> header file.  Change the definition of the
is_signed_type() macro to make sure that it does not trigger any sparse
warnings with future versions of sparse for bitwise types.

Link: https://lore.kernel.org/all/CAHk-=whjH6p+qzwUdx5SOVVHjS3WvzJQr6mDUwhEyTf6pJWzaQ@mail.gmail.com/
Link: https://lore.kernel.org/all/CAHk-=wjQGnVfb4jehFR0XyZikdQvCZouE96xR_nnf5kqaM5qqQ@mail.gmail.com/
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler.h     | 6 ++++++
 include/linux/overflow.h     | 1 -
 include/linux/trace_events.h | 2 --
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 01ce94b58b42..7713d7bcdaea 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -239,6 +239,12 @@ static inline void *offset_to_ptr(const int *off)
 /* &a[0] degrades to a pointer: a different type from an array */
 #define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
 
+/*
+ * Whether 'type' is a signed type or an unsigned type. Supports scalar types,
+ * bool and also pointer types.
+ */
+#define is_signed_type(type) (((type)(-1)) < (__force type)1)
+
 /*
  * This is needed in functions which generate the stack canary, see
  * arch/x86/kernel/smpboot.c::start_secondary() for an example.
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index f1221d11f8e5..0eb3b192f07a 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -30,7 +30,6 @@
  * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
  * credit to Christian Biere.
  */
-#define is_signed_type(type)       (((type)(-1)) < (type)1)
 #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
 #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
 #define type_min(T) ((T)((T)-type_max(T)-(T)1))
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index b18759a673c6..8401dec93c15 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -814,8 +814,6 @@ extern int trace_add_event_call(struct trace_event_call *call);
 extern int trace_remove_event_call(struct trace_event_call *call);
 extern int trace_event_get_offsets(struct trace_event_call *call);
 
-#define is_signed_type(type)	(((type)(-1)) < (type)1)
-
 int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
 int trace_set_clr_event(const char *system, const char *event, int set);
 int trace_array_set_clr_event(struct trace_array *tr, const char *system,
-- 
cgit 


From 9c6d778800b921bde3bff3cff5003d1650f942d1 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Fri, 26 Aug 2022 15:31:32 -0400
Subject: USB: core: Prevent nested device-reset calls

Automatic kernel fuzzing revealed a recursive locking violation in
usb-storage:

============================================
WARNING: possible recursive locking detected
5.18.0 #3 Not tainted
--------------------------------------------
kworker/1:3/1205 is trying to acquire lock:
ffff888018638db8 (&us_interface_key[i]){+.+.}-{3:3}, at:
usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230

but task is already holding lock:
ffff888018638db8 (&us_interface_key[i]){+.+.}-{3:3}, at:
usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230

...

stack backtrace:
CPU: 1 PID: 1205 Comm: kworker/1:3 Not tainted 5.18.0 #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.13.0-1ubuntu1.1 04/01/2014
Workqueue: usb_hub_wq hub_event
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_deadlock_bug kernel/locking/lockdep.c:2988 [inline]
check_deadlock kernel/locking/lockdep.c:3031 [inline]
validate_chain kernel/locking/lockdep.c:3816 [inline]
__lock_acquire.cold+0x152/0x3ca kernel/locking/lockdep.c:5053
lock_acquire kernel/locking/lockdep.c:5665 [inline]
lock_acquire+0x1ab/0x520 kernel/locking/lockdep.c:5630
__mutex_lock_common kernel/locking/mutex.c:603 [inline]
__mutex_lock+0x14f/0x1610 kernel/locking/mutex.c:747
usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230
usb_reset_device+0x37d/0x9a0 drivers/usb/core/hub.c:6109
r871xu_dev_remove+0x21a/0x270 drivers/staging/rtl8712/usb_intf.c:622
usb_unbind_interface+0x1bd/0x890 drivers/usb/core/driver.c:458
device_remove drivers/base/dd.c:545 [inline]
device_remove+0x11f/0x170 drivers/base/dd.c:537
__device_release_driver drivers/base/dd.c:1222 [inline]
device_release_driver_internal+0x1a7/0x2f0 drivers/base/dd.c:1248
usb_driver_release_interface+0x102/0x180 drivers/usb/core/driver.c:627
usb_forced_unbind_intf+0x4d/0xa0 drivers/usb/core/driver.c:1118
usb_reset_device+0x39b/0x9a0 drivers/usb/core/hub.c:6114

This turned out not to be an error in usb-storage but rather a nested
device reset attempt.  That is, as the rtl8712 driver was being
unbound from a composite device in preparation for an unrelated USB
reset (that driver does not have pre_reset or post_reset callbacks),
its ->remove routine called usb_reset_device() -- thus nesting one
reset call within another.

Performing a reset as part of disconnect processing is a questionable
practice at best.  However, the bug report points out that the USB
core does not have any protection against nested resets.  Adding a
reset_in_progress flag and testing it will prevent such errors in the
future.

Link: https://lore.kernel.org/all/CAB7eexKUpvX-JNiLzhXBDWgfg2T9e9_0Tw4HQ6keN==voRbP0g@mail.gmail.com/
Cc: stable@vger.kernel.org
Reported-and-tested-by: Rondreis <linhaoguo86@gmail.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/YwkflDxvg0KWqyZK@rowland.harvard.edu
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index f7a9914fc97f..9ff1ad4dfad1 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -575,6 +575,7 @@ struct usb3_lpm_parameters {
  * @devaddr: device address, XHCI: assigned by HW, others: same as devnum
  * @can_submit: URBs may be submitted
  * @persist_enabled:  USB_PERSIST enabled for this device
+ * @reset_in_progress: the device is being reset
  * @have_langid: whether string_langid is valid
  * @authorized: policy has said we can use it;
  *	(user space) policy determines if we authorize this device to be
@@ -662,6 +663,7 @@ struct usb_device {
 
 	unsigned can_submit:1;
 	unsigned persist_enabled:1;
+	unsigned reset_in_progress:1;
 	unsigned have_langid:1;
 	unsigned authorized:1;
 	unsigned authenticated:1;
-- 
cgit 


From ec1bd37123c607ca6485beb4542a792a4db765aa Mon Sep 17 00:00:00 2001
From: Khalid Masum <khalid.masum.92@gmail.com>
Date: Thu, 18 Aug 2022 10:07:38 +0600
Subject: fscache: fix misdocumented parameter

This patch fixes two warnings generated by make docs. The functions
fscache_use_cookie and fscache_unuse_cookie, both have a parameter
named cookie. But they are documented with the name "object" with
unclear description. Which generates the warning when creating docs.

This commit will replace the currently misdocumented parameter names
with the correct ones while adding proper descriptions.

CC: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Khalid Masum <khalid.masum.92@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20220521142446.4746-1-khalid.masum.92@gmail.com/ # v1
Link: https://lore.kernel.org/r/20220818040738.12036-1-khalid.masum.92@gmail.com/ # v2
Link: https://lore.kernel.org/r/880d7d25753fb326ee17ac08005952112fcf9bdb.1657360984.git.mchehab@kernel.org/ # Mauro's version
---
 include/linux/fscache.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 720874e6ee94..36e5dd84cf59 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -258,7 +258,7 @@ struct fscache_cookie *fscache_acquire_cookie(struct fscache_volume *volume,
 
 /**
  * fscache_use_cookie - Request usage of cookie attached to an object
- * @object: Object description
+ * @cookie: The cookie representing the cache object
  * @will_modify: If cache is expected to be modified locally
  *
  * Request usage of the cookie attached to an object.  The caller should tell
@@ -274,7 +274,7 @@ static inline void fscache_use_cookie(struct fscache_cookie *cookie,
 
 /**
  * fscache_unuse_cookie - Cease usage of cookie attached to an object
- * @object: Object description
+ * @cookie: The cookie representing the cache object
  * @aux_data: Updated auxiliary data (or NULL)
  * @object_size: Revised size of the object (or NULL)
  *
-- 
cgit 


From 2555283eb40df89945557273121e9393ef9b542b Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Wed, 31 Aug 2022 19:06:00 +0200
Subject: mm/rmap: Fix anon_vma->degree ambiguity leading to double-reuse

anon_vma->degree tracks the combined number of child anon_vmas and VMAs
that use the anon_vma as their ->anon_vma.

anon_vma_clone() then assumes that for any anon_vma attached to
src->anon_vma_chain other than src->anon_vma, it is impossible for it to
be a leaf node of the VMA tree, meaning that for such VMAs ->degree is
elevated by 1 because of a child anon_vma, meaning that if ->degree
equals 1 there are no VMAs that use the anon_vma as their ->anon_vma.

This assumption is wrong because the ->degree optimization leads to leaf
nodes being abandoned on anon_vma_clone() - an existing anon_vma is
reused and no new parent-child relationship is created.  So it is
possible to reuse an anon_vma for one VMA while it is still tied to
another VMA.

This is an issue because is_mergeable_anon_vma() and its callers assume
that if two VMAs have the same ->anon_vma, the list of anon_vmas
attached to the VMAs is guaranteed to be the same.  When this assumption
is violated, vma_merge() can merge pages into a VMA that is not attached
to the corresponding anon_vma, leading to dangling page->mapping
pointers that will be dereferenced during rmap walks.

Fix it by separately tracking the number of child anon_vmas and the
number of VMAs using the anon_vma as their ->anon_vma.

Fixes: 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy")
Cc: stable@kernel.org
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bf80adca980b..b89b4b86951f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -41,12 +41,15 @@ struct anon_vma {
 	atomic_t refcount;
 
 	/*
-	 * Count of child anon_vmas and VMAs which points to this anon_vma.
+	 * Count of child anon_vmas. Equals to the count of all anon_vmas that
+	 * have ->parent pointing to this one, including itself.
 	 *
 	 * This counter is used for making decision about reusing anon_vma
 	 * instead of forking new one. See comments in function anon_vma_clone.
 	 */
-	unsigned degree;
+	unsigned long num_children;
+	/* Count of VMAs whose ->anon_vma pointer points to this object. */
+	unsigned long num_active_vmas;
 
 	struct anon_vma *parent;	/* Parent of this anon_vma */
 
-- 
cgit