From def8574308edbc3bca821fb965e429a2fe5f4971 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 26 Jan 2023 13:51:14 -0800
Subject: dmapool: add alloc/free performance test

Patch series "dmapool enhancements", v4.

Time spent in dma_pool alloc/free increases linearly with the number of
pages backing the pool.  We can reduce this to constant time with minor
changes to how free pages are tracked.


This patch (of 12):

Provide a module that allocates and frees many blocks of various sizes and
report how long it takes.  This is intended to provide a consistent way to
measure how changes to the dma_pool_alloc/free routines affect timing.

Link: https://lkml.kernel.org/r/20230126215125.4069751-1-kbusch@meta.com
Link: https://lkml.kernel.org/r/20230126215125.4069751-2-kbusch@meta.com
Signed-off-by: Keith Busch <kbusch@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Tony Battersby <tonyb@cybernetics.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/Kconfig | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'mm/Kconfig')

diff --git a/mm/Kconfig b/mm/Kconfig
index 4751031f3f05..ca98b2072df5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1100,6 +1100,15 @@ comment "GUP_TEST needs to have DEBUG_FS enabled"
 config GUP_GET_PXX_LOW_HIGH
 	bool
 
+config DMAPOOL_TEST
+	tristate "Enable a module to run time tests on dma_pool"
+	depends on HAS_DMA
+	help
+	  Provides a test module that will allocate and free many blocks of
+	  various sizes and report how long it takes. This is intended to
+	  provide a consistent way to measure how changes to the
+	  dma_pool_alloc/free routines affect performance.
+
 config ARCH_HAS_PTE_SPECIAL
 	bool
 
-- 
cgit 


From 23baf831a32c04f9a968812511540b1b3e648bf5 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Mar 2023 14:31:33 +0300
Subject: mm, treewide: redefine MAX_ORDER sanely

MAX_ORDER currently defined as number of orders page allocator supports:
user can ask buddy allocator for page order between 0 and MAX_ORDER-1.

This definition is counter-intuitive and lead to number of bugs all over
the kernel.

Change the definition of MAX_ORDER to be inclusive: the range of orders
user can ask from buddy allocator is 0..MAX_ORDER now.

[kirill@shutemov.name: fix min() warning]
  Link: https://lkml.kernel.org/r/20230315153800.32wib3n5rickolvh@box
[akpm@linux-foundation.org: fix another min_t warning]
[kirill@shutemov.name: fixups per Zi Yan]
  Link: https://lkml.kernel.org/r/20230316232144.b7ic4cif4kjiabws@box.shutemov.name
[akpm@linux-foundation.org: fix underlining in docs]
  Link: https://lore.kernel.org/oe-kbuild-all/202303191025.VRCTk6mP-lkp@intel.com/
Link: https://lkml.kernel.org/r/20230315113133.11326-11-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Michael Ellerman <mpe@ellerman.id.au>	[powerpc]
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/Kconfig | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'mm/Kconfig')

diff --git a/mm/Kconfig b/mm/Kconfig
index ca98b2072df5..969286ab14a1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -346,9 +346,9 @@ config SHUFFLE_PAGE_ALLOCATOR
 	  the presence of a memory-side-cache. There are also incidental
 	  security benefits as it reduces the predictability of page
 	  allocations to compliment SLAB_FREELIST_RANDOM, but the
-	  default granularity of shuffling on the "MAX_ORDER - 1" i.e,
-	  10th order of pages is selected based on cache utilization
-	  benefits on x86.
+	  default granularity of shuffling on the MAX_ORDER i.e, 10th
+	  order of pages is selected based on cache utilization benefits
+	  on x86.
 
 	  While the randomization improves cache utilization it may
 	  negatively impact workloads on platforms without a cache. For
@@ -666,8 +666,8 @@ config HUGETLB_PAGE_SIZE_VARIABLE
 	  HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
 	  on a platform.
 
-	  Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be
-	  clamped down to MAX_ORDER - 1.
+	  Note that the pageblock_order cannot exceed MAX_ORDER and will be
+	  clamped down to MAX_ORDER.
 
 config CONTIG_ALLOC
 	def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
-- 
cgit 


From 0b6cc04f3db3604c1485049bc9582523c2b44b75 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:08 -0800
Subject: mm: introduce CONFIG_PER_VMA_LOCK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Per-VMA locks", v4.

LWN article describing the feature: https://lwn.net/Articles/906852/

Per-vma locks idea that was discussed during SPF [1] discussion at LSF/MM
last year [2], which concluded with suggestion that “a reader/writer
semaphore could be put into the VMA itself; that would have the effect of
using the VMA as a sort of range lock.  There would still be contention at
the VMA level, but it would be an improvement.” This patchset implements
this suggested approach.

When handling page faults we lookup the VMA that contains the faulting
page under RCU protection and try to acquire its lock.  If that fails we
fall back to using mmap_lock, similar to how SPF handled this situation.

One notable way the implementation deviates from the proposal is the way
VMAs are read-locked.  During some of mm updates, multiple VMAs need to be
locked until the end of the update (e.g.  vma_merge, split_vma, etc).
Tracking all the locked VMAs, avoiding recursive locks, figuring out when
it's safe to unlock previously locked VMAs would make the code more
complex.  So, instead of the usual lock/unlock pattern, the proposed
solution marks a VMA as locked and provides an efficient way to:

1. Identify locked VMAs.

2. Unlock all locked VMAs in bulk.

We also postpone unlocking the locked VMAs until the end of the update,
when we do mmap_write_unlock.  Potentially this keeps a VMA locked for
longer than is absolutely necessary but it results in a big reduction of
code complexity.

Read-locking a VMA is done using two sequence numbers - one in the
vm_area_struct and one in the mm_struct.  VMA is considered read-locked
when these sequence numbers are equal.  To read-lock a VMA we set the
sequence number in vm_area_struct to be equal to the sequence number in
mm_struct.  To unlock all VMAs we increment mm_struct's seq number.  This
allows for an efficient way to track locked VMAs and to drop the locks on
all VMAs at the end of the update.

The patchset implements per-VMA locking only for anonymous pages which are
not in swap and avoids userfaultfs as their implementation is more
complex.  Additional support for file-back page faults, swapped and user
pages can be added incrementally.

Performance benchmarks show similar although slightly smaller benefits as
with SPF patchset (~75% of SPF benefits).  Still, with lower complexity
this approach might be more desirable.

Since RFC was posted in September 2022, two separate Google teams outside
of Android evaluated the patchset and confirmed positive results.  Here
are the known usecases when per-VMA locks show benefits:

Android:

Apps with high number of threads (~100) launch times improve by up to 20%.
Each thread mmaps several areas upon startup (Stack and Thread-local
storage (TLS), thread signal stack, indirect ref table), which requires
taking mmap_lock in write mode.  Page faults take mmap_lock in read mode.
During app launch, both thread creation and page faults establishing the
active workinget are happening in parallel and that causes lock contention
between mm writers and readers even if updates and page faults are
happening in different VMAs.  Per-vma locks prevent this contention by
providing more granular lock.

Google Fibers:

We have several dynamically sized thread pools that spawn new threads
under increased load and reduce their number when idling. For example,
Google's in-process scheduling/threading framework, UMCG/Fibers, is backed
by such a thread pool. When idling, only a small number of idle worker
threads are available; when a spike of incoming requests arrive, each
request is handled in its own "fiber", which is a work item posted onto a
UMCG worker thread; quite often these spikes lead to a number of new
threads spawning. Each new thread needs to allocate and register an RSEQ
section on its TLS, then register itself with the kernel as a UMCG worker
thread, and only after that it can be considered by the in-process
UMCG/Fiber scheduler as available to do useful work. In short, during an
incoming workload spike new threads have to be spawned, and they perform
several syscalls (RSEQ registration, UMCG worker registration, memory
allocations) before they can actually start doing useful work. Removing
any bottlenecks on this thread startup path will greatly improve our
services' latencies when faced with request/workload spikes.

At high scale, mmap_lock contention during thread creation and stack page
faults leads to user-visible multi-second serving latencies in a similar
pattern to Android app startup.  Per-VMA locking patchset has been run
successfully in limited experiments with user-facing production workloads.
In these experiments, we observed that the peak thread creation rate was
high enough that thread creation is no longer a bottleneck.

TCP zerocopy receive:

From the point of view of TCP zerocopy receive, the per-vma lock patch is
massively beneficial.

In today's implementation, a process with N threads where N - 1 are
performing zerocopy receive and 1 thread is performing madvise() with the
write lock taken (e.g.  needs to change vm_flags) will result in all N -1
receive threads blocking until the madvise is done.  Conversely, on a busy
process receiving a lot of data, an madvise operation that does need to
take the mmap lock in write mode will need to wait for all of the receives
to be done - a lose:lose proposition.  Per-VMA locking _removes_ by
definition this source of contention entirely.

There are other benefits for receive as well, chiefly a reduction in
cacheline bouncing across receiving threads for locking/unlocking the
single mmap lock.  On an RPC style synthetic workload with 4KB RPCs:

1a) The find+lock+unlock VMA path in the base case, without the
    per-vma lock patchset, is about 0.7% of cycles as measured by perf.

1b) mmap_read_lock + mmap_read_unlock in the base case is about 0.5%
    cycles overall - most of this is within the TCP read hotpath (a small
    fraction is 'other' usage in the system).

2a) The find+lock+unlock VMA path, with the per-vma patchset and a
    trivial patch written to take advantage of it in TCP, is about 0.4% of
    cycles (down from 0.7% above)

2b) mmap_read_lock + mmap_read_unlock in the per-vma patchset is <
    0.1% cycles and is out of the TCP read hotpath entirely (down from
    0.5% before, the remaining usage is the 'other' usage in the system).
    So, in addition to entirely removing an onerous source of contention,
    it also reduces the CPU cycles of TCP receive zerocopy by about 0.5%+
    (compared to overall cycles in perf) for the 'small' RPC scenario.

In https://lkml.kernel.org/r/87fsaqouyd.fsf_-_@stealth, Punit
demonstrated throughput improvements of as much as 188% from this
patchset.


This patch (of 25):

This configuration variable will be used to build the support for VMA
locking during page fault handling.

This is enabled on supported architectures with SMP and MMU set.

The architecture support is needed since the page fault handler is called
from the architecture's page faulting code which needs modifications to
handle faults under VMA lock.

Link: https://lkml.kernel.org/r/20230227173632.3292573-1-surenb@google.com
Link: https://lkml.kernel.org/r/20230227173632.3292573-10-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/Kconfig | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'mm/Kconfig')

diff --git a/mm/Kconfig b/mm/Kconfig
index 969286ab14a1..6ee3b48ed298 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1211,6 +1211,18 @@ config LRU_GEN_STATS
 	  This option has a per-memcg and per-node memory overhead.
 # }
 
+config ARCH_SUPPORTS_PER_VMA_LOCK
+       def_bool n
+
+config PER_VMA_LOCK
+	def_bool y
+	depends on ARCH_SUPPORTS_PER_VMA_LOCK && MMU && SMP
+	help
+	  Allow per-vma locking during page fault handling.
+
+	  This feature allows locking each virtual memory area separately when
+	  handling page faults instead of taking mmap_lock.
+
 source "mm/damon/Kconfig"
 
 endmenu
-- 
cgit 


From 0b376f1e0ff555435597fa16823ae0f30b2883e3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Wed, 12 Apr 2023 10:30:25 +0530
Subject: mm/hugetlb_vmemmap: rename ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP

Now we use ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP config option to
indicate devdax and hugetlb vmemmap optimization support.  Hence rename
that to a generic ARCH_WANT_OPTIMIZE_VMEMMAP

Link: https://lkml.kernel.org/r/20230412050025.84346-2-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Tarun Sahu <tsahu@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/Kconfig | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'mm/Kconfig')

diff --git a/mm/Kconfig b/mm/Kconfig
index 6ee3b48ed298..5ca8fcfae243 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -479,6 +479,12 @@ config SPARSEMEM_VMEMMAP
 	  SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
 	  pfn_to_page and page_to_pfn operations.  This is the most
 	  efficient option when sufficient kernel resources are available.
+#
+# Select this config option from the architecture Kconfig, if it is preferred
+# to enable the feature of HugeTLB/dev_dax vmemmap optimization.
+#
+config ARCH_WANT_OPTIMIZE_VMEMMAP
+	bool
 
 config HAVE_MEMBLOCK_PHYS_MAP
 	bool
-- 
cgit