summaryrefslogtreecommitdiff
path: root/mm/mmu_gather.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/mmu_gather.c')
-rw-r--r--mm/mmu_gather.c450
1 files changed, 346 insertions, 104 deletions
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index f2f03c655807..247e3f9db6c7 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -3,20 +3,26 @@
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
+#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
+#include <linux/rmap.h>
+#include <linux/pgalloc.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
-#ifdef HAVE_GENERIC_MMU_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
static bool tlb_next_batch(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
+ /* Limit batching if we have delayed rmaps pending */
+ if (tlb->delayed_rmap && tlb->active != &tlb->local)
+ return false;
+
batch = tlb->active;
if (batch->next) {
tlb->active = batch->next;
@@ -26,7 +32,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
return false;
- batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+ batch = (void *)__get_free_page(GFP_NOWAIT);
if (!batch)
return false;
@@ -41,66 +47,112 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
return true;
}
-void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end)
+#ifdef CONFIG_SMP
+static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
- tlb->mm = mm;
+ struct encoded_page **pages = batch->encoded_pages;
- /* Is it from 0 to ~0? */
- tlb->fullmm = !(start | (end+1));
- tlb->need_flush_all = 0;
- tlb->local.next = NULL;
- tlb->local.nr = 0;
- tlb->local.max = ARRAY_SIZE(tlb->__pages);
- tlb->active = &tlb->local;
- tlb->batch_count = 0;
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
- tlb->batch = NULL;
-#endif
- tlb->page_size = 0;
+ for (int i = 0; i < batch->nr; i++) {
+ struct encoded_page *enc = pages[i];
- __tlb_reset_range(tlb);
-}
+ if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) {
+ struct page *page = encoded_page_ptr(enc);
+ unsigned int nr_pages = 1;
-void tlb_flush_mmu_free(struct mmu_gather *tlb)
-{
- struct mmu_gather_batch *batch;
+ if (unlikely(encoded_page_flags(enc) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr_pages = encoded_nr_pages(pages[++i]);
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
- tlb_table_flush(tlb);
-#endif
- for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
- free_pages_and_swap_cache(batch->pages, batch->nr);
- batch->nr = 0;
+ folio_remove_rmap_ptes(page_folio(page), page, nr_pages,
+ vma);
+ }
}
- tlb->active = &tlb->local;
}
-void tlb_flush_mmu(struct mmu_gather *tlb)
+/**
+ * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
+ * @tlb: the current mmu_gather
+ * @vma: The memory area from which the pages are being removed.
+ *
+ * Note that because of how tlb_next_batch() above works, we will
+ * never start multiple new batches with pending delayed rmaps, so
+ * we only need to walk through the current active batch and the
+ * original local one.
+ */
+void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
- tlb_flush_mmu_tlbonly(tlb);
- tlb_flush_mmu_free(tlb);
+ if (!tlb->delayed_rmap)
+ return;
+
+ tlb_flush_rmap_batch(&tlb->local, vma);
+ if (tlb->active != &tlb->local)
+ tlb_flush_rmap_batch(tlb->active, vma);
+ tlb->delayed_rmap = 0;
}
+#endif
-/* tlb_finish_mmu
- * Called at the end of the shootdown operation to free up any resources
- * that were required.
+/*
+ * We might end up freeing a lot of pages. Reschedule on a regular
+ * basis to avoid soft lockups in configurations without full
+ * preemption enabled. The magic number of 512 folios seems to work.
*/
-void arch_tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end, bool force)
+#define MAX_NR_FOLIOS_PER_FREE 512
+
+static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
{
- struct mmu_gather_batch *batch, *next;
+ struct encoded_page **pages = batch->encoded_pages;
+ unsigned int nr, nr_pages;
+
+ while (batch->nr) {
+ if (!page_poisoning_enabled_static() && !want_init_on_free()) {
+ nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);
+
+ /*
+ * Make sure we cover page + nr_pages, and don't leave
+ * nr_pages behind when capping the number of entries.
+ */
+ if (unlikely(encoded_page_flags(pages[nr - 1]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr++;
+ } else {
+ /*
+ * With page poisoning and init_on_free, the time it
+ * takes to free memory grows proportionally with the
+ * actual memory size. Therefore, limit based on the
+ * actual memory size and not the number of involved
+ * folios.
+ */
+ for (nr = 0, nr_pages = 0;
+ nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
+ nr++) {
+ if (unlikely(encoded_page_flags(pages[nr]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr_pages += encoded_nr_pages(pages[++nr]);
+ else
+ nr_pages++;
+ }
+ }
- if (force) {
- __tlb_reset_range(tlb);
- __tlb_adjust_range(tlb, start, end - start);
+ free_pages_and_swap_cache(pages, nr);
+ pages += nr;
+ batch->nr -= nr;
+
+ cond_resched();
}
+}
- tlb_flush_mmu(tlb);
+static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
- /* keep the page table cache within bounds */
- check_pgt_cache();
+ for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
+ __tlb_batch_free_encoded_pages(batch);
+ tlb->active = &tlb->local;
+}
+
+static void tlb_batch_list_free(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch, *next;
for (batch = tlb->local.next; batch; batch = next) {
next = batch->next;
@@ -109,97 +161,200 @@ void arch_tlb_finish_mmu(struct mmu_gather *tlb,
tlb->local.next = NULL;
}
-/* __tlb_remove_page
- * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
- * handling the additional races in SMP caused by other CPUs caching valid
- * mappings in their TLBs. Returns the number of free page slots left.
- * When out of page slots we must call tlb_flush_mmu().
- *returns true if the caller should flush.
- */
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
+static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
+ struct page *page, unsigned int nr_pages, bool delay_rmap,
+ int page_size)
{
+ int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
struct mmu_gather_batch *batch;
VM_BUG_ON(!tlb->end);
+
+#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
VM_WARN_ON(tlb->page_size != page_size);
+ VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
+ VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
+#endif
batch = tlb->active;
/*
* Add the page and check if we are full. If so
* force a flush.
*/
- batch->pages[batch->nr++] = page;
- if (batch->nr == batch->max) {
+ if (likely(nr_pages == 1)) {
+ batch->encoded_pages[batch->nr++] = encode_page(page, flags);
+ } else {
+ flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;
+ batch->encoded_pages[batch->nr++] = encode_page(page, flags);
+ batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages);
+ }
+ /*
+ * Make sure that we can always add another "page" + "nr_pages",
+ * requiring two entries instead of only a single one.
+ */
+ if (batch->nr >= batch->max - 1) {
if (!tlb_next_batch(tlb))
return true;
batch = tlb->active;
}
- VM_BUG_ON_PAGE(batch->nr > batch->max, page);
+ VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);
return false;
}
-#endif /* HAVE_GENERIC_MMU_GATHER */
+bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
+ unsigned int nr_pages, bool delay_rmap)
+{
+ return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
+ PAGE_SIZE);
+}
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
+ bool delay_rmap, int page_size)
+{
+ return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size);
+}
-/*
- * See the comment near struct mmu_table_batch.
- */
+#endif /* !CONFIG_MMU_GATHER_NO_GATHER */
-/*
- * If we want tlb_remove_table() to imply TLB invalidates.
- */
-static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+#ifdef CONFIG_MMU_GATHER_TABLE_FREE
+
+static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
-#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
- /*
- * Invalidate page-table caches used by hardware walkers. Then we still
- * need to RCU-sched wait while freeing the pages because software
- * walkers can still be in-flight.
- */
- tlb_flush_mmu_tlbonly(tlb);
-#endif
+ int i;
+
+ for (i = 0; i < batch->nr; i++)
+ __tlb_remove_table(batch->tables[i]);
+
+ free_page((unsigned long)batch);
}
+#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
+
+/*
+ * Semi RCU freeing of the page directories.
+ *
+ * This is needed by some architectures to implement software pagetable walkers.
+ *
+ * gup_fast() and other software pagetable walkers do a lockless page-table
+ * walk and therefore need some synchronization with the freeing of the page
+ * directories. The chosen means to accomplish that is by disabling IRQs over
+ * the walk.
+ *
+ * Architectures that use IPIs to flush TLBs will then automagically DTRT,
+ * since we unlink the page, flush TLBs, free the page. Since the disabling of
+ * IRQs delays the completion of the TLB flush we can never observe an already
+ * freed page.
+ *
+ * Not all systems IPI every CPU for this purpose:
+ *
+ * - Some architectures have HW support for cross-CPU synchronisation of TLB
+ * flushes, so there's no IPI at all.
+ *
+ * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate
+ * with the hypervisor to defer flushing on preempted vCPUs.
+ *
+ * Such systems need to delay the freeing by some other means, this is that
+ * means.
+ *
+ * What we do is batch the freed directory pages (tables) and RCU free them.
+ * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
+ * holds off grace periods.
+ *
+ * However, in order to batch these pages we need to allocate storage. This
+ * allocation is deep inside the MM code and can thus easily fail under memory
+ * pressure. To guarantee progress we fall back to single table freeing; see
+ * the implementation of tlb_remove_table_one().
+ *
+ */
+
static void tlb_remove_table_smp_sync(void *arg)
{
/* Simply deliver the interrupt */
}
-static void tlb_remove_table_one(void *table)
+void tlb_remove_table_sync_one(void)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
* assumed to be actually RCU-freed.
*
* It is however sufficient for software page-table walkers that rely on
- * IRQ disabling. See the comment near struct mmu_table_batch.
+ * IRQ disabling.
*/
smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
- __tlb_remove_table(table);
}
static void tlb_remove_table_rcu(struct rcu_head *head)
{
- struct mmu_table_batch *batch;
- int i;
+ __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
+}
- batch = container_of(head, struct mmu_table_batch, rcu);
+static void tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ call_rcu(&batch->rcu, tlb_remove_table_rcu);
+}
- for (i = 0; i < batch->nr; i++)
- __tlb_remove_table(batch->tables[i]);
+#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
- free_page((unsigned long)batch);
+static void tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ __tlb_remove_table_free(batch);
+}
+
+#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
+
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+ if (tlb_needs_table_invalidate()) {
+ /*
+ * Invalidate page-table caches used by hardware walkers. Then
+ * we still need to RCU-sched wait while freeing the pages
+ * because software walkers can still be in-flight.
+ */
+ tlb_flush_mmu_tlbonly(tlb);
+ }
+}
+
+#ifdef CONFIG_PT_RECLAIM
+static inline void __tlb_remove_table_one_rcu(struct rcu_head *head)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+ __tlb_remove_table(ptdesc);
+}
+
+static inline void __tlb_remove_table_one(void *table)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = table;
+ call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu);
+}
+#else
+static inline void __tlb_remove_table_one(void *table)
+{
+ tlb_remove_table_sync_one();
+ __tlb_remove_table(table);
}
+#endif /* CONFIG_PT_RECLAIM */
-void tlb_table_flush(struct mmu_gather *tlb)
+static void tlb_remove_table_one(void *table)
+{
+ __tlb_remove_table_one(table);
+}
+
+static void tlb_table_flush(struct mmu_gather *tlb)
{
struct mmu_table_batch **batch = &tlb->batch;
if (*batch) {
tlb_table_invalidate(tlb);
- call_rcu(&(*batch)->rcu, tlb_remove_table_rcu);
+ tlb_remove_table_free(*batch);
*batch = NULL;
}
}
@@ -209,7 +364,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
struct mmu_table_batch **batch = &tlb->batch;
if (*batch == NULL) {
- *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
if (*batch == NULL) {
tlb_table_invalidate(tlb);
tlb_remove_table_one(table);
@@ -223,39 +378,126 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
tlb_table_flush(tlb);
}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+static inline void tlb_table_init(struct mmu_gather *tlb)
+{
+ tlb->batch = NULL;
+}
+
+#else /* !CONFIG_MMU_GATHER_TABLE_FREE */
+
+static inline void tlb_table_flush(struct mmu_gather *tlb) { }
+static inline void tlb_table_init(struct mmu_gather *tlb) { }
+
+#endif /* CONFIG_MMU_GATHER_TABLE_FREE */
+
+static void tlb_flush_mmu_free(struct mmu_gather *tlb)
+{
+ tlb_table_flush(tlb);
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
+ tlb_batch_pages_flush(tlb);
+#endif
+}
+
+void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+ tlb_flush_mmu_tlbonly(tlb);
+ tlb_flush_mmu_free(tlb);
+}
+
+static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ bool fullmm)
+{
+ tlb->mm = mm;
+ tlb->fullmm = fullmm;
+
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
+ tlb->need_flush_all = 0;
+ tlb->local.next = NULL;
+ tlb->local.nr = 0;
+ tlb->local.max = ARRAY_SIZE(tlb->__pages);
+ tlb->active = &tlb->local;
+ tlb->batch_count = 0;
+#endif
+ tlb->delayed_rmap = 0;
+
+ tlb_table_init(tlb);
+#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
+ tlb->page_size = 0;
+#endif
+ tlb->vma_pfn = 0;
+
+ __tlb_reset_range(tlb);
+ inc_tlb_flush_pending(tlb->mm);
+}
/**
* tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
* @tlb: the mmu_gather structure to initialize
* @mm: the mm_struct of the target address space
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
*
* Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @start and @end are set to 0 and -1
- * respectively when @mm is without users and we're going to destroy
- * the full address space (exit/execve).
+ * tear-down from @mm.
*/
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end)
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
- arch_tlb_gather_mmu(tlb, mm, start, end);
- inc_tlb_flush_pending(tlb->mm);
+ __tlb_gather_mmu(tlb, mm, false);
}
-void tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end)
+/**
+ * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ *
+ * In this case, @mm is without users and we're going to destroy the
+ * full address space (exit/execve).
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm.
+ */
+void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
+{
+ __tlb_gather_mmu(tlb, mm, true);
+}
+
+/**
+ * tlb_finish_mmu - finish an mmu_gather structure
+ * @tlb: the mmu_gather structure to finish
+ *
+ * Called at the end of the shootdown operation to free up any resources that
+ * were required.
+ */
+void tlb_finish_mmu(struct mmu_gather *tlb)
{
/*
* If there are parallel threads doing PTE changes on the same range
- * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
- * flush by batching, a thread has stable TLB entry can fail to flush
- * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
- * forcefully if we detect parallel PTE batching threads.
+ * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
+ * flush by batching, one thread may end up seeing inconsistent PTEs
+ * and result in having stale TLB entries. So flush TLB forcefully
+ * if we detect parallel PTE batching threads.
+ *
+ * However, some syscalls, e.g. munmap(), may free page tables; this
+ * requires force-flushing everything in the given range. Otherwise this
+ * may result in stale TLB entries on some architectures, e.g. aarch64,
+ * that can specify which TLB level to flush.
*/
- bool force = mm_tlb_flush_nested(tlb->mm);
+ if (mm_tlb_flush_nested(tlb->mm)) {
+ /*
+ * aarch64 yields better performance with fullmm by
+ * avoiding multiple CPUs spamming TLBI messages at the
+ * same time.
+ *
+ * On x86, non-fullmm doesn't yield a significant difference
+ * compared to fullmm.
+ */
+ tlb->fullmm = 1;
+ __tlb_reset_range(tlb);
+ tlb->freed_tables = 1;
+ }
- arch_tlb_finish_mmu(tlb, start, end, force);
+ tlb_flush_mmu(tlb);
+
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
+ tlb_batch_list_free(tlb);
+#endif
dec_tlb_flush_pending(tlb->mm);
}