summaryrefslogtreecommitdiff
path: root/mm/shmem.c
diff options
context:
space:
mode:
authorHugh Dickins <hughd@google.com>2023-10-19 13:39:08 -0700
committerAndrew Morton <akpm@linux-foundation.org>2023-10-25 16:47:16 -0700
commitddc1a5cbc05dc62743a2f409b96faa5cf95ba064 (patch)
tree6d3b7083fd984acd1391818a32ef9a7dd46049e7 /mm/shmem.c
parent23e4883248f0472d806c8b3422ba6257e67bf1a5 (diff)
mempolicy: alloc_pages_mpol() for NUMA policy without vma
Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/shmem.c')
-rw-r--r--mm/shmem.c92
1 files changed, 51 insertions, 41 deletions
diff --git a/mm/shmem.c b/mm/shmem.c
index bcbe9dbb9f5f..a314a25aea8c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1544,38 +1544,20 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
return NULL;
}
#endif /* CONFIG_NUMA && CONFIG_TMPFS */
-#ifndef CONFIG_NUMA
-#define vm_policy vm_private_data
-#endif
-
-static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
- struct shmem_inode_info *info, pgoff_t index)
-{
- /* Create a pseudo vma that just contains the policy */
- vma_init(vma, NULL);
- /* Bias interleave by inode number to distribute better across nodes */
- vma->vm_pgoff = index + info->vfs_inode.i_ino;
- vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
-}
-static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
-{
- /* Drop reference taken by mpol_shared_policy_lookup() */
- mpol_cond_put(vma->vm_policy);
-}
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ pgoff_t index, unsigned int order, pgoff_t *ilx);
-static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct vm_area_struct pvma;
+ struct mempolicy *mpol;
+ pgoff_t ilx;
struct page *page;
- struct vm_fault vmf = {
- .vma = &pvma,
- };
- shmem_pseudo_vma_init(&pvma, info, index);
- page = swap_cluster_readahead(swap, gfp, &vmf);
- shmem_pseudo_vma_destroy(&pvma);
+ mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+ page = swap_cluster_readahead(swap, gfp, mpol, ilx);
+ mpol_cond_put(mpol);
if (!page)
return NULL;
@@ -1609,27 +1591,29 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct vm_area_struct pvma;
- struct folio *folio;
+ struct mempolicy *mpol;
+ pgoff_t ilx;
+ struct page *page;
- shmem_pseudo_vma_init(&pvma, info, index);
- folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
- shmem_pseudo_vma_destroy(&pvma);
+ mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx);
+ page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id());
+ mpol_cond_put(mpol);
- return folio;
+ return page_rmappable_folio(page);
}
static struct folio *shmem_alloc_folio(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct vm_area_struct pvma;
- struct folio *folio;
+ struct mempolicy *mpol;
+ pgoff_t ilx;
+ struct page *page;
- shmem_pseudo_vma_init(&pvma, info, index);
- folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
- shmem_pseudo_vma_destroy(&pvma);
+ mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+ page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id());
+ mpol_cond_put(mpol);
- return folio;
+ return (struct folio *)page;
}
static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
@@ -1883,7 +1867,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
/* Here we actually start the io */
- folio = shmem_swapin(swap, gfp, info, index);
+ folio = shmem_swapin_cluster(swap, gfp, info, index);
if (!folio) {
error = -ENOMEM;
goto failed;
@@ -2334,15 +2318,41 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
}
static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
- unsigned long addr)
+ unsigned long addr, pgoff_t *ilx)
{
struct inode *inode = file_inode(vma->vm_file);
pgoff_t index;
+ /*
+ * Bias interleave by inode number to distribute better across nodes;
+ * but this interface is independent of which page order is used, so
+ * supplies only that bias, letting caller apply the offset (adjusted
+ * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
+ */
+ *ilx = inode->i_ino;
index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}
-#endif
+
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ pgoff_t index, unsigned int order, pgoff_t *ilx)
+{
+ struct mempolicy *mpol;
+
+ /* Bias interleave by inode number to distribute better across nodes */
+ *ilx = info->vfs_inode.i_ino + (index >> order);
+
+ mpol = mpol_shared_policy_lookup(&info->policy, index);
+ return mpol ? mpol : get_task_policy(current);
+}
+#else
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ pgoff_t index, unsigned int order, pgoff_t *ilx)
+{
+ *ilx = 0;
+ return NULL;
+}
+#endif /* CONFIG_NUMA */
int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{