diff options
author | Jakub Kicinski <kuba@kernel.org> | 2024-05-15 07:29:56 -0700 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2024-05-15 07:30:49 -0700 |
commit | 621cde16e49b3ecf7d59a8106a20aaebfb4a59a9 (patch) | |
tree | bb4b8e255e276950c8d9502998f83a7f4f5bf5e9 /mm | |
parent | 317a215d493230da361028ea8a4675de334bfa1a (diff) | |
parent | 1b294a1f35616977caddaddf3e9d28e576a1adbc (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Cross merge.
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 7 | ||||
-rw-r--r-- | mm/filemap.c | 60 | ||||
-rw-r--r-- | mm/mmap.c | 4 | ||||
-rw-r--r-- | mm/nommu.c | 7 | ||||
-rw-r--r-- | mm/page-writeback.c | 1 | ||||
-rw-r--r-- | mm/page_owner.c | 4 | ||||
-rw-r--r-- | mm/readahead.c | 4 | ||||
-rw-r--r-- | mm/shmem.c | 3 | ||||
-rw-r--r-- | mm/slab.h | 3 | ||||
-rw-r--r-- | mm/slab_common.c | 27 | ||||
-rw-r--r-- | mm/slub.c | 118 | ||||
-rw-r--r-- | mm/userfaultfd.c | 35 | ||||
-rw-r--r-- | mm/vmalloc.c | 2 |
13 files changed, 209 insertions, 66 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index b1448aa81e15..f30a18a0e37d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -333,10 +333,9 @@ config SHUFFLE_PAGE_ALLOCATOR While the randomization improves cache utilization it may negatively impact workloads on platforms without a cache. For - this reason, by default, the randomization is enabled only - after runtime detection of a direct-mapped memory-side-cache. - Otherwise, the randomization may be force enabled with the - 'page_alloc.shuffle' kernel command line parameter. + this reason, by default, the randomization is not enabled even + if SHUFFLE_PAGE_ALLOCATOR=y. The randomization may be force enabled + with the 'page_alloc.shuffle' kernel command line parameter. Say Y if unsure. diff --git a/mm/filemap.c b/mm/filemap.c index 30de18c4fd28..1d6b3a369077 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1540,7 +1540,7 @@ EXPORT_SYMBOL(folio_end_private_2); * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio. + * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { @@ -1553,8 +1553,8 @@ EXPORT_SYMBOL(folio_wait_private_2); * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a - * fatal signal is received by the calling task. + * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is + * received by the calling task. * * Return: * - 0 if successful. @@ -4134,6 +4134,60 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) } EXPORT_SYMBOL(filemap_release_folio); +/** + * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache + * @inode: The inode to flush + * @flush: Set to write back rather than simply invalidate. + * @start: First byte to in range. + * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start + * onwards. + * + * Invalidate all the folios on an inode that contribute to the specified + * range, possibly writing them back first. Whilst the operation is + * undertaken, the invalidate lock is held to prevent new folios from being + * installed. + */ +int filemap_invalidate_inode(struct inode *inode, bool flush, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t first = start >> PAGE_SHIFT; + pgoff_t last = end >> PAGE_SHIFT; + pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; + + if (!mapping || !mapping->nrpages || end < start) + goto out; + + /* Prevent new folios from being added to the inode. */ + filemap_invalidate_lock(mapping); + + if (!mapping->nrpages) + goto unlock; + + unmap_mapping_pages(mapping, first, nr, false); + + /* Write back the data if we're asked to. */ + if (flush) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = start, + .range_end = end, + }; + + filemap_fdatawrite_wbc(mapping, &wbc); + } + + /* Wait for writeback to complete on all folios and discard. */ + truncate_inode_pages_range(mapping, start, end); + +unlock: + filemap_invalidate_unlock(mapping); +out: + return filemap_check_errors(mapping); +} +EXPORT_SYMBOL_GPL(filemap_invalidate_inode); + #ifdef CONFIG_CACHESTAT_SYSCALL /** * filemap_cachestat() - compute the page cache statistics of a mapping diff --git a/mm/mmap.c b/mm/mmap.c index 6dbda99a47da..3490af70f259 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1294,7 +1294,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; - flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; + flags_mask = LEGACY_MAP_MASK; + if (file->f_op->fop_flags & FOP_MMAP_SYNC) + flags_mask |= MAP_SYNC; switch (flags & MAP_TYPE) { case MAP_SHARED: diff --git a/mm/nommu.c b/mm/nommu.c index 5ec8f44e7ce9..a34a0e376611 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -355,6 +355,13 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_pages); + int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e19b87049db..06fc89d981e8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2546,6 +2546,7 @@ done: folio_batch_release(&wbc->fbatch); return NULL; } +EXPORT_SYMBOL_GPL(writeback_iter); /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. diff --git a/mm/page_owner.c b/mm/page_owner.c index 742f432e5bf0..8eed0f3dc085 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -170,7 +170,7 @@ static void add_stack_record_to_list(struct stack_record *stack_record, /* Filter gfp_mask the same way stackdepot does, for consistency */ gfp_mask &= ~GFP_ZONEMASK; - gfp_mask &= (GFP_ATOMIC | GFP_KERNEL); + gfp_mask &= (GFP_ATOMIC | GFP_KERNEL | __GFP_NOLOCKDEP); gfp_mask |= __GFP_NOWARN; set_current_in_page_owner(); @@ -328,7 +328,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order, if (unlikely(!page_ext)) return; __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1, - current->pid, current->tgid, ts_nsec, + ts_nsec, current->pid, current->tgid, current->comm); page_ext_put(page_ext); inc_stack_record_count(handle, gfp_mask, 1 << order); diff --git a/mm/readahead.c b/mm/readahead.c index 130c0e7df99f..d55138e9560b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -490,6 +490,7 @@ void page_cache_ra_order(struct readahead_control *ractl, pgoff_t index = readahead_index(ractl); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; + unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); @@ -504,6 +505,8 @@ void page_cache_ra_order(struct readahead_control *ractl, new_order = min_t(unsigned int, new_order, ilog2(ra->size)); } + /* See comment in page_cache_ra_unbounded() */ + nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); while (index <= limit) { unsigned int order = new_order; @@ -527,6 +530,7 @@ void page_cache_ra_order(struct readahead_control *ractl, read_pages(ractl); filemap_invalidate_unlock_shared(mapping); + memalloc_nofs_restore(nofs); /* * If there were already pages in the page cache, then we may have diff --git a/mm/shmem.c b/mm/shmem.c index 94ab99b6b574..1f84a41aeb85 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3467,8 +3467,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return error; } - simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry); - error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry); + error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (error) return error; diff --git a/mm/slab.h b/mm/slab.h index d2bc9b191222..78e205b46e19 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -496,9 +496,6 @@ struct slabinfo { }; void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos); #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON diff --git a/mm/slab_common.c b/mm/slab_common.c index f5234672f03c..c37f8c41ffb0 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -916,22 +916,15 @@ void __init create_kmalloc_caches(void) * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined */ for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[type][i]) - new_kmalloc_cache(i, type); - - /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && i == 6 && - !kmalloc_caches[type][1]) - new_kmalloc_cache(1, type); - if (KMALLOC_MIN_SIZE <= 64 && i == 7 && - !kmalloc_caches[type][2]) - new_kmalloc_cache(2, type); - } + /* Caches that are NOT of the two-to-the-power-of size. */ + if (KMALLOC_MIN_SIZE <= 32) + new_kmalloc_cache(1, type); + if (KMALLOC_MIN_SIZE <= 64) + new_kmalloc_cache(2, type); + + /* Caches that are of the two-to-the-power-of size. */ + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + new_kmalloc_cache(i, type); } #ifdef CONFIG_RANDOM_KMALLOC_CACHES random_kmalloc_seed = get_random_u64(); @@ -1078,7 +1071,6 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m) sinfo.limit, sinfo.batchcount, sinfo.shared); seq_printf(m, " : slabdata %6lu %6lu %6lu", sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); - slabinfo_show_stats(m, s); seq_putc(m, '\n'); } @@ -1155,7 +1147,6 @@ static const struct proc_ops slabinfo_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = slabinfo_open, .proc_read = seq_read, - .proc_write = slabinfo_write, .proc_lseek = seq_lseek, .proc_release = seq_release, }; diff --git a/mm/slub.c b/mm/slub.c index 24f702afd458..4954999183d5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -624,11 +624,21 @@ static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); s->cpu_partial_slabs = nr_slabs; } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return s->cpu_partial_slabs; +} #else static inline void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return 0; +} #endif /* CONFIG_SLUB_CPU_PARTIAL */ /* @@ -2609,19 +2619,18 @@ static struct slab *get_partial_node(struct kmem_cache *s, if (!partial) { partial = slab; stat(s, ALLOC_FROM_PARTIAL); + + if ((slub_get_cpu_partial(s) == 0)) { + break; + } } else { put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); - partial_slabs++; - } -#ifdef CONFIG_SLUB_CPU_PARTIAL - if (!kmem_cache_has_cpu_partial(s) - || partial_slabs > s->cpu_partial_slabs / 2) - break; -#else - break; -#endif + if (++partial_slabs > slub_get_cpu_partial(s) / 2) { + break; + } + } } spin_unlock_irqrestore(&n->list_lock, flags); return partial; @@ -2704,7 +2713,7 @@ static struct slab *get_partial(struct kmem_cache *s, int node, searchnode = numa_mem_id(); slab = get_partial_node(s, get_node(s, searchnode), pc); - if (slab || node != NUMA_NO_NODE) + if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) return slab; return get_any_partial(s, pc); @@ -2802,7 +2811,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, struct slab new; struct slab old; - if (slab->freelist) { + if (READ_ONCE(slab->freelist)) { stat(s, DEACTIVATE_REMOTE_FREES); tail = DEACTIVATE_TO_TAIL; } @@ -3234,6 +3243,43 @@ static unsigned long count_partial(struct kmem_cache_node *n, #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ #ifdef CONFIG_SLUB_DEBUG +#define MAX_PARTIAL_TO_SCAN 10000 + +static unsigned long count_partial_free_approx(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct slab *slab; + + spin_lock_irqsave(&n->list_lock, flags); + if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) { + list_for_each_entry(slab, &n->partial, slab_list) + x += slab->objects - slab->inuse; + } else { + /* + * For a long list, approximate the total count of objects in + * it to meet the limit on the number of slabs to scan. + * Scan from both the list's head and tail for better accuracy. + */ + unsigned long scanned = 0; + + list_for_each_entry(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN / 2) + break; + } + list_for_each_entry_reverse(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN) + break; + } + x = mult_frac(x, n->nr_partial, scanned); + x = min(x, node_nr_objs(n)); + } + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { @@ -3260,7 +3306,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) unsigned long nr_objs; unsigned long nr_free; - nr_free = count_partial(n, count_free); + nr_free = count_partial_free_approx(n); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); @@ -3380,6 +3426,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, struct slab *slab; unsigned long flags; struct partial_context pc; + bool try_thisnode = true; stat(s, ALLOC_SLOWPATH); @@ -3506,6 +3553,21 @@ new_slab: new_objects: pc.flags = gfpflags; + /* + * When a preferred node is indicated but no __GFP_THISNODE + * + * 1) try to get a partial slab from target node only by having + * __GFP_THISNODE in pc.flags for get_partial() + * 2) if 1) failed, try to allocate a new slab from target node with + * GPF_NOWAIT | __GFP_THISNODE opportunistically + * 3) if 2) failed, retry with original gfpflags which will allow + * get_partial() try partial lists of other nodes before potentially + * allocating new page from other nodes + */ + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode)) + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + pc.orig_size = orig_size; slab = get_partial(s, node, &pc); if (slab) { @@ -3527,10 +3589,15 @@ new_objects: } slub_put_cpu_ptr(s->cpu_slab); - slab = new_slab(s, gfpflags, node); + slab = new_slab(s, pc.flags, node); c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!slab)) { + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode) { + try_thisnode = false; + goto new_objects; + } slab_out_of_memory(s, gfpflags, node); return NULL; } @@ -4232,7 +4299,7 @@ redo: c = raw_cpu_ptr(s->cpu_slab); tid = READ_ONCE(c->tid); - /* Same with comment on barrier() in slab_alloc_node() */ + /* Same with comment on barrier() in __slab_alloc_node() */ barrier(); if (unlikely(slab != c->slab)) { @@ -4853,7 +4920,6 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!n); #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); - init_tracking(kmem_cache_node, n); #endif n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); @@ -5066,9 +5132,7 @@ static int calculate_sizes(struct kmem_cache *s) if ((int)order < 0) return 0; - s->allocflags = 0; - if (order) - s->allocflags |= __GFP_COMP; + s->allocflags = __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) s->allocflags |= GFP_DMA; @@ -6042,7 +6106,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, else if (flags & SO_OBJECTS) WARN_ON_ONCE(1); else - x = slab->slabs; + x = data_race(slab->slabs); total += x; nodes[node] += x; } @@ -6247,7 +6311,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) - slabs += slab->slabs; + slabs += data_race(slab->slabs); } #endif @@ -6261,7 +6325,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) { - slabs = READ_ONCE(slab->slabs); + slabs = data_race(slab->slabs); objects = (slabs * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, " C%d=%d(%d)", cpu, objects, slabs); @@ -7095,7 +7159,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); - nr_free += count_partial(n, count_free); + nr_free += count_partial_free_approx(n); } sinfo->active_objs = nr_objs - nr_free; @@ -7105,14 +7169,4 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) sinfo->objects_per_slab = oo_objects(s->oo); sinfo->cache_order = oo_order(s->oo); } - -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) -{ -} - -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - return -EIO; -} #endif /* CONFIG_SLUB_DEBUG */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 3c3539c573e7..829f7b1089fc 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -316,6 +316,38 @@ out_release: goto out; } +static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + struct folio *folio; + int ret = -ENOMEM; + + folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); + if (!folio) + return ret; + + if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) + goto out_put; + + /* + * The memory barrier inside __folio_mark_uptodate makes sure that + * zeroing out the folio become visible before mapping the page + * using set_pte_at(). See do_anonymous_page(). + */ + __folio_mark_uptodate(folio); + + ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, + &folio->page, true, 0); + if (ret) + goto out_put; + + return 0; +out_put: + folio_put(folio); + return ret; +} + static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr) @@ -324,6 +356,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, spinlock_t *ptl; int ret; + if (mm_forbids_zeropage(dst_vma->vm_mm)) + return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); + _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); ret = -EAGAIN; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 68fa001648cc..125427cbdb87 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2710,7 +2710,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) * get_order(0) returns funny result. Just warn and terminate * early. */ - return NULL; + return ERR_PTR(-EINVAL); } order = get_order(size); |