Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 6
-rw-r--r-- | mm/Makefile | 1
-rw-r--r-- | mm/damon/core.c | 5
-rw-r--r-- | mm/fadvise.c | 10
-rw-r--r-- | mm/filemap.c | 19
-rw-r--r-- | mm/gup.c | 24
-rw-r--r-- | mm/memcontrol-v1.c | 44
-rw-r--r-- | mm/migrate.c | 4
-rw-r--r-- | mm/page-writeback.c | 4
-rw-r--r-- | mm/page_alloc.c | 136
-rw-r--r-- | mm/page_frag_cache.c | 171
-rw-r--r-- | mm/readahead.c | 17
-rw-r--r-- | mm/shmem.c | 267
-rw-r--r-- | mm/slab_common.c | 19
-rw-r--r-- | mm/truncate.c | 16
15 files changed, 528 insertions, 215 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 33fa51d608dc..84000b016808 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1295,6 +1295,12 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. +config ARCH_HAS_USER_SHADOW_STACK + bool + help + The architecture has hardware support for userspace shadow call + stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index d5639b036166..dba52bb0da8a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -65,6 +65,7 @@ page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-y += page-alloc.o +obj-y += page_frag_cache.o obj-y += init-mm.o obj-y += memblock.o obj-y += $(memory-hotplug-y) diff --git a/mm/damon/core.c b/mm/damon/core.c index 511c3f61ab44..8b8e2933dcd4 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1906,11 +1906,10 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme) static void kdamond_usleep(unsigned long usecs) { - /* See Documentation/timers/timers-howto.rst for the thresholds */ - if (usecs > 20 * USEC_PER_MSEC) + if (usecs >= USLEEP_RANGE_UPPER_BOUND) schedule_timeout_idle(usecs_to_jiffies(usecs)); else - usleep_idle_range(usecs, usecs + 1); + usleep_range_idle(usecs, usecs + 1); } /* Returns negative error code if it's not activated but should return */ diff --git a/mm/fadvise.c b/mm/fadvise.c index 532dee205c6e..588fe76c5a14 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -190,16 +190,12 @@ EXPORT_SYMBOL(vfs_fadvise); int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { - struct fd f = fdget(fd); - int ret; + CLASS(fd, f)(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - ret = vfs_fadvise(fd_file(f), offset, len, advice); - - fdput(f); - return ret; + return vfs_fadvise(fd_file(f), offset, len, advice); } SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) diff --git a/mm/filemap.c b/mm/filemap.c index 56fa431c52af..196779e8e396 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2620,6 +2620,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; + if (unlikely(iocb->ki_pos < 0)) + return -EINVAL; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; if (unlikely(!iov_iter_count(iter))) @@ -4421,31 +4423,25 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd, struct cachestat_range __user *, cstat_range, struct cachestat __user *, cstat, unsigned int, flags) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct address_space *mapping; struct cachestat_range csr; struct cachestat cs; pgoff_t first_index, last_index; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; if (copy_from_user(&csr, cstat_range, - sizeof(struct cachestat_range))) { - fdput(f); + sizeof(struct cachestat_range))) return -EFAULT; - } /* hugetlbfs is not supported */ - if (is_file_hugepages(fd_file(f))) { - fdput(f); + if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; - } - if (flags != 0) { - fdput(f); + if (flags != 0) return -EINVAL; - } first_index = csr.off >> PAGE_SHIFT; last_index = @@ -4453,7 +4449,6 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd, memset(&cs, 0, sizeof(struct cachestat)); mapping = fd_file(f)->f_mapping; filemap_cachestat(mapping, first_index, last_index, &cs); - fdput(f); if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) return -EFAULT; @@ -3760,3 +3760,27 @@ err: 
return ret; } EXPORT_SYMBOL_GPL(memfd_pin_folios); + +/** + * folio_add_pins() - add pins to an already-pinned folio + * @folio: the folio to add more pins to + * @pins: number of pins to add + * + * Try to add more pins to an already-pinned folio. The semantics + * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot + * be changed. + * + * This function is helpful when having obtained a pin on a large folio + * using memfd_pin_folios(), but wanting to logically unpin parts + * (e.g., individual pages) of the folio later, for example, using + * unpin_user_page_range_dirty_lock(). + * + * This is not the right interface to initially pin a folio. + */ +int folio_add_pins(struct folio *folio, unsigned int pins) +{ + VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio)); + + return try_grab_folio(folio, pins, FOLL_PIN); +} +EXPORT_SYMBOL_GPL(folio_add_pins); diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index f8744f5630bb..86527d8fa7b9 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -1936,8 +1936,6 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, struct mem_cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; - struct fd efile; - struct fd cfile; struct dentry *cdentry; const char *name; char *endp; @@ -1961,6 +1959,12 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, else return -EINVAL; + CLASS(fd, efile)(efd); + if (fd_empty(efile)) + return -EBADF; + + CLASS(fd, cfile)(cfd); + event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; @@ -1971,20 +1975,13 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, init_waitqueue_func_entry(&event->wait, memcg_event_wake); INIT_WORK(&event->remove, memcg_event_remove); - efile = fdget(efd); - if (!fd_file(efile)) { - ret = -EBADF; - goto out_kfree; - } - event->eventfd = eventfd_ctx_fileget(fd_file(efile)); if (IS_ERR(event->eventfd)) { ret = PTR_ERR(event->eventfd); - goto out_put_efile; + goto out_kfree; } - cfile = fdget(cfd); - if (!fd_file(cfile)) { + if (fd_empty(cfile)) { ret = -EBADF; goto out_put_eventfd; } @@ -1993,7 +1990,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, /* AV: shouldn't we check that it's been opened for read instead? */ ret = file_permission(fd_file(cfile), MAY_READ); if (ret < 0) - goto out_put_cfile; + goto out_put_eventfd; /* * The control file must be a regular cgroup1 file. 
As a regular cgroup @@ -2002,7 +1999,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, cdentry = fd_file(cfile)->f_path.dentry; if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { ret = -EINVAL; - goto out_put_cfile; + goto out_put_eventfd; } /* @@ -2035,7 +2032,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, event->unregister_event = memsw_cgroup_usage_unregister_event; } else { ret = -EINVAL; - goto out_put_cfile; + goto out_put_eventfd; } /* @@ -2047,11 +2044,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) - goto out_put_cfile; - if (cfile_css != css) { - css_put(cfile_css); - goto out_put_cfile; - } + goto out_put_eventfd; + if (cfile_css != css) + goto out_put_css; ret = event->register_event(memcg, event->eventfd, buf); if (ret) @@ -2062,23 +2057,14 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, spin_lock_irq(&memcg->event_list_lock); list_add(&event->list, &memcg->event_list); spin_unlock_irq(&memcg->event_list_lock); - - fdput(cfile); - fdput(efile); - return nbytes; out_put_css: - css_put(css); -out_put_cfile: - fdput(cfile); + css_put(cfile_css); out_put_eventfd: eventfd_ctx_put(event->eventfd); -out_put_efile: - fdput(efile); out_kfree: kfree(event); - return ret; } diff --git a/mm/migrate.c b/mm/migrate.c index dfa24e41e8f9..47a8d102ae7a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -473,7 +473,7 @@ static int folio_expected_refs(struct address_space *mapping, * The number of remaining references must be: * 1 for anonymous folios without a mapping * 2 for folios with a mapping - * 3 for folios with a mapping and PagePrivate/PagePrivate2 set. + * 3 for folios with a mapping and the private flag set. */ static int __folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int expected_count) @@ -787,7 +787,7 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst, * @mode: How to migrate the page. * * Common logic to directly migrate a single LRU folio suitable for - * folios that do not use PagePrivate/PagePrivate2. + * folios that do not have private data. * * Folios are locked upon entry and exit. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fcd4c1439cb9..72a5d8836425 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -54,7 +54,7 @@ #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) /* - * Estimate write bandwidth at 200ms intervals. + * Estimate write bandwidth or update dirty limit at 200ms intervals. */ #define BANDWIDTH_INTERVAL max(HZ/5, 1) @@ -586,7 +586,7 @@ static void wb_domain_writeout_add(struct wb_domain *dom, /* First event after period switching was turned off? */ if (unlikely(!dom->period_time)) { /* - * We can race with other __bdi_writeout_inc calls here but + * We can race with other wb_domain_writeout_add calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b6958333054d..119a3a52f96e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4854,142 +4854,6 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); -/* - * Page Fragment: - * An arbitrary-length arbitrary-offset area of memory which resides - * within a 0 or higher order page. 
Multiple fragments within that page - * are individually refcounted, in the page's reference counter. - * - * The page_frag functions below provide a simple allocation framework for - * page fragments. This is used by the network stack and network device - * drivers to provide a backing region of memory for use as either an - * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. - */ -static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, - gfp_t gfp_mask) -{ - struct page *page = NULL; - gfp_t gfp = gfp_mask; - -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | - __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; - page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, - PAGE_FRAG_CACHE_MAX_ORDER); - nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; -#endif - if (unlikely(!page)) - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - - nc->va = page ? page_address(page) : NULL; - - return page; -} - -void page_frag_cache_drain(struct page_frag_cache *nc) -{ - if (!nc->va) - return; - - __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias); - nc->va = NULL; -} -EXPORT_SYMBOL(page_frag_cache_drain); - -void __page_frag_cache_drain(struct page *page, unsigned int count) -{ - VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); - - if (page_ref_sub_and_test(page, count)) - free_unref_page(page, compound_order(page)); -} -EXPORT_SYMBOL(__page_frag_cache_drain); - -void *__page_frag_alloc_align(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask, - unsigned int align_mask) -{ - unsigned int size = PAGE_SIZE; - struct page *page; - int offset; - - if (unlikely(!nc->va)) { -refill: - page = __page_frag_cache_refill(nc, gfp_mask); - if (!page) - return NULL; - -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - /* if size can vary use size else just use PAGE_SIZE */ - size = nc->size; -#endif - /* Even if we own the page, we do not use atomic_set(). - * This would break get_page_unless_zero() users. - */ - page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); - - /* reset page count bias and offset to start of new frag */ - nc->pfmemalloc = page_is_pfmemalloc(page); - nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; - nc->offset = size; - } - - offset = nc->offset - fragsz; - if (unlikely(offset < 0)) { - page = virt_to_page(nc->va); - - if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) - goto refill; - - if (unlikely(nc->pfmemalloc)) { - free_unref_page(page, compound_order(page)); - goto refill; - } - -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - /* if size can vary use size else just use PAGE_SIZE */ - size = nc->size; -#endif - /* OK, page count is 0, we can safely set it */ - set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); - - /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; - offset = size - fragsz; - if (unlikely(offset < 0)) { - /* - * The caller is trying to allocate a fragment - * with fragsz > PAGE_SIZE but the cache isn't big - * enough to satisfy the request, this may - * happen in low memory conditions. - * We don't release the cache page because - * it could make memory pressure worse - * so we simply return NULL here. - */ - return NULL; - } - } - - nc->pagecnt_bias--; - offset &= align_mask; - nc->offset = offset; - - return nc->va + offset; -} -EXPORT_SYMBOL(__page_frag_alloc_align); - -/* - * Frees a page fragment allocated out of either a compound or order 0 page. 
- */ -void page_frag_free(void *addr) -{ - struct page *page = virt_to_head_page(addr); - - if (unlikely(put_page_testzero(page))) - free_unref_page(page, compound_order(page)); -} -EXPORT_SYMBOL(page_frag_free); - static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c new file mode 100644 index 000000000000..3f7a203d35c6 --- /dev/null +++ b/mm/page_frag_cache.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Page fragment allocator + * + * Page Fragment: + * An arbitrary-length arbitrary-offset area of memory which resides within a + * 0 or higher order page. Multiple fragments within that page are + * individually refcounted, in the page's reference counter. + * + * The page_frag functions provide a simple allocation framework for page + * fragments. This is used by the network stack and network device drivers to + * provide a backing region of memory for use as either an sk_buff->head, or to + * be used in the "frags" portion of skb_shared_info. + */ + +#include <linux/build_bug.h> +#include <linux/export.h> +#include <linux/gfp_types.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/page_frag_cache.h> +#include "internal.h" + +static unsigned long encoded_page_create(struct page *page, unsigned int order, + bool pfmemalloc) +{ + BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK); + BUILD_BUG_ON(PAGE_FRAG_CACHE_PFMEMALLOC_BIT >= PAGE_SIZE); + + return (unsigned long)page_address(page) | + (order & PAGE_FRAG_CACHE_ORDER_MASK) | + ((unsigned long)pfmemalloc * PAGE_FRAG_CACHE_PFMEMALLOC_BIT); +} + +static unsigned long encoded_page_decode_order(unsigned long encoded_page) +{ + return encoded_page & PAGE_FRAG_CACHE_ORDER_MASK; +} + +static void *encoded_page_decode_virt(unsigned long encoded_page) +{ + return (void *)(encoded_page & PAGE_MASK); +} + +static struct page *encoded_page_decode_page(unsigned long encoded_page) +{ + return virt_to_page((void *)encoded_page); +} + +static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, + gfp_t gfp_mask) +{ + unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER; + struct page *page = NULL; + gfp_t gfp = gfp_mask; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | + __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; + page = __alloc_pages(gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER, + numa_mem_id(), NULL); +#endif + if (unlikely(!page)) { + page = __alloc_pages(gfp, 0, numa_mem_id(), NULL); + order = 0; + } + + nc->encoded_page = page ? 
+ encoded_page_create(page, order, page_is_pfmemalloc(page)) : 0; + + return page; +} + +void page_frag_cache_drain(struct page_frag_cache *nc) +{ + if (!nc->encoded_page) + return; + + __page_frag_cache_drain(encoded_page_decode_page(nc->encoded_page), + nc->pagecnt_bias); + nc->encoded_page = 0; +} +EXPORT_SYMBOL(page_frag_cache_drain); + +void __page_frag_cache_drain(struct page *page, unsigned int count) +{ + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); + + if (page_ref_sub_and_test(page, count)) + free_unref_page(page, compound_order(page)); +} +EXPORT_SYMBOL(__page_frag_cache_drain); + +void *__page_frag_alloc_align(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask) +{ + unsigned long encoded_page = nc->encoded_page; + unsigned int size, offset; + struct page *page; + + if (unlikely(!encoded_page)) { +refill: + page = __page_frag_cache_refill(nc, gfp_mask); + if (!page) + return NULL; + + encoded_page = nc->encoded_page; + + /* Even if we own the page, we do not use atomic_set(). + * This would break get_page_unless_zero() users. + */ + page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; + nc->offset = 0; + } + + size = PAGE_SIZE << encoded_page_decode_order(encoded_page); + offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask); + if (unlikely(offset + fragsz > size)) { + if (unlikely(fragsz > PAGE_SIZE)) { + /* + * The caller is trying to allocate a fragment + * with fragsz > PAGE_SIZE but the cache isn't big + * enough to satisfy the request, this may + * happen in low memory conditions. + * We don't release the cache page because + * it could make memory pressure worse + * so we simply return NULL here. + */ + return NULL; + } + + page = encoded_page_decode_page(encoded_page); + + if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) + goto refill; + + if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) { + free_unref_page(page, + encoded_page_decode_order(encoded_page)); + goto refill; + } + + /* OK, page count is 0, we can safely set it */ + set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; + offset = 0; + } + + nc->pagecnt_bias--; + nc->offset = offset + fragsz; + + return encoded_page_decode_virt(encoded_page) + offset; +} +EXPORT_SYMBOL(__page_frag_alloc_align); + +/* + * Frees a page fragment allocated out of either a compound or order 0 page. + */ +void page_frag_free(void *addr) +{ + struct page *page = virt_to_head_page(addr); + + if (unlikely(put_page_testzero(page))) + free_unref_page(page, compound_order(page)); +} +EXPORT_SYMBOL(page_frag_free); diff --git a/mm/readahead.c b/mm/readahead.c index 3dc6c7a128dd..9a807727d809 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -673,29 +673,22 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { - ssize_t ret; - struct fd f; + CLASS(fd, f)(fd); - ret = -EBADF; - f = fdget(fd); - if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ)) - goto out; + if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) + return -EBADF; /* * The readahead() syscall is intended to run only on files * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. 
*/ - ret = -EINVAL; if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops || (!S_ISREG(file_inode(fd_file(f))->i_mode) && !S_ISBLK(file_inode(fd_file(f))->i_mode))) - goto out; + return -EINVAL; - ret = vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); -out: - fdput(f); - return ret; + return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); } SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) diff --git a/mm/shmem.c b/mm/shmem.c index 568bb290bdce..c7881e16f4be 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -40,6 +40,7 @@ #include <linux/fs_parser.h> #include <linux/swapfile.h> #include <linux/iversion.h> +#include <linux/unicode.h> #include "swap.h" static struct vfsmount *shm_mnt __ro_after_init; @@ -123,6 +124,10 @@ struct shmem_options { bool noswap; unsigned short quota_types; struct shmem_quota_limits qlimits; +#if IS_ENABLED(CONFIG_UNICODE) + struct unicode_map *encoding; + bool strict_encoding; +#endif #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 @@ -2749,13 +2754,62 @@ static int shmem_file_open(struct inode *inode, struct file *file) #ifdef CONFIG_TMPFS_XATTR static int shmem_initxattrs(struct inode *, const struct xattr *, void *); +#if IS_ENABLED(CONFIG_UNICODE) +/* + * shmem_inode_casefold_flags - Deal with casefold file attribute flag + * + * The casefold file attribute needs some special checks. I can just be added to + * an empty dir, and can't be removed from a non-empty dir. + */ +static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, + struct dentry *dentry, unsigned int *i_flags) +{ + unsigned int old = inode->i_flags; + struct super_block *sb = inode->i_sb; + + if (fsflags & FS_CASEFOLD_FL) { + if (!(old & S_CASEFOLD)) { + if (!sb->s_encoding) + return -EOPNOTSUPP; + + if (!S_ISDIR(inode->i_mode)) + return -ENOTDIR; + + if (dentry && !simple_empty(dentry)) + return -ENOTEMPTY; + } + + *i_flags = *i_flags | S_CASEFOLD; + } else if (old & S_CASEFOLD) { + if (dentry && !simple_empty(dentry)) + return -ENOTEMPTY; + } + + return 0; +} +#else +static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, + struct dentry *dentry, unsigned int *i_flags) +{ + if (fsflags & FS_CASEFOLD_FL) + return -EOPNOTSUPP; + + return 0; +} +#endif + /* * chattr's fsflags are unrelated to extended attributes, * but tmpfs has chosen to enable them under the same config option. */ -static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { unsigned int i_flags = 0; + int ret; + + ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags); + if (ret) + return ret; if (fsflags & FS_NOATIME_FL) i_flags |= S_NOATIME; @@ -2766,10 +2820,12 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) /* * But FS_NODUMP_FL does not require any action in i_flags. */ - inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); + inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD); + + return 0; } #else -static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { } #define shmem_initxattrs NULL @@ -2816,7 +2872,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, info->fsflags = (dir == NULL) ? 
0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; if (info->fsflags) - shmem_set_inode_flags(inode, info->fsflags); + shmem_set_inode_flags(inode, info->fsflags, NULL); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -3562,6 +3618,9 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode; int error; + if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) + return -EINVAL; + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -3581,7 +3640,12 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); - d_instantiate(dentry, inode); + + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_add(dentry, inode); + else + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ return error; @@ -3672,7 +3736,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ - d_instantiate(dentry, inode); + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_add(dentry, inode); + else + d_instantiate(dentry, inode); out: return ret; } @@ -3692,6 +3759,14 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) inode_inc_iversion(dir); drop_nlink(inode); dput(dentry); /* Undo the count from "create" - does all the work */ + + /* + * For now, VFS can't deal with case-insensitive negative dentries, so + * we invalidate them + */ + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_invalidate(dentry); + return 0; } @@ -3836,7 +3911,10 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); - d_instantiate(dentry, inode); + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_add(dentry, inode); + else + d_instantiate(dentry, inode); dget(dentry); return 0; @@ -3901,16 +3979,23 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap, { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); + int ret, flags; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) return -EOPNOTSUPP; - info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); - shmem_set_inode_flags(inode, info->fsflags); + ret = shmem_set_inode_flags(inode, flags, dentry); + + if (ret) + return ret; + + info->fsflags = flags; + inode_set_ctime_current(inode); inode_inc_iversion(inode); return 0; @@ -4189,6 +4274,9 @@ enum shmem_param { Opt_usrquota_inode_hardlimit, Opt_grpquota_block_hardlimit, Opt_grpquota_inode_hardlimit, + Opt_casefold_version, + Opt_casefold, + Opt_strict_encoding, }; static const struct constant_table shmem_param_enums_huge[] = { @@ -4220,9 +4308,54 @@ const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), #endif + fsparam_string("casefold", Opt_casefold_version), + fsparam_flag ("casefold", Opt_casefold), + fsparam_flag ("strict_encoding", Opt_strict_encoding), {} }; +#if IS_ENABLED(CONFIG_UNICODE) +static int 
shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, + bool latest_version) +{ + struct shmem_options *ctx = fc->fs_private; + unsigned int version = UTF8_LATEST; + struct unicode_map *encoding; + char *version_str = param->string + 5; + + if (!latest_version) { + if (strncmp(param->string, "utf8-", 5)) + return invalfc(fc, "Only UTF-8 encodings are supported " + "in the format: utf8-<version number>"); + + version = utf8_parse_version(version_str); + if (version < 0) + return invalfc(fc, "Invalid UTF-8 version: %s", version_str); + } + + encoding = utf8_load(version); + + if (IS_ERR(encoding)) { + return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n", + unicode_major(version), unicode_minor(version), + unicode_rev(version)); + } + + pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n", + unicode_major(version), unicode_minor(version), unicode_rev(version)); + + ctx->encoding = encoding; + + return 0; +} +#else +static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, + bool latest_version) +{ + return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); +} +#endif + static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { struct shmem_options *ctx = fc->fs_private; @@ -4381,6 +4514,17 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) "Group quota inode hardlimit too large."); ctx->qlimits.grpquota_ihardlimit = size; break; + case Opt_casefold_version: + return shmem_parse_opt_casefold(fc, param, false); + case Opt_casefold: + return shmem_parse_opt_casefold(fc, param, true); + case Opt_strict_encoding: +#if IS_ENABLED(CONFIG_UNICODE) + ctx->strict_encoding = true; + break; +#else + return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); +#endif } return 0; @@ -4610,6 +4754,11 @@ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); +#if IS_ENABLED(CONFIG_UNICODE) + if (sb->s_encoding) + utf8_unload(sb->s_encoding); +#endif + #ifdef CONFIG_TMPFS_QUOTA shmem_disable_quotas(sb); #endif @@ -4620,6 +4769,14 @@ static void shmem_put_super(struct super_block *sb) sb->s_fs_info = NULL; } +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS) +static const struct dentry_operations shmem_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, + .d_delete = always_delete_dentry, +}; +#endif + static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; @@ -4654,9 +4811,25 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) } sb->s_export_op = &shmem_export_ops; sb->s_flags |= SB_NOSEC | SB_I_VERSION; + +#if IS_ENABLED(CONFIG_UNICODE) + if (!ctx->encoding && ctx->strict_encoding) { + pr_err("tmpfs: strict_encoding option without encoding is forbidden\n"); + error = -EINVAL; + goto failed; + } + + if (ctx->encoding) { + sb->s_encoding = ctx->encoding; + sb->s_d_op = &shmem_ci_dentry_ops; + if (ctx->strict_encoding) + sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; + } +#endif + #else sb->s_flags |= SB_NOUSER; -#endif +#endif /* CONFIG_TMPFS */ sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; @@ -4930,6 +5103,10 @@ int shmem_init_fs_context(struct fs_context *fc) ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); +#if IS_ENABLED(CONFIG_UNICODE) + ctx->encoding = NULL; +#endif + fc->fs_private = ctx; fc->ops = 
&shmem_fs_context_ops; return 0; @@ -4943,9 +5120,69 @@ static struct file_system_type shmem_fs_type = { .parameters = shmem_fs_parameters, #endif .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, +}; + +#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define TMPFS_ATTR_W(_name, _store) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) + +#define TMPFS_ATTR_RW(_name, _show, _store) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define TMPFS_ATTR_RO(_name, _show) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + +#if IS_ENABLED(CONFIG_UNICODE) +static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a, + char *buf) +{ + return sysfs_emit(buf, "supported\n"); +} +TMPFS_ATTR_RO(casefold, casefold_show); +#endif + +static struct attribute *tmpfs_attributes[] = { +#if IS_ENABLED(CONFIG_UNICODE) + &tmpfs_attr_casefold.attr, +#endif + NULL }; +static const struct attribute_group tmpfs_attribute_group = { + .attrs = tmpfs_attributes, + .name = "features" +}; + +static struct kobject *tmpfs_kobj; + +static int __init tmpfs_sysfs_init(void) +{ + int ret; + + tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj); + if (!tmpfs_kobj) + return -ENOMEM; + + ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group); + if (ret) + kobject_put(tmpfs_kobj); + + return ret; +} +#endif /* CONFIG_SYSFS && CONFIG_TMPFS */ + void __init shmem_init(void) { int error; @@ -4969,6 +5206,14 @@ void __init shmem_init(void) goto out1; } +#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) + error = tmpfs_sysfs_init(); + if (error) { + pr_err("Could not init tmpfs sysfs\n"); + goto out1; + } +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; diff --git a/mm/slab_common.c b/mm/slab_common.c index 893d32059915..a7174455db9f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1332,6 +1332,25 @@ size_t ksize(const void *objp) } EXPORT_SYMBOL(ksize); +#ifdef CONFIG_BPF_SYSCALL +#include <linux/btf.h> + +__bpf_kfunc_start_defs(); + +__bpf_kfunc struct kmem_cache *bpf_get_kmem_cache(u64 addr) +{ + struct slab *slab; + + if (!virt_addr_valid((void *)(long)addr)) + return NULL; + + slab = virt_to_slab((void *)(long)addr); + return slab ? slab->slab_cache : NULL; +} + +__bpf_kfunc_end_defs(); +#endif /* CONFIG_BPF_SYSCALL */ + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/truncate.c b/mm/truncate.c index 0668cd340a46..09fa809f921d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -166,7 +166,6 @@ static void truncate_cleanup_folio(struct folio *folio) * Hence dirty accounting check is placed after invalidation. */ folio_cancel_dirty(folio); - folio_clear_mappedtodisk(folio); } int truncate_inode_folio(struct address_space *mapping, struct folio *folio) @@ -797,6 +796,21 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) */ if (folio_mkclean(folio)) folio_mark_dirty(folio); + + /* + * The post-eof range of the folio must be zeroed before it is exposed + * to the file. 
Writeback normally does this, but since i_size has been + * increased we handle it here. + */ + if (folio_test_dirty(folio)) { + unsigned int offset, end; + + offset = from - folio_pos(folio); + end = min_t(unsigned int, to - folio_pos(folio), + folio_size(folio)); + folio_zero_segment(folio, offset, end); + } + folio_unlock(folio); folio_put(folio); } |
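
The notes and examples below illustrate, mostly from userspace, the interfaces and techniques touched by the diff above. Every path, mount point and helper name introduced in them is illustrative rather than taken from the kernel tree.

Several hunks (mm/fadvise.c, mm/filemap.c, mm/readahead.c, mm/memcontrol-v1.c) replace fdget()/fdput() pairs with the scope-based CLASS(fd, f)(fd) guard, so the file reference is dropped automatically when f goes out of scope and error paths can simply return. The same idea can be sketched in plain C with the compiler cleanup attribute that the kernel's cleanup.h machinery builds on; scoped_fd and close_fd below are made-up names for the illustration, not kernel API:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Run close() automatically when the variable leaves scope. */
    static void close_fd(int *fd)
    {
        if (*fd >= 0)
            close(*fd);
    }
    #define scoped_fd __attribute__((cleanup(close_fd))) int

    static long file_size(const char *path)
    {
        scoped_fd fd = open(path, O_RDONLY);

        if (fd < 0)
            return -1;                  /* no explicit close() needed */

        return lseek(fd, 0, SEEK_END);  /* fd closed on return */
    }

    int main(void)
    {
        printf("%ld\n", file_size("/etc/hostname"));
        return 0;
    }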
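
The converted ksys_readahead() and ksys_fadvise64_64() entry points are reachable through readahead(2) and posix_fadvise(3). A minimal caller (the file path is illustrative; readahead(2) fails with EINVAL on files that cannot do readahead, which is exactly the check kept in mm/readahead.c above):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/var/log/syslog", O_RDONLY);   /* illustrative path */
        int err;

        if (fd < 0)
            return 1;

        if (readahead(fd, 0, 1 << 20) < 0)
            perror("readahead");

        /* posix_fadvise() returns the error number directly */
        err = posix_fadvise(fd, 0, 1 << 20, POSIX_FADV_WILLNEED);
        if (err)
            fprintf(stderr, "posix_fadvise: %d\n", err);

        close(fd);
        return 0;
    }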
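
folio_add_pins() (mm/gup.c) exists so that a folio pinned as a whole, for example by memfd_pin_folios(), can later be released in smaller pieces. A rough kernel-side sketch of that flow, using only helpers named in the kerneldoc above; the function and its call site are illustrative, not taken from an in-tree user:

    #include <linux/mm.h>

    /*
     * Sketch: the folio already carries one FOLL_PIN pin; add one pin per
     * remaining page so each page can be dropped individually later with
     * unpin_user_page_range_dirty_lock().
     */
    static int pin_pages_individually(struct folio *folio)
    {
        long nr = folio_nr_pages(folio);
        int err;

        err = folio_add_pins(folio, nr - 1);
        if (err)
            return err;

        /* later: release page by page (here all at once, not dirtying) */
        unpin_user_page_range_dirty_lock(folio_page(folio, 0), nr, false);
        return 0;
    }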
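
The memcg_write_event_control() rework above keeps the same userspace contract: a string "<event_fd> <control_fd> <args>" written to cgroup.event_control. Registering a memory-usage threshold looks roughly like this (paths assume a cgroup v1 memory hierarchy mounted at /sys/fs/cgroup/memory with a child group named demo):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
        int efd = eventfd(0, 0);
        int ufd = open("/sys/fs/cgroup/memory/demo/memory.usage_in_bytes", O_RDONLY);
        int cfd = open("/sys/fs/cgroup/memory/demo/cgroup.event_control", O_WRONLY);
        char cmd[64];
        uint64_t ticks;

        if (efd < 0 || ufd < 0 || cfd < 0)
            return 1;

        /* "<event_fd> <fd of memory.usage_in_bytes> <threshold>" */
        snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
        if (write(cfd, cmd, strlen(cmd)) < 0)
            return 1;

        /* blocks until usage crosses the 64 MiB threshold */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
            printf("threshold crossed\n");
        return 0;
    }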
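
The new mm/page_frag_cache.c stores the whole cache state in one unsigned long: the page's virtual address (page aligned, so the low bits are free), the allocation order and the pfmemalloc flag. The tagging trick itself, in portable C (the mask and bit values mirror the idea, not the kernel's headers):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE      4096UL
    #define ORDER_MASK     0x3UL   /* low bits of a page-aligned address are zero */
    #define PFMEMALLOC_BIT 0x4UL

    static uintptr_t encode(void *page_addr, unsigned int order, bool pfmemalloc)
    {
        return (uintptr_t)page_addr | (order & ORDER_MASK) |
               ((uintptr_t)pfmemalloc * PFMEMALLOC_BIT);
    }

    static void *decode_virt(uintptr_t e)         { return (void *)(e & ~(PAGE_SIZE - 1)); }
    static unsigned int decode_order(uintptr_t e) { return e & ORDER_MASK; }
    static bool decode_pfmemalloc(uintptr_t e)    { return e & PFMEMALLOC_BIT; }

    int main(void)
    {
        static __attribute__((aligned(4096))) char page[4096];
        uintptr_t e = encode(page, 3, true);

        assert(decode_virt(e) == (void *)page);
        assert(decode_order(e) == 3);
        assert(decode_pfmemalloc(e));
        return 0;
    }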
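
The rewritten __page_frag_alloc_align() counts the offset upward and rounds it with __ALIGN_KERNEL_MASK(nc->offset, ~align_mask); callers pass align_mask = ~(align - 1) for a power-of-two align, so this is the classic "add (align - 1), then mask" round-up. A quick check of that arithmetic:

    #include <assert.h>

    /* same shape as __ALIGN_KERNEL_MASK() in include/uapi/linux/const.h */
    #define ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))

    int main(void)
    {
        unsigned int align = 64;                 /* power of two */
        unsigned int align_mask = ~(align - 1);  /* what callers pass */

        /* ALIGN_MASK(offset, ~align_mask) rounds offset up to align */
        assert(ALIGN_MASK(0u, ~align_mask) == 0);
        assert(ALIGN_MASK(1u, ~align_mask) == 64);
        assert(ALIGN_MASK(64u, ~align_mask) == 64);
        assert(ALIGN_MASK(65u, ~align_mask) == 128);
        return 0;
    }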
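
The tmpfs hunks add the mount parameters casefold, casefold=utf8-<version> and strict_encoding, plus a feature directory under /sys/fs/tmpfs. A caller can probe the sysfs attribute and then mount with the new options (the mount point is assumed to exist, CAP_SYS_ADMIN and CONFIG_UNICODE are required, and the UTF-8 version string must be one the kernel's unicode layer ships; 12.1.0 is used only as an example):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mount.h>
    #include <unistd.h>

    int main(void)
    {
        char buf[32] = "";
        int fd = open("/sys/fs/tmpfs/features/casefold", O_RDONLY);

        /* the attribute reads "supported" when the kernel has the feature */
        if (fd < 0 || read(fd, buf, sizeof(buf) - 1) <= 0) {
            fprintf(stderr, "casefold not supported by this kernel\n");
            return 1;
        }
        close(fd);

        if (mount("tmpfs", "/mnt/ci", "tmpfs", 0,
                  "casefold=utf8-12.1.0,strict_encoding") < 0) {
            perror("mount");
            return 1;
        }
        return 0;
    }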
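
shmem_inode_casefold_flags() above only allows FS_CASEFOLD_FL to be set on an empty directory of a casefold-enabled tmpfs, and refuses to clear it from a non-empty one. From userspace this is the ordinary chattr +F path via FS_IOC_GETFLAGS/FS_IOC_SETFLAGS:

    #include <fcntl.h>
    #include <linux/fs.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        int fd, flags;

        if (argc != 2)
            return 1;

        fd = open(argv[1], O_RDONLY | O_DIRECTORY);  /* must be an empty dir */
        if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
            return 1;

        flags |= FS_CASEFOLD_FL;
        if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0) {
            perror("FS_IOC_SETFLAGS");  /* e.g. EOPNOTSUPP without casefold= */
            return 1;
        }

        close(fd);
        return 0;
    }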
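
Once a directory is casefolded, the d_add()-based instantiation and the generic_ci dentry operations wired up above make lookups case-insensitive. A small behavioral check, run inside such a directory (see the FS_IOC_SETFLAGS example):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("README.TXT", O_CREAT | O_WRONLY, 0644);

        if (fd < 0)
            return 1;
        close(fd);

        /* with S_CASEFOLD set on the parent, this resolves to the same inode */
        if (access("readme.txt", F_OK) == 0)
            printf("case-insensitive lookup works\n");
        else
            printf("lookup is case-sensitive here\n");
        return 0;
    }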
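
The bpf_get_kmem_cache() kfunc added in mm/slab_common.c maps a kernel address to the slab cache it was allocated from. A very rough BPF-side sketch of calling it; the section name and the field access are assumptions (the kfunc is only usable from program types it is registered for, and the usual libbpf/vmlinux.h build setup is assumed):

    // SPDX-License-Identifier: GPL-2.0
    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym;

    char _license[] SEC("license") = "GPL";

    SEC("iter/task")        /* assumption: a tracing context that may call the kfunc */
    int dump_cache(struct bpf_iter__task *ctx)
    {
        struct task_struct *task = ctx->task;
        struct kmem_cache *s;

        if (!task)
            return 0;

        s = bpf_get_kmem_cache((u64)task);
        if (s)
            bpf_printk("task_struct cache: %s", s->name);
        return 0;
    }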
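
The new zeroing in pagecache_isize_extended() (mm/truncate.c) covers bytes that were written through a mapping beyond EOF inside the last folio: once i_size is extended over them, they must read back as zeros rather than as the stale data. The semantics can be demonstrated on a filesystem that calls this helper when extending i_size (e.g. ext4); the file path is illustrative, and writing past EOF through a mapping is otherwise undefined, so this only shows the post-extension guarantee:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("isize-demo", O_CREAT | O_RDWR | O_TRUNC, 0600);
        char *map, buf[16];

        if (fd < 0 || ftruncate(fd, 100) < 0)
            return 1;

        map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
            return 1;

        memset(map + 200, 0xaa, 16);   /* scribble past EOF, inside the folio */
        if (ftruncate(fd, 300) < 0)    /* extend i_size over the scribble */
            return 1;

        if (pread(fd, buf, sizeof(buf), 200) == sizeof(buf))
            printf("byte at 200 after extend: %#x (expected 0)\n",
                   (unsigned char)buf[0]);
        return 0;
    }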