diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 17 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/cma.c | 10 | ||||
-rw-r--r-- | mm/debug_vm_pgtable.c | 12 | ||||
-rw-r--r-- | mm/filemap.c | 65 | ||||
-rw-r--r-- | mm/folio-compat.c | 2 | ||||
-rw-r--r-- | mm/gup.c | 2 | ||||
-rw-r--r-- | mm/huge_memory.c | 17 | ||||
-rw-r--r-- | mm/internal.h | 4 | ||||
-rw-r--r-- | mm/kfence/kfence_test.c | 7 | ||||
-rw-r--r-- | mm/kfence/report.c | 3 | ||||
-rw-r--r-- | mm/khugepaged.c | 13 | ||||
-rw-r--r-- | mm/memory.c | 13 | ||||
-rw-r--r-- | mm/migrate.c | 6 | ||||
-rw-r--r-- | mm/migrate_device.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 14 | ||||
-rw-r--r-- | mm/mprotect.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 4 | ||||
-rw-r--r-- | mm/page-writeback.c | 49 | ||||
-rw-r--r-- | mm/pagewalk.c | 2 | ||||
-rw-r--r-- | mm/percpu.c | 69 | ||||
-rw-r--r-- | mm/readahead.c | 13 | ||||
-rw-r--r-- | mm/shmem.c | 851 | ||||
-rw-r--r-- | mm/shmem_quota.c | 350 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/slab.h | 2 | ||||
-rw-r--r-- | mm/slab_common.c | 54 | ||||
-rw-r--r-- | mm/slub.c | 58 | ||||
-rw-r--r-- | mm/truncate.c | 4 | ||||
-rw-r--r-- | mm/userfaultfd.c | 2 | ||||
-rw-r--r-- | mm/util.c | 7 | ||||
-rw-r--r-- | mm/vmpressure.c | 8 |
32 files changed, 1220 insertions, 446 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 721dc88423c7..264a2df5ecf5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -336,6 +336,23 @@ config SLUB_CPU_PARTIAL which requires the taking of locks that may cause latency spikes. Typically one would choose no for a realtime system. +config RANDOM_KMALLOC_CACHES + default n + depends on SLUB && !SLUB_TINY + bool "Randomize slab caches for normal kmalloc" + help + A hardening feature that creates multiple copies of slab caches for + normal kmalloc allocation and makes kmalloc randomly pick one based + on code address, which makes the attackers more difficult to spray + vulnerable memory objects on the heap for the purpose of exploiting + memory vulnerabilities. + + Currently the number of copies is set to 16, a reasonably large value + that effectively diverges the memory objects allocated for different + subsystems or modules into different caches, at the expense of a + limited degree of memory and CPU overhead that relates to hardware and + system workload. + endmenu # SLAB allocator options config SHUFFLE_PAGE_ALLOCATOR diff --git a/mm/Makefile b/mm/Makefile index e6d9a1d5e84d..ec65984e2ade 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -51,7 +51,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ - compaction.o show_mem.o\ + compaction.o show_mem.o shmem_quota.o\ interval_tree.o list_lru.o workingset.o \ debug.o gup.o mmap_lock.o $(mmu-y) @@ -267,6 +267,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, if (alignment && !is_power_of_2(alignment)) return -EINVAL; + if (!IS_ENABLED(CONFIG_NUMA)) + nid = NUMA_NO_NODE; + /* Sanitise input arguments. */ alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); if (fixed && base & (alignment - 1)) { @@ -372,14 +375,15 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, if (ret) goto free_mem; - pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, - &base); + pr_info("Reserved %ld MiB at %pa on node %d\n", (unsigned long)size / SZ_1M, + &base, nid); return 0; free_mem: memblock_phys_free(base, size); err: - pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + pr_err("Failed to reserve %ld MiB on node %d\n", (unsigned long)size / SZ_1M, + nid); return ret; } diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index d61eaa075c75..48e329ea5ba3 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -109,10 +109,10 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(!pte_same(pte, pte)); WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte)))); WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte)))); - WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte)))); + WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte), args->vma))); WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte)))); WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte)))); - WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte)))); + WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte, args->vma)))); WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte)))); WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte)))); } @@ -156,7 +156,7 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) pte = pte_mkclean(pte); set_pte_at(args->mm, args->vaddr, args->ptep, pte); flush_dcache_page(page); - pte = pte_mkwrite(pte); + pte = pte_mkwrite(pte, args->vma); pte = pte_mkdirty(pte); ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1); pte = ptep_get(args->ptep); @@ -202,10 +202,10 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(!pmd_same(pmd, pmd)); WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd)))); WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd)))); - WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd)))); + WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd), args->vma))); WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd)))); WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd)))); - WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd)))); + WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd, args->vma)))); WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd)))); WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd)))); /* @@ -256,7 +256,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) pmd = pmd_mkclean(pmd); set_pmd_at(args->mm, vaddr, args->pmdp, pmd); flush_dcache_page(page); - pmd = pmd_mkwrite(pmd); + pmd = pmd_mkwrite(pmd, args->vma); pmd = pmd_mkdirty(pmd); pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1); pmd = READ_ONCE(*args->pmdp); diff --git a/mm/filemap.c b/mm/filemap.c index 014b73eb96a1..bf6219d9aaac 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1856,30 +1856,15 @@ out: * * Looks up the page cache entry at @mapping & @index. * - * @fgp_flags can be zero or more of these flags: - * - * * %FGP_ACCESSED - The folio will be marked accessed. - * * %FGP_LOCK - The folio is returned locked. - * * %FGP_CREAT - If no page is present then a new page is allocated using - * @gfp and added to the page cache and the VM's LRU list. - * The page is returned locked and with an increased refcount. - * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the - * page is already in cache. If the page was allocated, unlock it before - * returning so the caller can do the same dance. - * * %FGP_WRITE - The page will be written to by the caller. - * * %FGP_NOFS - __GFP_FS will get cleared in gfp. - * * %FGP_NOWAIT - Don't get blocked by page lock. - * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) - * * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even * if the %GFP flags specified for %FGP_CREAT are atomic. * - * If there is a page cache page, it is returned with an increased refcount. + * If this function returns a folio, it is returned with an increased refcount. * * Return: The found folio or an ERR_PTR() otherwise. */ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - int fgp_flags, gfp_t gfp) + fgf_t fgp_flags, gfp_t gfp) { struct folio *folio; @@ -1921,7 +1906,9 @@ repeat: folio_wait_stable(folio); no_page: if (!folio && (fgp_flags & FGP_CREAT)) { + unsigned order = FGF_GET_ORDER(fgp_flags); int err; + if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) @@ -1930,26 +1917,44 @@ no_page: gfp &= ~GFP_KERNEL; gfp |= GFP_NOWAIT | __GFP_NOWARN; } - - folio = filemap_alloc_folio(gfp, 0); - if (!folio) - return ERR_PTR(-ENOMEM); - if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; - /* Init accessed so avoid atomic mark_page_accessed later */ - if (fgp_flags & FGP_ACCESSED) - __folio_set_referenced(folio); + if (!mapping_large_folio_support(mapping)) + order = 0; + if (order > MAX_PAGECACHE_ORDER) + order = MAX_PAGECACHE_ORDER; + /* If we're not aligned, allocate a smaller folio */ + if (index & ((1UL << order) - 1)) + order = __ffs(index); - err = filemap_add_folio(mapping, folio, index, gfp); - if (unlikely(err)) { + do { + gfp_t alloc_gfp = gfp; + + err = -ENOMEM; + if (order == 1) + order = 0; + if (order > 0) + alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; + folio = filemap_alloc_folio(alloc_gfp, order); + if (!folio) + continue; + + /* Init accessed so avoid atomic mark_page_accessed later */ + if (fgp_flags & FGP_ACCESSED) + __folio_set_referenced(folio); + + err = filemap_add_folio(mapping, folio, index, gfp); + if (!err) + break; folio_put(folio); folio = NULL; - if (err == -EEXIST) - goto repeat; - } + } while (order-- > 0); + if (err == -EEXIST) + goto repeat; + if (err) + return ERR_PTR(err); /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. diff --git a/mm/folio-compat.c b/mm/folio-compat.c index c6f056c20503..10c3247542cb 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -92,7 +92,7 @@ EXPORT_SYMBOL(add_to_page_cache_lru); noinline struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, - int fgp_flags, gfp_t gfp) + fgf_t fgp_flags, gfp_t gfp) { struct folio *folio; @@ -1051,7 +1051,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) !writable_file_mapping_allowed(vma, gup_flags)) return -EFAULT; - if (!(vm_flags & VM_WRITE)) { + if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c4635f750255..064fbd90822b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -551,7 +551,7 @@ __setup("transparent_hugepage=", setup_transparent_hugepage); pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) - pmd = pmd_mkwrite(pmd); + pmd = pmd_mkwrite(pmd, vma); return pmd; } @@ -1566,7 +1566,7 @@ out_map: pmd = pmd_modify(oldpmd, vma->vm_page_prot); pmd = pmd_mkyoung(pmd); if (writable) - pmd = pmd_mkwrite(pmd); + pmd = pmd_mkwrite(pmd, vma); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); @@ -1675,6 +1675,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, */ orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, tlb->fullmm); + arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) @@ -1919,7 +1920,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, /* See change_pte_range(). */ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && can_change_pmd_writable(vma, addr, entry)) - entry = pmd_mkwrite(entry); + entry = pmd_mkwrite(entry, vma); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); @@ -2233,7 +2234,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } else { entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); if (write) - entry = pte_mkwrite(entry); + entry = pte_mkwrite(entry, vma); if (anon_exclusive) SetPageAnonExclusive(page + i); if (!young) @@ -2501,7 +2502,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct address_space *swap_cache = NULL; unsigned long offset = 0; unsigned int nr = thp_nr_pages(head); - int i; + int i, nr_dropped = 0; /* complete memcg works before add pages to LRU */ split_page_memcg(head, nr); @@ -2524,7 +2525,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct folio *tail = page_folio(head + i); if (shmem_mapping(head->mapping)) - shmem_uncharge(head->mapping->host, 1); + nr_dropped++; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); @@ -2561,6 +2562,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, } local_irq_enable(); + if (nr_dropped) + shmem_uncharge(head->mapping->host, nr_dropped); remap_page(folio, nr); if (folio_test_swapcache(folio)) @@ -3263,7 +3266,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) - pmde = pmd_mkwrite(pmde); + pmde = pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_mkuffd_wp(pmde); if (!is_migration_entry_young(entry)) diff --git a/mm/internal.h b/mm/internal.h index d1d4bf4e63c0..30cf724ddbce 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -556,14 +556,14 @@ static inline bool is_exec_mapping(vm_flags_t flags) } /* - * Stack area - automatically grows in one direction + * Stack area (including shadow stacks) * * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: * do_mmap() forbids all other combinations. */ static inline bool is_stack_mapping(vm_flags_t flags) { - return (flags & VM_STACK) == VM_STACK; + return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); } /* diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 9e008a336d9f..95b2b84c296d 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -212,7 +212,9 @@ static void test_cache_destroy(void) static inline size_t kmalloc_cache_alignment(size_t size) { - return kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]->align; + /* just to get ->align so no need to pass in the real caller */ + enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, 0); + return kmalloc_caches[type][__kmalloc_index(size, false)]->align; } /* Must always inline to match stack trace against caller. */ @@ -282,8 +284,9 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat if (is_kfence_address(alloc)) { struct slab *slab = virt_to_slab(alloc); + enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, _RET_IP_); struct kmem_cache *s = test_cache ?: - kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]; + kmalloc_caches[type][__kmalloc_index(size, false)]; /* * Verify that various helpers return the right values diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 197430a5be4a..c509aed326ce 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -13,6 +13,7 @@ #include <linux/printk.h> #include <linux/sched/debug.h> #include <linux/seq_file.h> +#include <linux/sprintf.h> #include <linux/stacktrace.h> #include <linux/string.h> #include <trace/events/error_report.h> @@ -26,8 +27,6 @@ #define ARCH_FUNC_PREFIX "" #endif -extern bool no_hash_pointers; - /* Helper function to either print to a seq_file or to console. */ __printf(2, 3) static void seq_con_printf(struct seq_file *seq, const char *fmt, ...) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d5650541083a..88433cc25d8a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1848,10 +1848,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto xa_locked; } } - if (!shmem_charge(mapping->host, 1)) { - result = SCAN_FAIL; - goto xa_locked; - } nr_none++; continue; } @@ -2037,8 +2033,13 @@ xa_unlocked: */ try_to_unmap_flush(); - if (result != SCAN_SUCCEED) + if (result == SCAN_SUCCEED && nr_none && + !shmem_charge(mapping->host, nr_none)) + result = SCAN_FAIL; + if (result != SCAN_SUCCEED) { + nr_none = 0; goto rollback; + } /* * The old pages are locked, so they won't change anymore. @@ -2177,8 +2178,8 @@ rollback: if (nr_none) { xas_lock_irq(&xas); mapping->nrpages -= nr_none; - shmem_uncharge(mapping->host, nr_none); xas_unlock_irq(&xas); + shmem_uncharge(mapping->host, nr_none); } list_for_each_entry_safe(page, tmp, &pagelist, lru) { diff --git a/mm/memory.c b/mm/memory.c index 00a5ce113090..6c264d2f969c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1430,6 +1430,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); @@ -4124,7 +4125,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) entry = mk_pte(&folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry)); + entry = pte_mkwrite(pte_mkdirty(entry), vma); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); @@ -4842,7 +4843,7 @@ out_map: pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (writable) - pte = pte_mkwrite(pte); + pte = pte_mkwrite(pte, vma); ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -5307,11 +5308,8 @@ EXPORT_SYMBOL_GPL(handle_mm_fault); static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { - /* Even if this succeeds, make it clear we *might* have slept */ - if (likely(mmap_read_trylock(mm))) { - might_sleep(); + if (likely(mmap_read_trylock(mm))) return true; - } if (regs && !user_mode(regs)) { unsigned long ip = instruction_pointer(regs); @@ -5744,6 +5742,9 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, if (mmap_read_lock_killable(mm)) return 0; + /* Untag the address before looking up the VMA */ + addr = untagged_addr_remote(mm, addr); + /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr) && !expand_stack(mm, addr)) return 0; diff --git a/mm/migrate.c b/mm/migrate.c index e21d5a7e7447..b7fa020003f3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -220,7 +220,7 @@ static bool remove_migration_pte(struct folio *folio, if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); if (is_writable_migration_entry(entry)) - pte = pte_mkwrite(pte); + pte = pte_mkwrite(pte, vma); else if (pte_swp_uffd_wp(old_pte)) pte = pte_mkuffd_wp(pte); @@ -684,7 +684,7 @@ int migrate_folio(struct address_space *mapping, struct folio *dst, } EXPORT_SYMBOL(migrate_folio); -#ifdef CONFIG_BLOCK +#ifdef CONFIG_BUFFER_HEAD /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, enum migrate_mode mode) @@ -837,7 +837,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping, return __buffer_migrate_folio(mapping, dst, src, mode, true); } EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); -#endif +#endif /* CONFIG_BUFFER_HEAD */ int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index d69131adc51c..8ac1f79f754a 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -624,7 +624,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, } entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry)); + entry = pte_mkwrite(pte_mkdirty(entry), vma); } ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); diff --git a/mm/mmap.c b/mm/mmap.c index 514ced13c65c..b56a7f0c9f85 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1182,11 +1182,11 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long pgoff, - unsigned long *populate, struct list_head *uf) + unsigned long flags, vm_flags_t vm_flags, + unsigned long pgoff, unsigned long *populate, + struct list_head *uf) { struct mm_struct *mm = current->mm; - vm_flags_t vm_flags; int pkey = 0; *populate = 0; @@ -1246,7 +1246,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) @@ -1564,7 +1564,7 @@ retry: gap = mas.index; gap += (info->align_offset - gap) & info->align_mask; tmp = mas_next(&mas, ULONG_MAX); - if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; mas_reset(&mas); @@ -1616,7 +1616,7 @@ retry: gap -= (gap - info->align_offset) & info->align_mask; gap_end = mas.last; tmp = mas_next(&mas, ULONG_MAX); - if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) <= gap_end) { high_limit = vm_start_gap(tmp); mas_reset(&mas); @@ -2998,7 +2998,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, file = get_file(vma->vm_file); ret = do_mmap(vma->vm_file, start, size, - prot, flags, pgoff, &populate, NULL); + prot, flags, 0, pgoff, &populate, NULL); fput(file); out: mmap_write_unlock(mm); diff --git a/mm/mprotect.c b/mm/mprotect.c index 130db91d3a8c..b94fbb45d5c7 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -185,7 +185,7 @@ static long change_pte_range(struct mmu_gather *tlb, if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pte_write(ptent) && can_change_pte_writable(vma, addr, ptent)) - ptent = pte_mkwrite(ptent); + ptent = pte_mkwrite(ptent, vma); ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); if (pte_needs_flush(oldpte, ptent)) diff --git a/mm/nommu.c b/mm/nommu.c index 8dba41cfc44d..7f9e9e5a0e12 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1016,6 +1016,7 @@ unsigned long do_mmap(struct file *file, unsigned long len, unsigned long prot, unsigned long flags, + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) @@ -1023,7 +1024,6 @@ unsigned long do_mmap(struct file *file, struct vm_area_struct *vma; struct vm_region *region; struct rb_node *rb; - vm_flags_t vm_flags; unsigned long capabilities, result; int ret; VMA_ITERATOR(vmi, current->mm, 0); @@ -1043,7 +1043,7 @@ unsigned long do_mmap(struct file *file, /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ - vm_flags = determine_vm_flags(file, prot, flags, capabilities); + vm_flags |= determine_vm_flags(file, prot, flags, capabilities); /* we're going to need to record the mapping */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d3f42009bb70..b8d3d7040a50 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1193,7 +1193,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb, * write_bandwidth = --------------------------------------------------- * period * - * @written may have decreased due to folio_account_redirty(). + * @written may have decreased due to folio_redirty_for_writepage(). * Avoid underflowing @bw calculation. */ bw = written - min(written, wb->written_stamp); @@ -2712,37 +2712,6 @@ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) EXPORT_SYMBOL(filemap_dirty_folio); /** - * folio_account_redirty - Manually account for redirtying a page. - * @folio: The folio which is being redirtied. - * - * Most filesystems should call folio_redirty_for_writepage() instead - * of this fuction. If your filesystem is doing writeback outside the - * context of a writeback_control(), it can call this when redirtying - * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED, - * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN, - * WB_WRITTEN) in long term. The mismatches will lead to systematic errors - * in balanced_dirty_ratelimit and the dirty pages position control. - */ -void folio_account_redirty(struct folio *folio) -{ - struct address_space *mapping = folio->mapping; - - if (mapping && mapping_can_writeback(mapping)) { - struct inode *inode = mapping->host; - struct bdi_writeback *wb; - struct wb_lock_cookie cookie = {}; - long nr = folio_nr_pages(folio); - - wb = unlocked_inode_to_wb_begin(inode, &cookie); - current->nr_dirtied -= nr; - node_stat_mod_folio(folio, NR_DIRTIED, -nr); - wb_stat_mod(wb, WB_DIRTIED, -nr); - unlocked_inode_to_wb_end(inode, &cookie); - } -} -EXPORT_SYMBOL(folio_account_redirty); - -/** * folio_redirty_for_writepage - Decline to write a dirty folio. * @wbc: The writeback control. * @folio: The folio. @@ -2757,13 +2726,23 @@ EXPORT_SYMBOL(folio_account_redirty); bool folio_redirty_for_writepage(struct writeback_control *wbc, struct folio *folio) { - bool ret; + struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); + bool ret; wbc->pages_skipped += nr; - ret = filemap_dirty_folio(folio->mapping, folio); - folio_account_redirty(folio); + ret = filemap_dirty_folio(mapping, folio); + if (mapping && mapping_can_writeback(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + wb = unlocked_inode_to_wb_begin(inode, &cookie); + current->nr_dirtied -= nr; + node_stat_mod_folio(folio, NR_DIRTIED, -nr); + wb_stat_mod(wb, WB_DIRTIED, -nr); + unlocked_inode_to_wb_end(inode, &cookie); + } return ret; } EXPORT_SYMBOL(folio_redirty_for_writepage); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 9b2d23fbf4d3..b7d7e4fcfad7 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -58,7 +58,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte = pte_offset_map(pmd, addr); if (pte) { err = walk_pte_range_inner(pte, addr, end, walk); - if (walk->mm != &init_mm) + if (walk->mm != &init_mm && addr < TASK_SIZE) pte_unmap(pte); } } else { diff --git a/mm/percpu.c b/mm/percpu.c index 28e07ede46f6..a7665de8485f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1890,13 +1890,15 @@ fail_unlock: fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); - if (!is_atomic && do_warn && warn_limit) { + if (do_warn && warn_limit) { pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", size, align, is_atomic, err); - dump_stack(); + if (!is_atomic) + dump_stack(); if (!--warn_limit) pr_info("limit reached, disable warning\n"); } + if (is_atomic) { /* see the flag handling in pcpu_balance_workfn() */ pcpu_atomic_alloc_failed = true; @@ -2581,14 +2583,12 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, { size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; size_t static_size, dyn_size; - struct pcpu_chunk *chunk; unsigned long *group_offsets; size_t *group_sizes; unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; - int map_size; unsigned long tmp_addr; size_t alloc_size; @@ -2615,7 +2615,6 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE)); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); - PCPU_SETUP_BUG_ON(!ai->dyn_size); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) || IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE))); @@ -2698,7 +2697,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_atom_size = ai->atom_size; - pcpu_chunk_struct_size = struct_size(chunk, populated, + pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated, BITS_TO_LONGS(pcpu_unit_pages)); pcpu_stats_save_ai(ai); @@ -2735,29 +2734,23 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, dyn_size = ai->dyn_size - (static_size - ai->static_size); /* - * Initialize first chunk. - * If the reserved_size is non-zero, this initializes the reserved - * chunk. If the reserved_size is zero, the reserved chunk is NULL - * and the dynamic region is initialized here. The first chunk, - * pcpu_first_chunk, will always point to the chunk that serves - * the dynamic region. + * Initialize first chunk: + * This chunk is broken up into 3 parts: + * < static | [reserved] | dynamic > + * - static - there is no backing chunk because these allocations can + * never be freed. + * - reserved (pcpu_reserved_chunk) - exists primarily to serve + * allocations from module load. + * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first + * chunk. */ tmp_addr = (unsigned long)base_addr + static_size; - map_size = ai->reserved_size ?: dyn_size; - chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); - - /* init dynamic chunk if necessary */ - if (ai->reserved_size) { - pcpu_reserved_chunk = chunk; + if (ai->reserved_size) + pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr, + ai->reserved_size); + tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size; + pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size); - tmp_addr = (unsigned long)base_addr + static_size + - ai->reserved_size; - map_size = dyn_size; - chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); - } - - /* link the first chunk in */ - pcpu_first_chunk = chunk; pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); @@ -3189,32 +3182,26 @@ void __init __weak pcpu_populate_pte(unsigned long addr) pmd_t *pmd; if (pgd_none(*pgd)) { - p4d_t *new; - - new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); - if (!new) + p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); + if (!p4d) goto err_alloc; - pgd_populate(&init_mm, pgd, new); + pgd_populate(&init_mm, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { - pud_t *new; - - new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - if (!new) + pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + if (!pud) goto err_alloc; - p4d_populate(&init_mm, p4d, new); + p4d_populate(&init_mm, p4d, pud); } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - pmd_t *new; - - new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); - if (!new) + pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + if (!pmd) goto err_alloc; - pud_populate(&init_mm, pud, new); + pud_populate(&init_mm, pud, pmd); } pmd = pmd_offset(pud, addr); diff --git a/mm/readahead.c b/mm/readahead.c index a9c999aa19af..e815c114de21 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -461,19 +461,6 @@ static int try_context_readahead(struct address_space *mapping, return 1; } -/* - * There are some parts of the kernel which assume that PMD entries - * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, - * limit the maximum allocation order to PMD size. I'm not aware of any - * assumptions about maximum order if THP are disabled, but 8 seems like - * a good order (that's 1MB if you're using 4kB pages) - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER -#else -#define MAX_PAGECACHE_ORDER 8 -#endif - static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { diff --git a/mm/shmem.c b/mm/shmem.c index 980289be5f63..02e62fccc80d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -78,6 +78,7 @@ static struct vfsmount *shm_mnt; #include <uapi/linux/memfd.h> #include <linux/rmap.h> #include <linux/uuid.h> +#include <linux/quotaops.h> #include <linux/uaccess.h> @@ -89,6 +90,9 @@ static struct vfsmount *shm_mnt; /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 +/* Pretend that one inode + its dentry occupy this much memory */ +#define BOGO_INODE_SIZE 1024 + /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 @@ -116,11 +120,14 @@ struct shmem_options { int huge; int seen; bool noswap; + unsigned short quota_types; + struct shmem_quota_limits qlimits; #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 #define SHMEM_SEEN_NOSWAP 16 +#define SHMEM_SEEN_QUOTA 32 }; #ifdef CONFIG_TMPFS @@ -133,7 +140,8 @@ static unsigned long shmem_default_max_inodes(void) { unsigned long nr_pages = totalram_pages(); - return min(nr_pages - totalhigh_pages(), nr_pages / 2); + return min3(nr_pages - totalhigh_pages(), nr_pages / 2, + ULONG_MAX / BOGO_INODE_SIZE); } #endif @@ -199,33 +207,47 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } -static inline bool shmem_inode_acct_block(struct inode *inode, long pages) +static int shmem_inode_acct_block(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + int err = -ENOSPC; if (shmem_acct_block(info->flags, pages)) - return false; + return err; + might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { if (percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks - pages) > 0) goto unacct; + + err = dquot_alloc_block_nodirty(inode, pages); + if (err) + goto unacct; + percpu_counter_add(&sbinfo->used_blocks, pages); + } else { + err = dquot_alloc_block_nodirty(inode, pages); + if (err) + goto unacct; } - return true; + return 0; unacct: shmem_unacct_blocks(info->flags, pages); - return false; + return err; } -static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) +static void shmem_inode_unacct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + might_sleep(); /* when quotas */ + dquot_free_block_nodirty(inode, pages); + if (sbinfo->max_blocks) percpu_counter_sub(&sbinfo->used_blocks, pages); shmem_unacct_blocks(info->flags, pages); @@ -254,6 +276,47 @@ bool vma_is_shmem(struct vm_area_struct *vma) static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); +#ifdef CONFIG_TMPFS_QUOTA + +static int shmem_enable_quotas(struct super_block *sb, + unsigned short quota_types) +{ + int type, err = 0; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; + for (type = 0; type < SHMEM_MAXQUOTAS; type++) { + if (!(quota_types & (1 << type))) + continue; + err = dquot_load_quota_sb(sb, type, QFMT_SHMEM, + DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + if (err) + goto out_err; + } + return 0; + +out_err: + pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n", + type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; +} + +static void shmem_disable_quotas(struct super_block *sb) +{ + int type; + + for (type = 0; type < SHMEM_MAXQUOTAS; type++) + dquot_quota_off(sb, type); +} + +static struct dquot **shmem_get_dquots(struct inode *inode) +{ + return SHMEM_I(inode)->i_dquot; +} +#endif /* CONFIG_TMPFS_QUOTA */ + /* * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and * produces a novel ino for the newly allocated inode. @@ -272,11 +335,11 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) if (!(sb->s_flags & SB_KERNMOUNT)) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { - if (!sbinfo->free_inodes) { + if (sbinfo->free_ispace < BOGO_INODE_SIZE) { raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } - sbinfo->free_inodes--; + sbinfo->free_ispace -= BOGO_INODE_SIZE; } if (inop) { ino = sbinfo->next_ino++; @@ -330,12 +393,12 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) return 0; } -static void shmem_free_inode(struct super_block *sb) +static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { raw_spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; + sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace; raw_spin_unlock(&sbinfo->stat_lock); } } @@ -343,62 +406,65 @@ static void shmem_free_inode(struct super_block *sb) /** * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc + * @alloced: the change in number of pages allocated to inode + * @swapped: the change in number of pages swapped from inode * * We have to calculate the free blocks since the mm can drop * undirtied hole pages behind our back. * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) - * - * It has to be called with the spinlock held. */ -static void shmem_recalc_inode(struct inode *inode) +static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); long freed; - freed = info->alloced - info->swapped - inode->i_mapping->nrpages; - if (freed > 0) { + spin_lock(&info->lock); + info->alloced += alloced; + info->swapped += swapped; + freed = info->alloced - info->swapped - + READ_ONCE(inode->i_mapping->nrpages); + /* + * Special case: whereas normally shmem_recalc_inode() is called + * after i_mapping->nrpages has already been adjusted (up or down), + * shmem_writepage() has to raise swapped before nrpages is lowered - + * to stop a racing shmem_recalc_inode() from thinking that a page has + * been freed. Compensate here, to avoid the need for a followup call. + */ + if (swapped > 0) + freed += swapped; + if (freed > 0) info->alloced -= freed; - inode->i_blocks -= freed * BLOCKS_PER_PAGE; + spin_unlock(&info->lock); + + /* The quota case may block */ + if (freed > 0) shmem_inode_unacct_blocks(inode, freed); - } } bool shmem_charge(struct inode *inode, long pages) { - struct shmem_inode_info *info = SHMEM_I(inode); - unsigned long flags; + struct address_space *mapping = inode->i_mapping; - if (!shmem_inode_acct_block(inode, pages)) + if (shmem_inode_acct_block(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ - inode->i_mapping->nrpages += pages; - - spin_lock_irqsave(&info->lock, flags); - info->alloced += pages; - inode->i_blocks += pages * BLOCKS_PER_PAGE; - shmem_recalc_inode(inode); - spin_unlock_irqrestore(&info->lock, flags); + xa_lock_irq(&mapping->i_pages); + mapping->nrpages += pages; + xa_unlock_irq(&mapping->i_pages); + shmem_recalc_inode(inode, pages, 0); return true; } void shmem_uncharge(struct inode *inode, long pages) { - struct shmem_inode_info *info = SHMEM_I(inode); - unsigned long flags; - + /* pages argument is currently unused: keep it to help debugging */ /* nrpages adjustment done by __filemap_remove_folio() or caller */ - spin_lock_irqsave(&info->lock, flags); - info->alloced -= pages; - inode->i_blocks -= pages * BLOCKS_PER_PAGE; - shmem_recalc_inode(inode); - spin_unlock_irqrestore(&info->lock, flags); - - shmem_inode_unacct_blocks(inode, pages); + shmem_recalc_inode(inode, 0, 0); } /* @@ -1040,16 +1106,13 @@ whole_folios: folio_batch_release(&fbatch); } - spin_lock_irq(&info->lock); - info->swapped -= nr_swaps_freed; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + shmem_recalc_inode(inode, 0, -nr_swaps_freed); } void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { shmem_undo_range(inode, lstart, lend, false); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -1061,11 +1124,9 @@ static int shmem_getattr(struct mnt_idmap *idmap, struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); - if (info->alloced - info->swapped != inode->i_mapping->nrpages) { - spin_lock_irq(&info->lock); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - } + if (info->alloced - info->swapped != inode->i_mapping->nrpages) + shmem_recalc_inode(inode, 0, 0); + if (info->fsflags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (info->fsflags & FS_IMMUTABLE_FL) @@ -1075,7 +1136,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); if (shmem_is_huge(inode, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; @@ -1142,13 +1203,28 @@ static int shmem_setattr(struct mnt_idmap *idmap, } } + if (is_quota_modification(idmap, inode, attr)) { + error = dquot_initialize(inode); + if (error) + return error; + } + + /* Transfer quota accounting */ + if (i_uid_needs_update(idmap, attr, inode) || + i_gid_needs_update(idmap, attr, inode)) { + error = dquot_transfer(idmap, inode, attr); + + if (error) + return error; + } + setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (!error && update_ctime) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (update_mtime) - inode->i_mtime = inode->i_ctime; + inode->i_mtime = inode_get_ctime(inode); inode_inc_iversion(inode); } return error; @@ -1158,6 +1234,7 @@ static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + size_t freed = 0; if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); @@ -1184,10 +1261,14 @@ static void shmem_evict_inode(struct inode *inode) } } - simple_xattrs_free(&info->xattrs); + simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); + shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); - shmem_free_inode(inode->i_sb); clear_inode(inode); +#ifdef CONFIG_TMPFS_QUOTA + dquot_free_inode(inode); + dquot_drop(inode); +#endif } static int shmem_find_swap_entries(struct address_space *mapping, @@ -1431,11 +1512,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (add_to_swap_cache(folio, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { - spin_lock_irq(&info->lock); - shmem_recalc_inode(inode); - info->swapped++; - spin_unlock_irq(&info->lock); - + shmem_recalc_inode(inode, 0, 1); swap_shmem_alloc(swap); shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); @@ -1590,13 +1667,14 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, struct shmem_inode_info *info = SHMEM_I(inode); struct folio *folio; int nr; - int err = -ENOSPC; + int err; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) huge = false; nr = huge ? HPAGE_PMD_NR : 1; - if (!shmem_inode_acct_block(inode, nr)) + err = shmem_inode_acct_block(inode, nr); + if (err) goto failed; if (huge) @@ -1705,7 +1783,6 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, struct folio *folio, swp_entry_t swap) { struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); swp_entry_t swapin_error; void *old; @@ -1718,16 +1795,12 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, folio_wait_writeback(folio); delete_from_swap_cache(folio); - spin_lock_irq(&info->lock); /* - * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't - * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in - * shmem_evict_inode. + * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks + * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) + * in shmem_evict_inode(). */ - info->alloced--; - info->swapped--; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + shmem_recalc_inode(inode, -1, -1); swap_free(swap); } @@ -1814,10 +1887,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (error) goto failed; - spin_lock_irq(&info->lock); - info->swapped--; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + shmem_recalc_inode(inode, 0, -1); if (sgp == SGP_WRITE) folio_mark_accessed(folio); @@ -1982,13 +2052,9 @@ alloc_nohuge: charge_mm); if (error) goto unacct; - folio_add_lru(folio); - spin_lock_irq(&info->lock); - info->alloced += folio_nr_pages(folio); - inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + folio_add_lru(folio); + shmem_recalc_inode(inode, folio_nr_pages(folio), 0); alloced = true; if (folio_test_pmd_mappable(folio) && @@ -2037,9 +2103,7 @@ clear: if (alloced) { folio_clear_dirty(folio); filemap_remove_folio(folio); - spin_lock_irq(&info->lock); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + shmem_recalc_inode(inode, 0, 0); } error = -EINVAL; goto unlock; @@ -2065,9 +2129,7 @@ unlock: folio_put(folio); } if (error == -ENOSPC && !once++) { - spin_lock_irq(&info->lock); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + shmem_recalc_inode(inode, 0, 0); goto repeat; } if (error == -EEXIST) @@ -2328,6 +2390,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static int shmem_file_open(struct inode *inode, struct file *file) +{ + file->f_mode |= FMODE_CAN_ODIRECT; + return generic_file_open(inode, file); +} + #ifdef CONFIG_TMPFS_XATTR static int shmem_initxattrs(struct inode *, const struct xattr *, void *); @@ -2357,78 +2425,128 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) #define shmem_initxattrs NULL #endif -static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, - struct inode *dir, umode_t mode, dev_t dev, - unsigned long flags) +static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) +{ + return &SHMEM_I(inode)->dir_offsets; +} + +static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, + struct inode *dir, umode_t mode, + dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; + int err; + + err = shmem_reserve_inode(sb, &ino); + if (err) + return ERR_PTR(err); - if (shmem_reserve_inode(sb, &ino)) - return NULL; inode = new_inode(sb); - if (inode) { - inode->i_ino = ino; - inode_init_owner(idmap, inode, dir, mode); - inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - inode->i_generation = get_random_u32(); - info = SHMEM_I(inode); - memset(info, 0, (char *)inode - (char *)info); - spin_lock_init(&info->lock); - atomic_set(&info->stop_eviction, 0); - info->seals = F_SEAL_SEAL; - info->flags = flags & VM_NORESERVE; - info->i_crtime = inode->i_mtime; - info->fsflags = (dir == NULL) ? 0 : - SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; - if (info->fsflags) - shmem_set_inode_flags(inode, info->fsflags); - INIT_LIST_HEAD(&info->shrinklist); - INIT_LIST_HEAD(&info->swaplist); - if (sbinfo->noswap) - mapping_set_unevictable(inode->i_mapping); - simple_xattrs_init(&info->xattrs); - cache_no_acl(inode); - mapping_set_large_folios(inode->i_mapping); - - switch (mode & S_IFMT) { - default: - inode->i_op = &shmem_special_inode_operations; - init_special_inode(inode, mode, dev); - break; - case S_IFREG: - inode->i_mapping->a_ops = &shmem_aops; - inode->i_op = &shmem_inode_operations; - inode->i_fop = &shmem_file_operations; - mpol_shared_policy_init(&info->policy, - shmem_get_sbmpol(sbinfo)); - break; - case S_IFDIR: - inc_nlink(inode); - /* Some things misbehave if size == 0 on a directory */ - inode->i_size = 2 * BOGO_DIRENT_SIZE; - inode->i_op = &shmem_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - break; - case S_IFLNK: - /* - * Must not load anything in the rbtree, - * mpol_free_shared_policy will not be called. - */ - mpol_shared_policy_init(&info->policy, NULL); - break; - } + if (!inode) { + shmem_free_inode(sb, 0); + return ERR_PTR(-ENOSPC); + } - lockdep_annotate_inode_mutex_key(inode); - } else - shmem_free_inode(sb); + inode->i_ino = ino; + inode_init_owner(idmap, inode, dir, mode); + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + inode->i_generation = get_random_u32(); + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); + spin_lock_init(&info->lock); + atomic_set(&info->stop_eviction, 0); + info->seals = F_SEAL_SEAL; + info->flags = flags & VM_NORESERVE; + info->i_crtime = inode->i_mtime; + info->fsflags = (dir == NULL) ? 0 : + SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; + if (info->fsflags) + shmem_set_inode_flags(inode, info->fsflags); + INIT_LIST_HEAD(&info->shrinklist); + INIT_LIST_HEAD(&info->swaplist); + INIT_LIST_HEAD(&info->swaplist); + if (sbinfo->noswap) + mapping_set_unevictable(inode->i_mapping); + simple_xattrs_init(&info->xattrs); + cache_no_acl(inode); + mapping_set_large_folios(inode->i_mapping); + + switch (mode & S_IFMT) { + default: + inode->i_op = &shmem_special_inode_operations; + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_mapping->a_ops = &shmem_aops; + inode->i_op = &shmem_inode_operations; + inode->i_fop = &shmem_file_operations; + mpol_shared_policy_init(&info->policy, + shmem_get_sbmpol(sbinfo)); + break; + case S_IFDIR: + inc_nlink(inode); + /* Some things misbehave if size == 0 on a directory */ + inode->i_size = 2 * BOGO_DIRENT_SIZE; + inode->i_op = &shmem_dir_inode_operations; + inode->i_fop = &simple_offset_dir_operations; + simple_offset_init(shmem_get_offset_ctx(inode)); + break; + case S_IFLNK: + /* + * Must not load anything in the rbtree, + * mpol_free_shared_policy will not be called. + */ + mpol_shared_policy_init(&info->policy, NULL); + break; + } + + lockdep_annotate_inode_mutex_key(inode); return inode; } +#ifdef CONFIG_TMPFS_QUOTA +static struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + int err; + struct inode *inode; + + inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); + if (IS_ERR(inode)) + return inode; + + err = dquot_initialize(inode); + if (err) + goto errout; + + err = dquot_alloc_inode(inode); + if (err) { + dquot_drop(inode); + goto errout; + } + return inode; + +errout: + inode->i_flags |= S_NOQUOTA; + iput(inode); + return ERR_PTR(err); +} +#else +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); +} +#endif /* CONFIG_TMPFS_QUOTA */ + #ifdef CONFIG_USERFAULTFD int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -2447,7 +2565,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, int ret; pgoff_t max_off; - if (!shmem_inode_acct_block(inode, 1)) { + if (shmem_inode_acct_block(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, * and now we find ourselves with -ENOMEM. Release the page, to @@ -2529,12 +2647,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, if (ret) goto out_delete_from_cache; - spin_lock_irq(&info->lock); - info->alloced++; - inode->i_blocks += BLOCKS_PER_PAGE; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - + shmem_recalc_inode(inode, 1, 0); folio_unlock(folio); return 0; out_delete_from_cache: @@ -2733,6 +2846,28 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval ? retval : error; } +static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + ret = file_remove_privs(file); + if (ret) + goto unlock; + ret = file_update_time(file); + if (ret) + goto unlock; + ret = generic_perform_write(iocb, from); +unlock: + inode_unlock(inode); + return ret; +} + static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -3057,7 +3192,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; - buf->f_ffree = sbinfo->free_inodes; + buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE; } /* else leave those fields 0 like simple_statfs */ @@ -3074,27 +3209,32 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; - int error = -ENOSPC; + int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); - if (inode) { - error = simple_acl_create(dir, inode); - if (error) - goto out_iput; - error = security_inode_init_security(inode, dir, - &dentry->d_name, - shmem_initxattrs, NULL); - if (error && error != -EOPNOTSUPP) - goto out_iput; + if (IS_ERR(inode)) + return PTR_ERR(inode); - error = 0; - dir->i_size += BOGO_DIRENT_SIZE; - dir->i_ctime = dir->i_mtime = current_time(dir); - inode_inc_iversion(dir); - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - } + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; + error = security_inode_init_security(inode, dir, + &dentry->d_name, + shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) + goto out_iput; + + error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (error) + goto out_iput; + + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_mtime = inode_set_ctime_current(dir); + inode_inc_iversion(dir); + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ return error; + out_iput: iput(inode); return error; @@ -3105,20 +3245,26 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; - int error = -ENOSPC; + int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); - if (inode) { - error = security_inode_init_security(inode, dir, - NULL, - shmem_initxattrs, NULL); - if (error && error != -EOPNOTSUPP) - goto out_iput; - error = simple_acl_create(dir, inode); - if (error) - goto out_iput; - d_tmpfile(file, inode); + + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; } + + error = security_inode_init_security(inode, dir, + NULL, + shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) + goto out_iput; + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; + d_tmpfile(file, inode); + +err_out: return finish_open_simple(file, error); out_iput: iput(inode); @@ -3164,8 +3310,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr goto out; } + ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (ret) { + if (inode->i_nlink) + shmem_free_inode(inode->i_sb, 0); + goto out; + } + dir->i_size += BOGO_DIRENT_SIZE; - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + dir->i_mtime = inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode)); inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ @@ -3180,10 +3334,13 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = d_inode(dentry); if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) - shmem_free_inode(inode->i_sb); + shmem_free_inode(inode->i_sb, 0); + + simple_offset_remove(shmem_get_offset_ctx(dir), dentry); dir->i_size -= BOGO_DIRENT_SIZE; - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + dir->i_mtime = inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode)); inode_inc_iversion(dir); drop_nlink(inode); dput(dentry); /* Undo the count from "create" - this does all the work */ @@ -3240,24 +3397,29 @@ static int shmem_rename2(struct mnt_idmap *idmap, { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); + int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) - return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); + return simple_offset_rename_exchange(old_dir, old_dentry, + new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { - int error; - error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) return error; } + simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry); + error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry); + if (error) + return error; + if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { @@ -3271,9 +3433,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, old_dir->i_size -= BOGO_DIRENT_SIZE; new_dir->i_size += BOGO_DIRENT_SIZE; - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - inode->i_ctime = current_time(old_dir); + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); return 0; @@ -3293,31 +3453,32 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); - if (!inode) - return -ENOSPC; + + if (IS_ERR(inode)) + return PTR_ERR(inode); error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error && error != -EOPNOTSUPP) { - iput(inode); - return error; - } + if (error && error != -EOPNOTSUPP) + goto out_iput; + + error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (error) + goto out_iput; inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { inode->i_link = kmemdup(symname, len, GFP_KERNEL); if (!inode->i_link) { - iput(inode); - return -ENOMEM; + error = -ENOMEM; + goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); - if (error) { - iput(inode); - return error; - } + if (error) + goto out_remove_offset; inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); @@ -3327,11 +3488,17 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); return 0; + +out_remove_offset: + simple_offset_remove(shmem_get_offset_ctx(dir), dentry); +out_iput: + iput(inode); + return error; } static void shmem_put_link(void *arg) @@ -3399,7 +3566,7 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap, (fa->flags & SHMEM_FL_USER_MODIFIABLE); shmem_set_inode_flags(inode, info->fsflags); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); return 0; } @@ -3419,21 +3586,40 @@ static int shmem_initxattrs(struct inode *inode, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; struct simple_xattr *new_xattr; + size_t ispace = 0; size_t len; + if (sbinfo->max_inodes) { + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + ispace += simple_xattr_space(xattr->name, + xattr->value_len + XATTR_SECURITY_PREFIX_LEN); + } + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_ispace < ispace) + ispace = 0; + else + sbinfo->free_ispace -= ispace; + raw_spin_unlock(&sbinfo->stat_lock); + if (!ispace) + return -ENOSPC; + } + } + for (xattr = xattr_array; xattr->name != NULL; xattr++) { new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) - return -ENOMEM; + break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { kvfree(new_xattr); - return -ENOMEM; + break; } memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, @@ -3444,6 +3630,16 @@ static int shmem_initxattrs(struct inode *inode, simple_xattr_add(&info->xattrs, new_xattr); } + if (xattr->name != NULL) { + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_ispace += ispace; + raw_spin_unlock(&sbinfo->stat_lock); + } + simple_xattrs_free(&info->xattrs, NULL); + return -ENOMEM; + } + return 0; } @@ -3464,15 +3660,40 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); - int err; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct simple_xattr *old_xattr; + size_t ispace = 0; name = xattr_full_name(handler, name); - err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); - if (!err) { - inode->i_ctime = current_time(inode); + if (value && sbinfo->max_inodes) { + ispace = simple_xattr_space(name, size); + raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_ispace < ispace) + ispace = 0; + else + sbinfo->free_ispace -= ispace; + raw_spin_unlock(&sbinfo->stat_lock); + if (!ispace) + return -ENOSPC; + } + + old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); + if (!IS_ERR(old_xattr)) { + ispace = 0; + if (old_xattr && sbinfo->max_inodes) + ispace = simple_xattr_space(old_xattr->name, + old_xattr->size); + simple_xattr_free(old_xattr); + old_xattr = NULL; + inode_set_ctime_current(inode); inode_inc_iversion(inode); } - return err; + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_ispace += ispace; + raw_spin_unlock(&sbinfo->stat_lock); + } + return PTR_ERR(old_xattr); } static const struct xattr_handler shmem_security_xattr_handler = { @@ -3487,9 +3708,16 @@ static const struct xattr_handler shmem_trusted_xattr_handler = { .set = shmem_xattr_handler_set, }; +static const struct xattr_handler shmem_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; + static const struct xattr_handler *shmem_xattr_handlers[] = { &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, + &shmem_user_xattr_handler, NULL }; @@ -3502,6 +3730,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) static const struct inode_operations shmem_short_symlink_operations = { .getattr = shmem_getattr, + .setattr = shmem_setattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3510,6 +3739,7 @@ static const struct inode_operations shmem_short_symlink_operations = { static const struct inode_operations shmem_symlink_inode_operations = { .getattr = shmem_getattr, + .setattr = shmem_setattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3609,6 +3839,13 @@ enum shmem_param { Opt_inode32, Opt_inode64, Opt_noswap, + Opt_quota, + Opt_usrquota, + Opt_grpquota, + Opt_usrquota_block_hardlimit, + Opt_usrquota_inode_hardlimit, + Opt_grpquota_block_hardlimit, + Opt_grpquota_inode_hardlimit, }; static const struct constant_table shmem_param_enums_huge[] = { @@ -3631,6 +3868,15 @@ const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_flag ("inode32", Opt_inode32), fsparam_flag ("inode64", Opt_inode64), fsparam_flag ("noswap", Opt_noswap), +#ifdef CONFIG_TMPFS_QUOTA + fsparam_flag ("quota", Opt_quota), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), + fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), + fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), + fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), +#endif {} }; @@ -3641,6 +3887,8 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) unsigned long long size; char *rest; int opt; + kuid_t kuid; + kgid_t kgid; opt = fs_parse(fc, shmem_fs_parameters, param, &result); if (opt < 0) @@ -3662,13 +3910,13 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); - if (*rest || ctx->blocks > S64_MAX) + if (*rest || ctx->blocks > LONG_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_inodes: ctx->inodes = memparse(param->string, &rest); - if (*rest) + if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) goto bad_value; ctx->seen |= SHMEM_SEEN_INODES; break; @@ -3676,14 +3924,32 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->mode = result.uint_32 & 07777; break; case Opt_uid: - ctx->uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(ctx->uid)) + kuid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(kuid)) + goto bad_value; + + /* + * The requested uid must be representable in the + * filesystem's idmapping. + */ + if (!kuid_has_mapping(fc->user_ns, kuid)) goto bad_value; + + ctx->uid = kuid; break; case Opt_gid: - ctx->gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(ctx->gid)) + kgid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(kgid)) + goto bad_value; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fc->user_ns, kgid)) goto bad_value; + + ctx->gid = kgid; break; case Opt_huge: ctx->huge = result.uint_32; @@ -3722,6 +3988,60 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->noswap = true; ctx->seen |= SHMEM_SEEN_NOSWAP; break; + case Opt_quota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); + break; + case Opt_usrquota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= QTYPE_MASK_USR; + break; + case Opt_grpquota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= QTYPE_MASK_GRP; + break; + case Opt_usrquota_block_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) + return invalfc(fc, + "User quota block hardlimit too large."); + ctx->qlimits.usrquota_bhardlimit = size; + break; + case Opt_grpquota_block_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) + return invalfc(fc, + "Group quota block hardlimit too large."); + ctx->qlimits.grpquota_bhardlimit = size; + break; + case Opt_usrquota_inode_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_INO_LIMIT) + return invalfc(fc, + "User quota inode hardlimit too large."); + ctx->qlimits.usrquota_ihardlimit = size; + break; + case Opt_grpquota_inode_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_INO_LIMIT) + return invalfc(fc, + "Group quota inode hardlimit too large."); + ctx->qlimits.grpquota_ihardlimit = size; + break; } return 0; @@ -3777,21 +4097,17 @@ static int shmem_parse_options(struct fs_context *fc, void *data) /* * Reconfigure a shmem filesystem. - * - * Note that we disallow change from limited->unlimited blocks/inodes while any - * are in use; but we must separately disallow unlimited->limited, because in - * that case we have no record of how much is already in use. */ static int shmem_reconfigure(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); - unsigned long inodes; + unsigned long used_isp; struct mempolicy *mpol = NULL; const char *err; raw_spin_lock(&sbinfo->stat_lock); - inodes = sbinfo->max_inodes - sbinfo->free_inodes; + used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { @@ -3809,7 +4125,7 @@ static int shmem_reconfigure(struct fs_context *fc) err = "Cannot retroactively limit inodes"; goto out; } - if (ctx->inodes < inodes) { + if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { err = "Too few inodes for current use"; goto out; } @@ -3829,6 +4145,24 @@ static int shmem_reconfigure(struct fs_context *fc) goto out; } + if (ctx->seen & SHMEM_SEEN_QUOTA && + !sb_any_quota_loaded(fc->root->d_sb)) { + err = "Cannot enable quota on remount"; + goto out; + } + +#ifdef CONFIG_TMPFS_QUOTA +#define CHANGED_LIMIT(name) \ + (ctx->qlimits.name## hardlimit && \ + (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) + + if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || + CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { + err = "Cannot change global quota limit on remount"; + goto out; + } +#endif /* CONFIG_TMPFS_QUOTA */ + if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; if (ctx->seen & SHMEM_SEEN_INUMS) @@ -3837,7 +4171,7 @@ static int shmem_reconfigure(struct fs_context *fc) sbinfo->max_blocks = ctx->blocks; if (ctx->seen & SHMEM_SEEN_INODES) { sbinfo->max_inodes = ctx->inodes; - sbinfo->free_inodes = ctx->inodes - inodes; + sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; } /* @@ -3919,6 +4253,9 @@ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); +#ifdef CONFIG_TMPFS_QUOTA + shmem_disable_quotas(sb); +#endif free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); @@ -3931,12 +4268,13 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; + int error = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) - return -ENOMEM; + return error; sb->s_fs_info = sbinfo; @@ -3963,7 +4301,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOUSER; #endif sbinfo->max_blocks = ctx->blocks; - sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; + sbinfo->max_inodes = ctx->inodes; + sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; if (sb->s_flags & SB_KERNMOUNT) { sbinfo->ino_batch = alloc_percpu(ino_t); if (!sbinfo->ino_batch) @@ -3997,10 +4336,27 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #endif uuid_gen(&sb->s_uuid); +#ifdef CONFIG_TMPFS_QUOTA + if (ctx->seen & SHMEM_SEEN_QUOTA) { + sb->dq_op = &shmem_quota_operations; + sb->s_qcop = &dquot_quotactl_sysfile_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + + /* Copy the default limits from ctx into sbinfo */ + memcpy(&sbinfo->qlimits, &ctx->qlimits, + sizeof(struct shmem_quota_limits)); + + if (shmem_enable_quotas(sb, ctx->quota_types)) + goto failed; + } +#endif /* CONFIG_TMPFS_QUOTA */ + inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); - if (!inode) + if (IS_ERR(inode)) { + error = PTR_ERR(inode); goto failed; + } inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; sb->s_root = d_make_root(inode); @@ -4010,7 +4366,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) failed: shmem_put_super(sb); - return -ENOMEM; + return error; } static int shmem_get_tree(struct fs_context *fc) @@ -4060,6 +4416,8 @@ static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); + if (S_ISDIR(inode->i_mode)) + simple_offset_destroy(shmem_get_offset_ctx(inode)); } static void shmem_init_inode(void *foo) @@ -4103,12 +4461,12 @@ EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, - .open = generic_file_open, + .open = shmem_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, .read_iter = shmem_file_read_iter, - .write_iter = generic_file_write_iter, + .write_iter = shmem_file_write_iter, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = iter_file_splice_write, @@ -4140,6 +4498,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .mknod = shmem_mknod, .rename = shmem_rename2, .tmpfile = shmem_tmpfile, + .get_offset_ctx = shmem_get_offset_ctx, #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -4171,6 +4530,9 @@ static const struct super_operations shmem_ops = { .statfs = shmem_statfs, .show_options = shmem_show_options, #endif +#ifdef CONFIG_TMPFS_QUOTA + .get_dquots = shmem_get_dquots, +#endif .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, @@ -4224,7 +4586,7 @@ static struct file_system_type shmem_fs_type = { #endif .kill_sb = kill_litter_super, #ifdef CONFIG_SHMEM - .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, #else .fs_flags = FS_USERNS_MOUNT, #endif @@ -4236,6 +4598,14 @@ void __init shmem_init(void) shmem_init_inodecache(); +#ifdef CONFIG_TMPFS_QUOTA + error = register_quota_format(&shmem_quota_format); + if (error < 0) { + pr_err("Could not register quota format\n"); + goto out3; + } +#endif + error = register_filesystem(&shmem_fs_type); if (error) { pr_err("Could not register tmpfs\n"); @@ -4260,6 +4630,10 @@ void __init shmem_init(void) out1: unregister_filesystem(&shmem_fs_type); out2: +#ifdef CONFIG_TMPFS_QUOTA + unregister_quota_format(&shmem_quota_format); +out3: +#endif shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); } @@ -4379,10 +4753,16 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops #define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations -#define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); + return inode ? inode : ERR_PTR(-ENOSPC); +} + #endif /* CONFIG_SHMEM */ /* common code */ @@ -4407,9 +4787,10 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); - if (unlikely(!inode)) { + + if (IS_ERR(inode)) { shmem_unacct_size(flags, size); - return ERR_PTR(-ENOSPC); + return ERR_CAST(inode); } inode->i_flags |= i_flags; inode->i_size = size; diff --git a/mm/shmem_quota.c b/mm/shmem_quota.c new file mode 100644 index 000000000000..062d1c1097ae --- /dev/null +++ b/mm/shmem_quota.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * In memory quota format relies on quota infrastructure to store dquot + * information for us. While conventional quota formats for file systems + * with persistent storage can load quota information into dquot from the + * storage on-demand and hence quota dquot shrinker can free any dquot + * that is not currently being used, it must be avoided here. Otherwise we + * can lose valuable information, user provided limits, because there is + * no persistent storage to load the information from afterwards. + * + * One information that in-memory quota format needs to keep track of is + * a sorted list of ids for each quota type. This is done by utilizing + * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota + * type. + * + * This format can be used to support quota on file system without persistent + * storage such as tmpfs. + * + * Author: Lukas Czerner <lczerner@redhat.com> + * Carlos Maiolino <cmaiolino@redhat.com> + * + * Copyright (C) 2023 Red Hat, Inc. + */ +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/shmem_fs.h> + +#include <linux/quotaops.h> +#include <linux/quota.h> + +#ifdef CONFIG_TMPFS_QUOTA + +/* + * The following constants define the amount of time given a user + * before the soft limits are treated as hard limits (usually resulting + * in an allocation failure). The timer is started when the user crosses + * their soft limit, it is reset when they go below their soft limit. + */ +#define SHMEM_MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#define SHMEM_MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ + +struct quota_id { + struct rb_node node; + qid_t id; + qsize_t bhardlimit; + qsize_t bsoftlimit; + qsize_t ihardlimit; + qsize_t isoftlimit; +}; + +static int shmem_check_quota_file(struct super_block *sb, int type) +{ + /* There is no real quota file, nothing to do */ + return 1; +} + +/* + * There is no real quota file. Just allocate rb_root for quota ids and + * set limits + */ +static int shmem_read_file_info(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct mem_dqinfo *info = &dqopt->info[type]; + + info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS); + if (!info->dqi_priv) + return -ENOMEM; + + info->dqi_max_spc_limit = SHMEM_QUOTA_MAX_SPC_LIMIT; + info->dqi_max_ino_limit = SHMEM_QUOTA_MAX_INO_LIMIT; + + info->dqi_bgrace = SHMEM_MAX_DQ_TIME; + info->dqi_igrace = SHMEM_MAX_IQ_TIME; + info->dqi_flags = 0; + + return 0; +} + +static int shmem_write_file_info(struct super_block *sb, int type) +{ + /* There is no real quota file, nothing to do */ + return 0; +} + +/* + * Free all the quota_id entries in the rb tree and rb_root. + */ +static int shmem_free_file_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = &sb_dqopt(sb)->info[type]; + struct rb_root *root = info->dqi_priv; + struct quota_id *entry; + struct rb_node *node; + + info->dqi_priv = NULL; + node = rb_first(root); + while (node) { + entry = rb_entry(node, struct quota_id, node); + node = rb_next(&entry->node); + + rb_erase(&entry->node, root); + kfree(entry); + } + + kfree(root); + return 0; +} + +static int shmem_get_next_id(struct super_block *sb, struct kqid *qid) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, qid->type); + struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node; + qid_t id = from_kqid(&init_user_ns, *qid); + struct quota_info *dqopt = sb_dqopt(sb); + struct quota_id *entry = NULL; + int ret = 0; + + if (!sb_has_quota_active(sb, qid->type)) + return -ESRCH; + + down_read(&dqopt->dqio_sem); + while (node) { + entry = rb_entry(node, struct quota_id, node); + + if (id < entry->id) + node = node->rb_left; + else if (id > entry->id) + node = node->rb_right; + else + goto got_next_id; + } + + if (!entry) { + ret = -ENOENT; + goto out_unlock; + } + + if (id > entry->id) { + node = rb_next(&entry->node); + if (!node) { + ret = -ENOENT; + goto out_unlock; + } + entry = rb_entry(node, struct quota_id, node); + } + +got_next_id: + *qid = make_kqid(&init_user_ns, qid->type, entry->id); +out_unlock: + up_read(&dqopt->dqio_sem); + return ret; +} + +/* + * Load dquot with limits from existing entry, or create the new entry if + * it does not exist. + */ +static int shmem_acquire_dquot(struct dquot *dquot) +{ + struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type); + struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node; + struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info; + struct rb_node *parent = NULL, *new_node = NULL; + struct quota_id *new_entry, *entry; + qid_t id = from_kqid(&init_user_ns, dquot->dq_id); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + int ret = 0; + + mutex_lock(&dquot->dq_lock); + + down_write(&dqopt->dqio_sem); + while (*n) { + parent = *n; + entry = rb_entry(parent, struct quota_id, node); + + if (id < entry->id) + n = &(*n)->rb_left; + else if (id > entry->id) + n = &(*n)->rb_right; + else + goto found; + } + + /* We don't have entry for this id yet, create it */ + new_entry = kzalloc(sizeof(struct quota_id), GFP_NOFS); + if (!new_entry) { + ret = -ENOMEM; + goto out_unlock; + } + + new_entry->id = id; + if (dquot->dq_id.type == USRQUOTA) { + new_entry->bhardlimit = sbinfo->qlimits.usrquota_bhardlimit; + new_entry->ihardlimit = sbinfo->qlimits.usrquota_ihardlimit; + } else if (dquot->dq_id.type == GRPQUOTA) { + new_entry->bhardlimit = sbinfo->qlimits.grpquota_bhardlimit; + new_entry->ihardlimit = sbinfo->qlimits.grpquota_ihardlimit; + } + + new_node = &new_entry->node; + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, (struct rb_root *)info->dqi_priv); + entry = new_entry; + +found: + /* Load the stored limits from the tree */ + spin_lock(&dquot->dq_dqb_lock); + dquot->dq_dqb.dqb_bhardlimit = entry->bhardlimit; + dquot->dq_dqb.dqb_bsoftlimit = entry->bsoftlimit; + dquot->dq_dqb.dqb_ihardlimit = entry->ihardlimit; + dquot->dq_dqb.dqb_isoftlimit = entry->isoftlimit; + + if (!dquot->dq_dqb.dqb_bhardlimit && + !dquot->dq_dqb.dqb_bsoftlimit && + !dquot->dq_dqb.dqb_ihardlimit && + !dquot->dq_dqb.dqb_isoftlimit) + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dquot->dq_dqb_lock); + + /* Make sure flags update is visible after dquot has been filled */ + smp_mb__before_atomic(); + set_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out_unlock: + up_write(&dqopt->dqio_sem); + mutex_unlock(&dquot->dq_lock); + return ret; +} + +static bool shmem_is_empty_dquot(struct dquot *dquot) +{ + struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info; + qsize_t bhardlimit; + qsize_t ihardlimit; + + if (dquot->dq_id.type == USRQUOTA) { + bhardlimit = sbinfo->qlimits.usrquota_bhardlimit; + ihardlimit = sbinfo->qlimits.usrquota_ihardlimit; + } else if (dquot->dq_id.type == GRPQUOTA) { + bhardlimit = sbinfo->qlimits.grpquota_bhardlimit; + ihardlimit = sbinfo->qlimits.grpquota_ihardlimit; + } + + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + (dquot->dq_dqb.dqb_curspace == 0 && + dquot->dq_dqb.dqb_curinodes == 0 && + dquot->dq_dqb.dqb_bhardlimit == bhardlimit && + dquot->dq_dqb.dqb_ihardlimit == ihardlimit)) + return true; + + return false; +} +/* + * Store limits from dquot in the tree unless it's fake. If it is fake + * remove the id from the tree since there is no useful information in + * there. + */ +static int shmem_release_dquot(struct dquot *dquot) +{ + struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type); + struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node; + qid_t id = from_kqid(&init_user_ns, dquot->dq_id); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + struct quota_id *entry = NULL; + + mutex_lock(&dquot->dq_lock); + /* Check whether we are not racing with some other dqget() */ + if (dquot_is_busy(dquot)) + goto out_dqlock; + + down_write(&dqopt->dqio_sem); + while (node) { + entry = rb_entry(node, struct quota_id, node); + + if (id < entry->id) + node = node->rb_left; + else if (id > entry->id) + node = node->rb_right; + else + goto found; + } + + /* We should always find the entry in the rb tree */ + WARN_ONCE(1, "quota id %u from dquot %p, not in rb tree!\n", id, dquot); + up_write(&dqopt->dqio_sem); + mutex_unlock(&dquot->dq_lock); + return -ENOENT; + +found: + if (shmem_is_empty_dquot(dquot)) { + /* Remove entry from the tree */ + rb_erase(&entry->node, info->dqi_priv); + kfree(entry); + } else { + /* Store the limits in the tree */ + spin_lock(&dquot->dq_dqb_lock); + entry->bhardlimit = dquot->dq_dqb.dqb_bhardlimit; + entry->bsoftlimit = dquot->dq_dqb.dqb_bsoftlimit; + entry->ihardlimit = dquot->dq_dqb.dqb_ihardlimit; + entry->isoftlimit = dquot->dq_dqb.dqb_isoftlimit; + spin_unlock(&dquot->dq_dqb_lock); + } + + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); + up_write(&dqopt->dqio_sem); + +out_dqlock: + mutex_unlock(&dquot->dq_lock); + return 0; +} + +static int shmem_mark_dquot_dirty(struct dquot *dquot) +{ + return 0; +} + +static int shmem_dquot_write_info(struct super_block *sb, int type) +{ + return 0; +} + +static const struct quota_format_ops shmem_format_ops = { + .check_quota_file = shmem_check_quota_file, + .read_file_info = shmem_read_file_info, + .write_file_info = shmem_write_file_info, + .free_file_info = shmem_free_file_info, +}; + +struct quota_format_type shmem_quota_format = { + .qf_fmt_id = QFMT_SHMEM, + .qf_ops = &shmem_format_ops, + .qf_owner = THIS_MODULE +}; + +const struct dquot_operations shmem_quota_operations = { + .acquire_dquot = shmem_acquire_dquot, + .release_dquot = shmem_release_dquot, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, + .write_info = shmem_dquot_write_info, + .mark_dirty = shmem_mark_dquot_dirty, + .get_next_id = shmem_get_next_id, +}; +#endif /* CONFIG_TMPFS_QUOTA */ diff --git a/mm/slab.c b/mm/slab.c index 88194391d553..9ad3d0f2d1a5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1670,7 +1670,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, if (freelist_size > KMALLOC_MAX_CACHE_SIZE) { freelist_cache_size = PAGE_SIZE << get_order(freelist_size); } else { - freelist_cache = kmalloc_slab(freelist_size, 0u); + freelist_cache = kmalloc_slab(freelist_size, 0u, _RET_IP_); if (!freelist_cache) continue; freelist_cache_size = freelist_cache->size; diff --git a/mm/slab.h b/mm/slab.h index 9c0e09d0f81f..799a315695c6 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -282,7 +282,7 @@ void setup_kmalloc_cache_index_table(void); void create_kmalloc_caches(slab_flags_t); /* Find the kmalloc slab corresponding for a certain size */ -struct kmem_cache *kmalloc_slab(size_t, gfp_t); +struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller); void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, size_t orig_size, diff --git a/mm/slab_common.c b/mm/slab_common.c index d1555ea2981a..cd71f9581e67 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -678,6 +678,11 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init = { /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ }; EXPORT_SYMBOL(kmalloc_caches); +#ifdef CONFIG_RANDOM_KMALLOC_CACHES +unsigned long random_kmalloc_seed __ro_after_init; +EXPORT_SYMBOL(random_kmalloc_seed); +#endif + /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. This is necessary for slabs < 192 since we have non power @@ -720,7 +725,7 @@ static inline unsigned int size_index_elem(unsigned int bytes) * Find the kmem_cache structure that serves a given size of * allocation */ -struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) +struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) { unsigned int index; @@ -735,7 +740,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) index = fls(size - 1); } - return kmalloc_caches[kmalloc_type(flags)][index]; + return kmalloc_caches[kmalloc_type(flags, caller)][index]; } size_t kmalloc_size_roundup(size_t size) @@ -752,8 +757,11 @@ size_t kmalloc_size_roundup(size_t size) if (size > KMALLOC_MAX_CACHE_SIZE) return PAGE_SIZE << get_order(size); - /* The flags don't matter since size_index is common to all. */ - c = kmalloc_slab(size, GFP_KERNEL); + /* + * The flags don't matter since size_index is common to all. + * Neither does the caller for just getting ->object_size. + */ + c = kmalloc_slab(size, GFP_KERNEL, 0); return c ? c->object_size : 0; } EXPORT_SYMBOL(kmalloc_size_roundup); @@ -776,12 +784,35 @@ EXPORT_SYMBOL(kmalloc_size_roundup); #define KMALLOC_RCL_NAME(sz) #endif +#ifdef CONFIG_RANDOM_KMALLOC_CACHES +#define __KMALLOC_RANDOM_CONCAT(a, b) a ## b +#define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz) +#define KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 1] = "kmalloc-rnd-01-" #sz, +#define KMA_RAND_2(sz) KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 2] = "kmalloc-rnd-02-" #sz, +#define KMA_RAND_3(sz) KMA_RAND_2(sz) .name[KMALLOC_RANDOM_START + 3] = "kmalloc-rnd-03-" #sz, +#define KMA_RAND_4(sz) KMA_RAND_3(sz) .name[KMALLOC_RANDOM_START + 4] = "kmalloc-rnd-04-" #sz, +#define KMA_RAND_5(sz) KMA_RAND_4(sz) .name[KMALLOC_RANDOM_START + 5] = "kmalloc-rnd-05-" #sz, +#define KMA_RAND_6(sz) KMA_RAND_5(sz) .name[KMALLOC_RANDOM_START + 6] = "kmalloc-rnd-06-" #sz, +#define KMA_RAND_7(sz) KMA_RAND_6(sz) .name[KMALLOC_RANDOM_START + 7] = "kmalloc-rnd-07-" #sz, +#define KMA_RAND_8(sz) KMA_RAND_7(sz) .name[KMALLOC_RANDOM_START + 8] = "kmalloc-rnd-08-" #sz, +#define KMA_RAND_9(sz) KMA_RAND_8(sz) .name[KMALLOC_RANDOM_START + 9] = "kmalloc-rnd-09-" #sz, +#define KMA_RAND_10(sz) KMA_RAND_9(sz) .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz, +#define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz, +#define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz, +#define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz, +#define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz, +#define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz, +#else // CONFIG_RANDOM_KMALLOC_CACHES +#define KMALLOC_RANDOM_NAME(N, sz) +#endif + #define INIT_KMALLOC_INFO(__size, __short_size) \ { \ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ KMALLOC_RCL_NAME(__short_size) \ KMALLOC_CGROUP_NAME(__short_size) \ KMALLOC_DMA_NAME(__short_size) \ + KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size) \ .size = __size, \ } @@ -864,10 +895,9 @@ void __init setup_kmalloc_cache_index_table(void) static unsigned int __kmalloc_minalign(void) { -#ifdef CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC - if (io_tlb_default_mem.nslabs) + if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && + is_swiotlb_allocated()) return ARCH_KMALLOC_MINALIGN; -#endif return dma_get_cache_alignment(); } @@ -890,6 +920,11 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) flags |= SLAB_CACHE_DMA; } +#ifdef CONFIG_RANDOM_KMALLOC_CACHES + if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END) + flags |= SLAB_NO_MERGE; +#endif + /* * If CONFIG_MEMCG_KMEM is enabled, disable cache merging for * KMALLOC_NORMAL caches. @@ -941,6 +976,9 @@ void __init create_kmalloc_caches(slab_flags_t flags) new_kmalloc_cache(2, type, flags); } } +#ifdef CONFIG_RANDOM_KMALLOC_CACHES + random_kmalloc_seed = get_random_u64(); +#endif /* Kmalloc array is now usable */ slab_state = UP; @@ -976,7 +1014,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller return ret; } - s = kmalloc_slab(size, flags); + s = kmalloc_slab(size, flags, caller); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; diff --git a/mm/slub.c b/mm/slub.c index e3b5d5c0eb3a..f7940048138c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -361,43 +361,51 @@ static struct workqueue_struct *flushwq; *******************************************************************/ /* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + +/* * Returns freelist pointer (ptr). With hardening, this is obfuscated * with an XOR of the address where the pointer is held and a per-cache * random number. */ -static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, - unsigned long ptr_addr) +static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s, + void *ptr, unsigned long ptr_addr) { + unsigned long encoded; + #ifdef CONFIG_SLAB_FREELIST_HARDENED - /* - * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged. - * Normally, this doesn't cause any issues, as both set_freepointer() - * and get_freepointer() are called with a pointer with the same tag. - * However, there are some issues with CONFIG_SLUB_DEBUG code. For - * example, when __free_slub() iterates over objects in a cache, it - * passes untagged pointers to check_object(). check_object() in turns - * calls get_freepointer() with an untagged pointer, which causes the - * freepointer to be restored incorrectly. - */ - return (void *)((unsigned long)ptr ^ s->random ^ - swab((unsigned long)kasan_reset_tag((void *)ptr_addr))); + encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr); #else - return ptr; + encoded = (unsigned long)ptr; #endif + return (freeptr_t){.v = encoded}; } -/* Returns the freelist pointer recorded at location ptr_addr. */ -static inline void *freelist_dereference(const struct kmem_cache *s, - void *ptr_addr) +static inline void *freelist_ptr_decode(const struct kmem_cache *s, + freeptr_t ptr, unsigned long ptr_addr) { - return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr), - (unsigned long)ptr_addr); + void *decoded; + +#ifdef CONFIG_SLAB_FREELIST_HARDENED + decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr)); +#else + decoded = (void *)ptr.v; +#endif + return decoded; } static inline void *get_freepointer(struct kmem_cache *s, void *object) { + unsigned long ptr_addr; + freeptr_t p; + object = kasan_reset_tag(object); - return freelist_dereference(s, object + s->offset); + ptr_addr = (unsigned long)object + s->offset; + p = *(freeptr_t *)(ptr_addr); + return freelist_ptr_decode(s, p, ptr_addr); } #ifndef CONFIG_SLUB_TINY @@ -421,15 +429,15 @@ __no_kmsan_checks static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { unsigned long freepointer_addr; - void *p; + freeptr_t p; if (!debug_pagealloc_enabled_static()) return get_freepointer(s, object); object = kasan_reset_tag(object); freepointer_addr = (unsigned long)object + s->offset; - copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p)); - return freelist_ptr(s, p, freepointer_addr); + copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p)); + return freelist_ptr_decode(s, p, freepointer_addr); } static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) @@ -441,7 +449,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) #endif freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr); - *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); + *(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr); } /* Loop over all objects in a slab */ diff --git a/mm/truncate.c b/mm/truncate.c index bd4fafd67f95..8e3aa9e8618e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -655,11 +655,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } folio_lock(folio); - VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); - if (folio->mapping != mapping) { + if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } + VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); if (folio_mapped(folio)) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0fc69efa4f1f..96d9eae5c7cc 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -86,7 +86,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, if (page_in_cache && !vm_shared) writable = false; if (writable) - _dst_pte = pte_mkwrite(_dst_pte); + _dst_pte = pte_mkwrite(_dst_pte, dst_vma); if (flags & MFILL_ATOMIC_WP) _dst_pte = pte_mkuffd_wp(_dst_pte); diff --git a/mm/util.c b/mm/util.c index f31e2ca62cfa..f08b655da917 100644 --- a/mm/util.c +++ b/mm/util.c @@ -396,7 +396,10 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlim_stack->rlim_cur == RLIM_INFINITY) + /* On parisc the stack always grows up - so a unlimited stack should + * not be an indicator to use the legacy memory layout. */ + if (rlim_stack->rlim_cur == RLIM_INFINITY && + !IS_ENABLED(CONFIG_STACK_GROWSUP)) return 1; return sysctl_legacy_va_layout; @@ -540,7 +543,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate, + ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate, &uf); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index b52644771cc4..22c6689d9302 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -244,6 +244,14 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, if (mem_cgroup_disabled()) return; + /* + * The in-kernel users only care about the reclaim efficiency + * for this @memcg rather than the whole subtree, and there + * isn't and won't be any in-kernel user in a legacy cgroup. + */ + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !tree) + return; + vmpr = memcg_to_vmpressure(memcg); /* |