summary | refs | log | tree | commit | diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 17
-rw-r--r-- mm/Makefile 2
-rw-r--r-- mm/cma.c 10
-rw-r--r-- mm/debug_vm_pgtable.c 12
-rw-r--r-- mm/filemap.c 65
-rw-r--r-- mm/folio-compat.c 2
-rw-r--r-- mm/gup.c 2
-rw-r--r-- mm/huge_memory.c 17
-rw-r--r-- mm/internal.h 4
-rw-r--r-- mm/kfence/kfence_test.c 7
-rw-r--r-- mm/kfence/report.c 3
-rw-r--r-- mm/khugepaged.c 13
-rw-r--r-- mm/memory.c 13
-rw-r--r-- mm/migrate.c 6
-rw-r--r-- mm/migrate_device.c 2
-rw-r--r-- mm/mmap.c 14
-rw-r--r-- mm/mprotect.c 2
-rw-r--r-- mm/nommu.c 4
-rw-r--r-- mm/page-writeback.c 49
-rw-r--r-- mm/pagewalk.c 2
-rw-r--r-- mm/percpu.c 69
-rw-r--r-- mm/readahead.c 13
-rw-r--r-- mm/shmem.c 851
-rw-r--r-- mm/shmem_quota.c 350
-rw-r--r-- mm/slab.c 2
-rw-r--r-- mm/slab.h 2
-rw-r--r-- mm/slab_common.c 54
-rw-r--r-- mm/slub.c 58
-rw-r--r-- mm/truncate.c 4
-rw-r--r-- mm/userfaultfd.c 2
-rw-r--r-- mm/util.c 7
-rw-r--r-- mm/vmpressure.c 8
32 files changed, 1220 insertions, 446 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 721dc88423c7..264a2df5ecf5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -336,6 +336,23 @@ config SLUB_CPU_PARTIAL
which requires the taking of locks that may cause latency spikes.
Typically one would choose no for a realtime system.
+config RANDOM_KMALLOC_CACHES
+ default n
+ depends on SLUB && !SLUB_TINY
+ bool "Randomize slab caches for normal kmalloc"
+ help
+ A hardening feature that creates multiple copies of slab caches for
+ normal kmalloc allocation and makes kmalloc randomly pick one based
+ on code address, which makes it more difficult for attackers to spray
+ vulnerable memory objects on the heap for the purpose of exploiting
+ memory vulnerabilities.
+
+ Currently the number of copies is set to 16, a reasonably large value
+ that effectively diverges the memory objects allocated for different
+ subsystems or modules into different caches, at the expense of a
+ limited degree of memory and CPU overhead that relates to hardware and
+ system workload.
+
endmenu # SLAB allocator options
config SHUFFLE_PAGE_ALLOCATOR
diff --git a/mm/Makefile b/mm/Makefile
index e6d9a1d5e84d..ec65984e2ade 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,7 +51,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
- compaction.o show_mem.o\
+ compaction.o show_mem.o shmem_quota.o\
interval_tree.o list_lru.o workingset.o \
debug.o gup.o mmap_lock.o $(mmu-y)
diff --git a/mm/cma.c b/mm/cma.c
index 4880f72102fa..da2967c6a223 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -267,6 +267,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
if (alignment && !is_power_of_2(alignment))
return -EINVAL;
+ if (!IS_ENABLED(CONFIG_NUMA))
+ nid = NUMA_NO_NODE;
+
/* Sanitise input arguments. */
alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES);
if (fixed && base & (alignment - 1)) {
@@ -372,14 +375,15 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
if (ret)
goto free_mem;
- pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
- &base);
+ pr_info("Reserved %ld MiB at %pa on node %d\n", (unsigned long)size / SZ_1M,
+ &base, nid);
return 0;
free_mem:
memblock_phys_free(base, size);
err:
- pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
+ pr_err("Failed to reserve %ld MiB on node %d\n", (unsigned long)size / SZ_1M,
+ nid);
return ret;
}
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index d61eaa075c75..48e329ea5ba3 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -109,10 +109,10 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
WARN_ON(!pte_same(pte, pte));
WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
- WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte))));
+ WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte), args->vma)));
WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte))));
WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte))));
- WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
+ WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte, args->vma))));
WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte))));
WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte))));
}
@@ -156,7 +156,7 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args)
pte = pte_mkclean(pte);
set_pte_at(args->mm, args->vaddr, args->ptep, pte);
flush_dcache_page(page);
- pte = pte_mkwrite(pte);
+ pte = pte_mkwrite(pte, args->vma);
pte = pte_mkdirty(pte);
ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1);
pte = ptep_get(args->ptep);
@@ -202,10 +202,10 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx)
WARN_ON(!pmd_same(pmd, pmd));
WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
- WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd))));
+ WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd), args->vma)));
WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd))));
WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd))));
- WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd))));
+ WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd, args->vma))));
WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd))));
WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd))));
/*
@@ -256,7 +256,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args)
pmd = pmd_mkclean(pmd);
set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
flush_dcache_page(page);
- pmd = pmd_mkwrite(pmd);
+ pmd = pmd_mkwrite(pmd, args->vma);
pmd = pmd_mkdirty(pmd);
pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1);
pmd = READ_ONCE(*args->pmdp);
diff --git a/mm/filemap.c b/mm/filemap.c
index 014b73eb96a1..bf6219d9aaac 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1856,30 +1856,15 @@ out:
*
* Looks up the page cache entry at @mapping & @index.
*
- * @fgp_flags can be zero or more of these flags:
- *
- * * %FGP_ACCESSED - The folio will be marked accessed.
- * * %FGP_LOCK - The folio is returned locked.
- * * %FGP_CREAT - If no page is present then a new page is allocated using
- * @gfp and added to the page cache and the VM's LRU list.
- * The page is returned locked and with an increased refcount.
- * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
- * page is already in cache. If the page was allocated, unlock it before
- * returning so the caller can do the same dance.
- * * %FGP_WRITE - The page will be written to by the caller.
- * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
- * * %FGP_NOWAIT - Don't get blocked by page lock.
- * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
- *
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
*
- * If there is a page cache page, it is returned with an increased refcount.
+ * If this function returns a folio, it is returned with an increased refcount.
*
* Return: The found folio or an ERR_PTR() otherwise.
*/
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp)
+ fgf_t fgp_flags, gfp_t gfp)
{
struct folio *folio;
@@ -1921,7 +1906,9 @@ repeat:
folio_wait_stable(folio);
no_page:
if (!folio && (fgp_flags & FGP_CREAT)) {
+ unsigned order = FGF_GET_ORDER(fgp_flags);
int err;
+
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
@@ -1930,26 +1917,44 @@ no_page:
gfp &= ~GFP_KERNEL;
gfp |= GFP_NOWAIT | __GFP_NOWARN;
}
-
- folio = filemap_alloc_folio(gfp, 0);
- if (!folio)
- return ERR_PTR(-ENOMEM);
-
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
- /* Init accessed so avoid atomic mark_page_accessed later */
- if (fgp_flags & FGP_ACCESSED)
- __folio_set_referenced(folio);
+ if (!mapping_large_folio_support(mapping))
+ order = 0;
+ if (order > MAX_PAGECACHE_ORDER)
+ order = MAX_PAGECACHE_ORDER;
+ /* If we're not aligned, allocate a smaller folio */
+ if (index & ((1UL << order) - 1))
+ order = __ffs(index);
- err = filemap_add_folio(mapping, folio, index, gfp);
- if (unlikely(err)) {
+ do {
+ gfp_t alloc_gfp = gfp;
+
+ err = -ENOMEM;
+ if (order == 1)
+ order = 0;
+ if (order > 0)
+ alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ folio = filemap_alloc_folio(alloc_gfp, order);
+ if (!folio)
+ continue;
+
+ /* Init accessed so as to avoid an atomic mark_page_accessed later */
+ if (fgp_flags & FGP_ACCESSED)
+ __folio_set_referenced(folio);
+
+ err = filemap_add_folio(mapping, folio, index, gfp);
+ if (!err)
+ break;
folio_put(folio);
folio = NULL;
- if (err == -EEXIST)
- goto repeat;
- }
+ } while (order-- > 0);
+ if (err == -EEXIST)
+ goto repeat;
+ if (err)
+ return ERR_PTR(err);
/*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index c6f056c20503..10c3247542cb 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(add_to_page_cache_lru);
noinline
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp)
+ fgf_t fgp_flags, gfp_t gfp)
{
struct folio *folio;
diff --git a/mm/gup.c b/mm/gup.c
index 948f3b454b00..2f8a2d89fde1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1051,7 +1051,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
!writable_file_mapping_allowed(vma, gup_flags))
return -EFAULT;
- if (!(vm_flags & VM_WRITE)) {
+ if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
/* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c4635f750255..064fbd90822b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -551,7 +551,7 @@ __setup("transparent_hugepage=", setup_transparent_hugepage);
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
- pmd = pmd_mkwrite(pmd);
+ pmd = pmd_mkwrite(pmd, vma);
return pmd;
}
@@ -1566,7 +1566,7 @@ out_map:
pmd = pmd_modify(oldpmd, vma->vm_page_prot);
pmd = pmd_mkyoung(pmd);
if (writable)
- pmd = pmd_mkwrite(pmd);
+ pmd = pmd_mkwrite(pmd, vma);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
@@ -1675,6 +1675,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
*/
orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
tlb->fullmm);
+ arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
@@ -1919,7 +1920,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
/* See change_pte_range(). */
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
can_change_pmd_writable(vma, addr, entry))
- entry = pmd_mkwrite(entry);
+ entry = pmd_mkwrite(entry, vma);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
@@ -2233,7 +2234,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
if (write)
- entry = pte_mkwrite(entry);
+ entry = pte_mkwrite(entry, vma);
if (anon_exclusive)
SetPageAnonExclusive(page + i);
if (!young)
@@ -2501,7 +2502,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
unsigned int nr = thp_nr_pages(head);
- int i;
+ int i, nr_dropped = 0;
/* complete memcg works before add pages to LRU */
split_page_memcg(head, nr);
@@ -2524,7 +2525,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct folio *tail = page_folio(head + i);
if (shmem_mapping(head->mapping))
- shmem_uncharge(head->mapping->host, 1);
+ nr_dropped++;
else if (folio_test_clear_dirty(tail))
folio_account_cleaned(tail,
inode_to_wb(folio->mapping->host));
@@ -2561,6 +2562,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
local_irq_enable();
+ if (nr_dropped)
+ shmem_uncharge(head->mapping->host, nr_dropped);
remap_page(folio, nr);
if (folio_test_swapcache(folio))
@@ -3263,7 +3266,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
- pmde = pmd_mkwrite(pmde);
+ pmde = pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_mkuffd_wp(pmde);
if (!is_migration_entry_young(entry))
diff --git a/mm/internal.h b/mm/internal.h
index d1d4bf4e63c0..30cf724ddbce 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -556,14 +556,14 @@ static inline bool is_exec_mapping(vm_flags_t flags)
}
/*
- * Stack area - automatically grows in one direction
+ * Stack area (including shadow stacks)
*
* VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
* do_mmap() forbids all other combinations.
*/
static inline bool is_stack_mapping(vm_flags_t flags)
{
- return (flags & VM_STACK) == VM_STACK;
+ return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
}
/*
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 9e008a336d9f..95b2b84c296d 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -212,7 +212,9 @@ static void test_cache_destroy(void)
static inline size_t kmalloc_cache_alignment(size_t size)
{
- return kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]->align;
+ /* just to get ->align so no need to pass in the real caller */
+ enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, 0);
+ return kmalloc_caches[type][__kmalloc_index(size, false)]->align;
}
/* Must always inline to match stack trace against caller. */
@@ -282,8 +284,9 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
if (is_kfence_address(alloc)) {
struct slab *slab = virt_to_slab(alloc);
+ enum kmalloc_cache_type type = kmalloc_type(GFP_KERNEL, _RET_IP_);
struct kmem_cache *s = test_cache ?:
- kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)];
+ kmalloc_caches[type][__kmalloc_index(size, false)];
/*
* Verify that various helpers return the right values
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 197430a5be4a..c509aed326ce 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -13,6 +13,7 @@
#include <linux/printk.h>
#include <linux/sched/debug.h>
#include <linux/seq_file.h>
+#include <linux/sprintf.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
#include <trace/events/error_report.h>
@@ -26,8 +27,6 @@
#define ARCH_FUNC_PREFIX ""
#endif
-extern bool no_hash_pointers;
-
/* Helper function to either print to a seq_file or to console. */
__printf(2, 3)
static void seq_con_printf(struct seq_file *seq, const char *fmt, ...)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d5650541083a..88433cc25d8a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1848,10 +1848,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
goto xa_locked;
}
}
- if (!shmem_charge(mapping->host, 1)) {
- result = SCAN_FAIL;
- goto xa_locked;
- }
nr_none++;
continue;
}
@@ -2037,8 +2033,13 @@ xa_unlocked:
*/
try_to_unmap_flush();
- if (result != SCAN_SUCCEED)
+ if (result == SCAN_SUCCEED && nr_none &&
+ !shmem_charge(mapping->host, nr_none))
+ result = SCAN_FAIL;
+ if (result != SCAN_SUCCEED) {
+ nr_none = 0;
goto rollback;
+ }
/*
* The old pages are locked, so they won't change anymore.
@@ -2177,8 +2178,8 @@ rollback:
if (nr_none) {
xas_lock_irq(&xas);
mapping->nrpages -= nr_none;
- shmem_uncharge(mapping->host, nr_none);
xas_unlock_irq(&xas);
+ shmem_uncharge(mapping->host, nr_none);
}
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
diff --git a/mm/memory.c b/mm/memory.c
index 00a5ce113090..6c264d2f969c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1430,6 +1430,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
continue;
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
+ arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
zap_install_uffd_wp_if_needed(vma, addr, pte, details,
ptent);
@@ -4124,7 +4125,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
entry = mk_pte(&folio->page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
- entry = pte_mkwrite(pte_mkdirty(entry));
+ entry = pte_mkwrite(pte_mkdirty(entry), vma);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
@@ -4842,7 +4843,7 @@ out_map:
pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (writable)
- pte = pte_mkwrite(pte);
+ pte = pte_mkwrite(pte, vma);
ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -5307,11 +5308,8 @@ EXPORT_SYMBOL_GPL(handle_mm_fault);
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
- /* Even if this succeeds, make it clear we *might* have slept */
- if (likely(mmap_read_trylock(mm))) {
- might_sleep();
+ if (likely(mmap_read_trylock(mm)))
return true;
- }
if (regs && !user_mode(regs)) {
unsigned long ip = instruction_pointer(regs);
@@ -5744,6 +5742,9 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
if (mmap_read_lock_killable(mm))
return 0;
+ /* Untag the address before looking up the VMA */
+ addr = untagged_addr_remote(mm, addr);
+
/* Avoid triggering the temporary warning in __get_user_pages */
if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index e21d5a7e7447..b7fa020003f3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -220,7 +220,7 @@ static bool remove_migration_pte(struct folio *folio,
if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
pte = pte_mkdirty(pte);
if (is_writable_migration_entry(entry))
- pte = pte_mkwrite(pte);
+ pte = pte_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(old_pte))
pte = pte_mkuffd_wp(pte);
@@ -684,7 +684,7 @@ int migrate_folio(struct address_space *mapping, struct folio *dst,
}
EXPORT_SYMBOL(migrate_folio);
-#ifdef CONFIG_BLOCK
+#ifdef CONFIG_BUFFER_HEAD
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
enum migrate_mode mode)
@@ -837,7 +837,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping,
return __buffer_migrate_folio(mapping, dst, src, mode, true);
}
EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
-#endif
+#endif /* CONFIG_BUFFER_HEAD */
int filemap_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index d69131adc51c..8ac1f79f754a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -624,7 +624,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
}
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
- entry = pte_mkwrite(pte_mkdirty(entry));
+ entry = pte_mkwrite(pte_mkdirty(entry), vma);
}
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
diff --git a/mm/mmap.c b/mm/mmap.c
index 514ced13c65c..b56a7f0c9f85 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1182,11 +1182,11 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
- unsigned long flags, unsigned long pgoff,
- unsigned long *populate, struct list_head *uf)
+ unsigned long flags, vm_flags_t vm_flags,
+ unsigned long pgoff, unsigned long *populate,
+ struct list_head *uf)
{
struct mm_struct *mm = current->mm;
- vm_flags_t vm_flags;
int pkey = 0;
*populate = 0;
@@ -1246,7 +1246,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
@@ -1564,7 +1564,7 @@ retry:
gap = mas.index;
gap += (info->align_offset - gap) & info->align_mask;
tmp = mas_next(&mas, ULONG_MAX);
- if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */
+ if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
if (vm_start_gap(tmp) < gap + length - 1) {
low_limit = tmp->vm_end;
mas_reset(&mas);
@@ -1616,7 +1616,7 @@ retry:
gap -= (gap - info->align_offset) & info->align_mask;
gap_end = mas.last;
tmp = mas_next(&mas, ULONG_MAX);
- if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */
+ if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
if (vm_start_gap(tmp) <= gap_end) {
high_limit = vm_start_gap(tmp);
mas_reset(&mas);
@@ -2998,7 +2998,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
file = get_file(vma->vm_file);
ret = do_mmap(vma->vm_file, start, size,
- prot, flags, pgoff, &populate, NULL);
+ prot, flags, 0, pgoff, &populate, NULL);
fput(file);
out:
mmap_write_unlock(mm);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 130db91d3a8c..b94fbb45d5c7 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -185,7 +185,7 @@ static long change_pte_range(struct mmu_gather *tlb,
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
!pte_write(ptent) &&
can_change_pte_writable(vma, addr, ptent))
- ptent = pte_mkwrite(ptent);
+ ptent = pte_mkwrite(ptent, vma);
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
if (pte_needs_flush(oldpte, ptent))
diff --git a/mm/nommu.c b/mm/nommu.c
index 8dba41cfc44d..7f9e9e5a0e12 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1016,6 +1016,7 @@ unsigned long do_mmap(struct file *file,
unsigned long len,
unsigned long prot,
unsigned long flags,
+ vm_flags_t vm_flags,
unsigned long pgoff,
unsigned long *populate,
struct list_head *uf)
@@ -1023,7 +1024,6 @@ unsigned long do_mmap(struct file *file,
struct vm_area_struct *vma;
struct vm_region *region;
struct rb_node *rb;
- vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
VMA_ITERATOR(vmi, current->mm, 0);
@@ -1043,7 +1043,7 @@ unsigned long do_mmap(struct file *file,
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
- vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+ vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
/* we're going to need to record the mapping */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d3f42009bb70..b8d3d7040a50 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1193,7 +1193,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
* write_bandwidth = ---------------------------------------------------
* period
*
- * @written may have decreased due to folio_account_redirty().
+ * @written may have decreased due to folio_redirty_for_writepage().
* Avoid underflowing @bw calculation.
*/
bw = written - min(written, wb->written_stamp);
@@ -2712,37 +2712,6 @@ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
EXPORT_SYMBOL(filemap_dirty_folio);
/**
- * folio_account_redirty - Manually account for redirtying a page.
- * @folio: The folio which is being redirtied.
- *
- * Most filesystems should call folio_redirty_for_writepage() instead
- * of this fuction. If your filesystem is doing writeback outside the
- * context of a writeback_control(), it can call this when redirtying
- * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED,
- * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN,
- * WB_WRITTEN) in long term. The mismatches will lead to systematic errors
- * in balanced_dirty_ratelimit and the dirty pages position control.
- */
-void folio_account_redirty(struct folio *folio)
-{
- struct address_space *mapping = folio->mapping;
-
- if (mapping && mapping_can_writeback(mapping)) {
- struct inode *inode = mapping->host;
- struct bdi_writeback *wb;
- struct wb_lock_cookie cookie = {};
- long nr = folio_nr_pages(folio);
-
- wb = unlocked_inode_to_wb_begin(inode, &cookie);
- current->nr_dirtied -= nr;
- node_stat_mod_folio(folio, NR_DIRTIED, -nr);
- wb_stat_mod(wb, WB_DIRTIED, -nr);
- unlocked_inode_to_wb_end(inode, &cookie);
- }
-}
-EXPORT_SYMBOL(folio_account_redirty);
-
-/**
* folio_redirty_for_writepage - Decline to write a dirty folio.
* @wbc: The writeback control.
* @folio: The folio.
@@ -2757,13 +2726,23 @@ EXPORT_SYMBOL(folio_account_redirty);
bool folio_redirty_for_writepage(struct writeback_control *wbc,
struct folio *folio)
{
- bool ret;
+ struct address_space *mapping = folio->mapping;
long nr = folio_nr_pages(folio);
+ bool ret;
wbc->pages_skipped += nr;
- ret = filemap_dirty_folio(folio->mapping, folio);
- folio_account_redirty(folio);
+ ret = filemap_dirty_folio(mapping, folio);
+ if (mapping && mapping_can_writeback(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct wb_lock_cookie cookie = {};
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
+ current->nr_dirtied -= nr;
+ node_stat_mod_folio(folio, NR_DIRTIED, -nr);
+ wb_stat_mod(wb, WB_DIRTIED, -nr);
+ unlocked_inode_to_wb_end(inode, &cookie);
+ }
return ret;
}
EXPORT_SYMBOL(folio_redirty_for_writepage);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 9b2d23fbf4d3..b7d7e4fcfad7 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte = pte_offset_map(pmd, addr);
if (pte) {
err = walk_pte_range_inner(pte, addr, end, walk);
- if (walk->mm != &init_mm)
+ if (walk->mm != &init_mm && addr < TASK_SIZE)
pte_unmap(pte);
}
} else {
diff --git a/mm/percpu.c b/mm/percpu.c
index 28e07ede46f6..a7665de8485f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1890,13 +1890,15 @@ fail_unlock:
fail:
trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
- if (!is_atomic && do_warn && warn_limit) {
+ if (do_warn && warn_limit) {
pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
size, align, is_atomic, err);
- dump_stack();
+ if (!is_atomic)
+ dump_stack();
if (!--warn_limit)
pr_info("limit reached, disable warning\n");
}
+
if (is_atomic) {
/* see the flag handling in pcpu_balance_workfn() */
pcpu_atomic_alloc_failed = true;
@@ -2581,14 +2583,12 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
{
size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
size_t static_size, dyn_size;
- struct pcpu_chunk *chunk;
unsigned long *group_offsets;
size_t *group_sizes;
unsigned long *unit_off;
unsigned int cpu;
int *unit_map;
int group, unit, i;
- int map_size;
unsigned long tmp_addr;
size_t alloc_size;
@@ -2615,7 +2615,6 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
- PCPU_SETUP_BUG_ON(!ai->dyn_size);
PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
@@ -2698,7 +2697,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_atom_size = ai->atom_size;
- pcpu_chunk_struct_size = struct_size(chunk, populated,
+ pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
BITS_TO_LONGS(pcpu_unit_pages));
pcpu_stats_save_ai(ai);
@@ -2735,29 +2734,23 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
dyn_size = ai->dyn_size - (static_size - ai->static_size);
/*
- * Initialize first chunk.
- * If the reserved_size is non-zero, this initializes the reserved
- * chunk. If the reserved_size is zero, the reserved chunk is NULL
- * and the dynamic region is initialized here. The first chunk,
- * pcpu_first_chunk, will always point to the chunk that serves
- * the dynamic region.
+ * Initialize first chunk:
+ * This chunk is broken up into 3 parts:
+ * < static | [reserved] | dynamic >
+ * - static - there is no backing chunk because these allocations can
+ * never be freed.
+ * - reserved (pcpu_reserved_chunk) - exists primarily to serve
+ * allocations from module load.
+ * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
+ * chunk.
*/
tmp_addr = (unsigned long)base_addr + static_size;
- map_size = ai->reserved_size ?: dyn_size;
- chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
-
- /* init dynamic chunk if necessary */
- if (ai->reserved_size) {
- pcpu_reserved_chunk = chunk;
+ if (ai->reserved_size)
+ pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
+ ai->reserved_size);
+ tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
+ pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);
- tmp_addr = (unsigned long)base_addr + static_size +
- ai->reserved_size;
- map_size = dyn_size;
- chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
- }
-
- /* link the first chunk in */
- pcpu_first_chunk = chunk;
pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
@@ -3189,32 +3182,26 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
pmd_t *pmd;
if (pgd_none(*pgd)) {
- p4d_t *new;
-
- new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
- if (!new)
+ p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
+ if (!p4d)
goto err_alloc;
- pgd_populate(&init_mm, pgd, new);
+ pgd_populate(&init_mm, pgd, p4d);
}
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
- pud_t *new;
-
- new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
- if (!new)
+ pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+ if (!pud)
goto err_alloc;
- p4d_populate(&init_mm, p4d, new);
+ p4d_populate(&init_mm, p4d, pud);
}
pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
- pmd_t *new;
-
- new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
- if (!new)
+ pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
+ if (!pmd)
goto err_alloc;
- pud_populate(&init_mm, pud, new);
+ pud_populate(&init_mm, pud, pmd);
}
pmd = pmd_offset(pud, addr);
diff --git a/mm/readahead.c b/mm/readahead.c
index a9c999aa19af..e815c114de21 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -461,19 +461,6 @@ static int try_context_readahead(struct address_space *mapping,
return 1;
}
-/*
- * There are some parts of the kernel which assume that PMD entries
- * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
- * limit the maximum allocation order to PMD size. I'm not aware of any
- * assumptions about maximum order if THP are disabled, but 8 seems like
- * a good order (that's 1MB if you're using 4kB pages)
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
-#else
-#define MAX_PAGECACHE_ORDER 8
-#endif
-
static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
pgoff_t mark, unsigned int order, gfp_t gfp)
{
diff --git a/mm/shmem.c b/mm/shmem.c
index 980289be5f63..02e62fccc80d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -78,6 +78,7 @@ static struct vfsmount *shm_mnt;
#include <uapi/linux/memfd.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
+#include <linux/quotaops.h>
#include <linux/uaccess.h>
@@ -89,6 +90,9 @@ static struct vfsmount *shm_mnt;
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
+/* Pretend that one inode + its dentry occupy this much memory */
+#define BOGO_INODE_SIZE 1024
+
/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
@@ -116,11 +120,14 @@ struct shmem_options {
int huge;
int seen;
bool noswap;
+ unsigned short quota_types;
+ struct shmem_quota_limits qlimits;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_NOSWAP 16
+#define SHMEM_SEEN_QUOTA 32
};
#ifdef CONFIG_TMPFS
@@ -133,7 +140,8 @@ static unsigned long shmem_default_max_inodes(void)
{
unsigned long nr_pages = totalram_pages();
- return min(nr_pages - totalhigh_pages(), nr_pages / 2);
+ return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
+ ULONG_MAX / BOGO_INODE_SIZE);
}
#endif
@@ -199,33 +207,47 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
-static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
+static int shmem_inode_acct_block(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ int err = -ENOSPC;
if (shmem_acct_block(info->flags, pages))
- return false;
+ return err;
+ might_sleep(); /* when quotas */
if (sbinfo->max_blocks) {
if (percpu_counter_compare(&sbinfo->used_blocks,
sbinfo->max_blocks - pages) > 0)
goto unacct;
+
+ err = dquot_alloc_block_nodirty(inode, pages);
+ if (err)
+ goto unacct;
+
percpu_counter_add(&sbinfo->used_blocks, pages);
+ } else {
+ err = dquot_alloc_block_nodirty(inode, pages);
+ if (err)
+ goto unacct;
}
- return true;
+ return 0;
unacct:
shmem_unacct_blocks(info->flags, pages);
- return false;
+ return err;
}
-static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
+static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ might_sleep(); /* when quotas */
+ dquot_free_block_nodirty(inode, pages);
+
if (sbinfo->max_blocks)
percpu_counter_sub(&sbinfo->used_blocks, pages);
shmem_unacct_blocks(info->flags, pages);
@@ -254,6 +276,47 @@ bool vma_is_shmem(struct vm_area_struct *vma)
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
+#ifdef CONFIG_TMPFS_QUOTA
+
+static int shmem_enable_quotas(struct super_block *sb,
+ unsigned short quota_types)
+{
+ int type, err = 0;
+
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
+ for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
+ if (!(quota_types & (1 << type)))
+ continue;
+ err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
+ DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED);
+ if (err)
+ goto out_err;
+ }
+ return 0;
+
+out_err:
+ pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
+ type, err);
+ for (type--; type >= 0; type--)
+ dquot_quota_off(sb, type);
+ return err;
+}
+
+static void shmem_disable_quotas(struct super_block *sb)
+{
+ int type;
+
+ for (type = 0; type < SHMEM_MAXQUOTAS; type++)
+ dquot_quota_off(sb, type);
+}
+
+static struct dquot **shmem_get_dquots(struct inode *inode)
+{
+ return SHMEM_I(inode)->i_dquot;
+}
+#endif /* CONFIG_TMPFS_QUOTA */
+
/*
* shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
* produces a novel ino for the newly allocated inode.
@@ -272,11 +335,11 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
if (!(sb->s_flags & SB_KERNMOUNT)) {
raw_spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_inodes) {
- if (!sbinfo->free_inodes) {
+ if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
raw_spin_unlock(&sbinfo->stat_lock);
return -ENOSPC;
}
- sbinfo->free_inodes--;
+ sbinfo->free_ispace -= BOGO_INODE_SIZE;
}
if (inop) {
ino = sbinfo->next_ino++;
@@ -330,12 +393,12 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
return 0;
}
-static void shmem_free_inode(struct super_block *sb)
+static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
if (sbinfo->max_inodes) {
raw_spin_lock(&sbinfo->stat_lock);
- sbinfo->free_inodes++;
+ sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
raw_spin_unlock(&sbinfo->stat_lock);
}
}
@@ -343,62 +406,65 @@ static void shmem_free_inode(struct super_block *sb)
/**
* shmem_recalc_inode - recalculate the block usage of an inode
* @inode: inode to recalc
+ * @alloced: the change in number of pages allocated to inode
+ * @swapped: the change in number of pages swapped from inode
*
* We have to calculate the free blocks since the mm can drop
* undirtied hole pages behind our back.
*
* But normally info->alloced == inode->i_mapping->nrpages + info->swapped
* So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
- *
- * It has to be called with the spinlock held.
*/
-static void shmem_recalc_inode(struct inode *inode)
+static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
struct shmem_inode_info *info = SHMEM_I(inode);
long freed;
- freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
- if (freed > 0) {
+ spin_lock(&info->lock);
+ info->alloced += alloced;
+ info->swapped += swapped;
+ freed = info->alloced - info->swapped -
+ READ_ONCE(inode->i_mapping->nrpages);
+ /*
+ * Special case: whereas normally shmem_recalc_inode() is called
+ * after i_mapping->nrpages has already been adjusted (up or down),
+ * shmem_writepage() has to raise swapped before nrpages is lowered -
+ * to stop a racing shmem_recalc_inode() from thinking that a page has
+ * been freed. Compensate here, to avoid the need for a followup call.
+ */
+ if (swapped > 0)
+ freed += swapped;
+ if (freed > 0)
info->alloced -= freed;
- inode->i_blocks -= freed * BLOCKS_PER_PAGE;
+ spin_unlock(&info->lock);
+
+ /* The quota case may block */
+ if (freed > 0)
shmem_inode_unacct_blocks(inode, freed);
- }
}
bool shmem_charge(struct inode *inode, long pages)
{
- struct shmem_inode_info *info = SHMEM_I(inode);
- unsigned long flags;
+ struct address_space *mapping = inode->i_mapping;
- if (!shmem_inode_acct_block(inode, pages))
+ if (shmem_inode_acct_block(inode, pages))
return false;
/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
- inode->i_mapping->nrpages += pages;
-
- spin_lock_irqsave(&info->lock, flags);
- info->alloced += pages;
- inode->i_blocks += pages * BLOCKS_PER_PAGE;
- shmem_recalc_inode(inode);
- spin_unlock_irqrestore(&info->lock, flags);
+ xa_lock_irq(&mapping->i_pages);
+ mapping->nrpages += pages;
+ xa_unlock_irq(&mapping->i_pages);
+ shmem_recalc_inode(inode, pages, 0);
return true;
}
void shmem_uncharge(struct inode *inode, long pages)
{
- struct shmem_inode_info *info = SHMEM_I(inode);
- unsigned long flags;
-
+ /* pages argument is currently unused: keep it to help debugging */
/* nrpages adjustment done by __filemap_remove_folio() or caller */
- spin_lock_irqsave(&info->lock, flags);
- info->alloced -= pages;
- inode->i_blocks -= pages * BLOCKS_PER_PAGE;
- shmem_recalc_inode(inode);
- spin_unlock_irqrestore(&info->lock, flags);
-
- shmem_inode_unacct_blocks(inode, pages);
+ shmem_recalc_inode(inode, 0, 0);
}
/*
@@ -1040,16 +1106,13 @@ whole_folios:
folio_batch_release(&fbatch);
}
- spin_lock_irq(&info->lock);
- info->swapped -= nr_swaps_freed;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, -nr_swaps_freed);
}
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -1061,11 +1124,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
struct inode *inode = path->dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
- if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
- }
+ if (info->alloced - info->swapped != inode->i_mapping->nrpages)
+ shmem_recalc_inode(inode, 0, 0);
+
if (info->fsflags & FS_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
if (info->fsflags & FS_IMMUTABLE_FL)
@@ -1075,7 +1136,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
if (shmem_is_huge(inode, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
@@ -1142,13 +1203,28 @@ static int shmem_setattr(struct mnt_idmap *idmap,
}
}
+ if (is_quota_modification(idmap, inode, attr)) {
+ error = dquot_initialize(inode);
+ if (error)
+ return error;
+ }
+
+ /* Transfer quota accounting */
+ if (i_uid_needs_update(idmap, attr, inode) ||
+ i_gid_needs_update(idmap, attr, inode)) {
+ error = dquot_transfer(idmap, inode, attr);
+
+ if (error)
+ return error;
+ }
+
setattr_copy(idmap, inode, attr);
if (attr->ia_valid & ATTR_MODE)
error = posix_acl_chmod(idmap, dentry, inode->i_mode);
if (!error && update_ctime) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (update_mtime)
- inode->i_mtime = inode->i_ctime;
+ inode->i_mtime = inode_get_ctime(inode);
inode_inc_iversion(inode);
}
return error;
@@ -1158,6 +1234,7 @@ static void shmem_evict_inode(struct inode *inode)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ size_t freed = 0;
if (shmem_mapping(inode->i_mapping)) {
shmem_unacct_size(info->flags, inode->i_size);
@@ -1184,10 +1261,14 @@ static void shmem_evict_inode(struct inode *inode)
}
}
- simple_xattrs_free(&info->xattrs);
+ simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
+ shmem_free_inode(inode->i_sb, freed);
WARN_ON(inode->i_blocks);
- shmem_free_inode(inode->i_sb);
clear_inode(inode);
+#ifdef CONFIG_TMPFS_QUOTA
+ dquot_free_inode(inode);
+ dquot_drop(inode);
+#endif
}
static int shmem_find_swap_entries(struct address_space *mapping,
@@ -1431,11 +1512,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (add_to_swap_cache(folio, swap,
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
NULL) == 0) {
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- info->swapped++;
- spin_unlock_irq(&info->lock);
-
+ shmem_recalc_inode(inode, 0, 1);
swap_shmem_alloc(swap);
shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
@@ -1590,13 +1667,14 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
struct shmem_inode_info *info = SHMEM_I(inode);
struct folio *folio;
int nr;
- int err = -ENOSPC;
+ int err;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
huge = false;
nr = huge ? HPAGE_PMD_NR : 1;
- if (!shmem_inode_acct_block(inode, nr))
+ err = shmem_inode_acct_block(inode, nr);
+ if (err)
goto failed;
if (huge)
@@ -1705,7 +1783,6 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
struct folio *folio, swp_entry_t swap)
{
struct address_space *mapping = inode->i_mapping;
- struct shmem_inode_info *info = SHMEM_I(inode);
swp_entry_t swapin_error;
void *old;
@@ -1718,16 +1795,12 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
folio_wait_writeback(folio);
delete_from_swap_cache(folio);
- spin_lock_irq(&info->lock);
/*
- * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
- * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
- * shmem_evict_inode.
+ * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
+ * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
+ * in shmem_evict_inode().
*/
- info->alloced--;
- info->swapped--;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, -1, -1);
swap_free(swap);
}
@@ -1814,10 +1887,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (error)
goto failed;
- spin_lock_irq(&info->lock);
- info->swapped--;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, -1);
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
@@ -1982,13 +2052,9 @@ alloc_nohuge:
charge_mm);
if (error)
goto unacct;
- folio_add_lru(folio);
- spin_lock_irq(&info->lock);
- info->alloced += folio_nr_pages(folio);
- inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ folio_add_lru(folio);
+ shmem_recalc_inode(inode, folio_nr_pages(folio), 0);
alloced = true;
if (folio_test_pmd_mappable(folio) &&
@@ -2037,9 +2103,7 @@ clear:
if (alloced) {
folio_clear_dirty(folio);
filemap_remove_folio(folio);
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, 0);
}
error = -EINVAL;
goto unlock;
@@ -2065,9 +2129,7 @@ unlock:
folio_put(folio);
}
if (error == -ENOSPC && !once++) {
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, 0);
goto repeat;
}
if (error == -EEXIST)
@@ -2328,6 +2390,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+static int shmem_file_open(struct inode *inode, struct file *file)
+{
+ file->f_mode |= FMODE_CAN_ODIRECT;
+ return generic_file_open(inode, file);
+}
+
#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
@@ -2357,78 +2425,128 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
#define shmem_initxattrs NULL
#endif
-static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb,
- struct inode *dir, umode_t mode, dev_t dev,
- unsigned long flags)
+static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
+{
+ return &SHMEM_I(inode)->dir_offsets;
+}
+
+static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
+ struct super_block *sb,
+ struct inode *dir, umode_t mode,
+ dev_t dev, unsigned long flags)
{
struct inode *inode;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
ino_t ino;
+ int err;
+
+ err = shmem_reserve_inode(sb, &ino);
+ if (err)
+ return ERR_PTR(err);
- if (shmem_reserve_inode(sb, &ino))
- return NULL;
inode = new_inode(sb);
- if (inode) {
- inode->i_ino = ino;
- inode_init_owner(idmap, inode, dir, mode);
- inode->i_blocks = 0;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_generation = get_random_u32();
- info = SHMEM_I(inode);
- memset(info, 0, (char *)inode - (char *)info);
- spin_lock_init(&info->lock);
- atomic_set(&info->stop_eviction, 0);
- info->seals = F_SEAL_SEAL;
- info->flags = flags & VM_NORESERVE;
- info->i_crtime = inode->i_mtime;
- info->fsflags = (dir == NULL) ? 0 :
- SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
- if (info->fsflags)
- shmem_set_inode_flags(inode, info->fsflags);
- INIT_LIST_HEAD(&info->shrinklist);
- INIT_LIST_HEAD(&info->swaplist);
- if (sbinfo->noswap)
- mapping_set_unevictable(inode->i_mapping);
- simple_xattrs_init(&info->xattrs);
- cache_no_acl(inode);
- mapping_set_large_folios(inode->i_mapping);
-
- switch (mode & S_IFMT) {
- default:
- inode->i_op = &shmem_special_inode_operations;
- init_special_inode(inode, mode, dev);
- break;
- case S_IFREG:
- inode->i_mapping->a_ops = &shmem_aops;
- inode->i_op = &shmem_inode_operations;
- inode->i_fop = &shmem_file_operations;
- mpol_shared_policy_init(&info->policy,
- shmem_get_sbmpol(sbinfo));
- break;
- case S_IFDIR:
- inc_nlink(inode);
- /* Some things misbehave if size == 0 on a directory */
- inode->i_size = 2 * BOGO_DIRENT_SIZE;
- inode->i_op = &shmem_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
- break;
- case S_IFLNK:
- /*
- * Must not load anything in the rbtree,
- * mpol_free_shared_policy will not be called.
- */
- mpol_shared_policy_init(&info->policy, NULL);
- break;
- }
+ if (!inode) {
+ shmem_free_inode(sb, 0);
+ return ERR_PTR(-ENOSPC);
+ }
- lockdep_annotate_inode_mutex_key(inode);
- } else
- shmem_free_inode(sb);
+ inode->i_ino = ino;
+ inode_init_owner(idmap, inode, dir, mode);
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ inode->i_generation = get_random_u32();
+ info = SHMEM_I(inode);
+ memset(info, 0, (char *)inode - (char *)info);
+ spin_lock_init(&info->lock);
+ atomic_set(&info->stop_eviction, 0);
+ info->seals = F_SEAL_SEAL;
+ info->flags = flags & VM_NORESERVE;
+ info->i_crtime = inode->i_mtime;
+ info->fsflags = (dir == NULL) ? 0 :
+ SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
+ if (info->fsflags)
+ shmem_set_inode_flags(inode, info->fsflags);
+ INIT_LIST_HEAD(&info->shrinklist);
+ INIT_LIST_HEAD(&info->swaplist);
+ INIT_LIST_HEAD(&info->swaplist);
+ if (sbinfo->noswap)
+ mapping_set_unevictable(inode->i_mapping);
+ simple_xattrs_init(&info->xattrs);
+ cache_no_acl(inode);
+ mapping_set_large_folios(inode->i_mapping);
+
+ switch (mode & S_IFMT) {
+ default:
+ inode->i_op = &shmem_special_inode_operations;
+ init_special_inode(inode, mode, dev);
+ break;
+ case S_IFREG:
+ inode->i_mapping->a_ops = &shmem_aops;
+ inode->i_op = &shmem_inode_operations;
+ inode->i_fop = &shmem_file_operations;
+ mpol_shared_policy_init(&info->policy,
+ shmem_get_sbmpol(sbinfo));
+ break;
+ case S_IFDIR:
+ inc_nlink(inode);
+ /* Some things misbehave if size == 0 on a directory */
+ inode->i_size = 2 * BOGO_DIRENT_SIZE;
+ inode->i_op = &shmem_dir_inode_operations;
+ inode->i_fop = &simple_offset_dir_operations;
+ simple_offset_init(shmem_get_offset_ctx(inode));
+ break;
+ case S_IFLNK:
+ /*
+ * Must not load anything in the rbtree,
+ * mpol_free_shared_policy will not be called.
+ */
+ mpol_shared_policy_init(&info->policy, NULL);
+ break;
+ }
+
+ lockdep_annotate_inode_mutex_key(inode);
return inode;
}
+#ifdef CONFIG_TMPFS_QUOTA
+static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
+ struct super_block *sb, struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
+{
+ int err;
+ struct inode *inode;
+
+ inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
+ if (IS_ERR(inode))
+ return inode;
+
+ err = dquot_initialize(inode);
+ if (err)
+ goto errout;
+
+ err = dquot_alloc_inode(inode);
+ if (err) {
+ dquot_drop(inode);
+ goto errout;
+ }
+ return inode;
+
+errout:
+ inode->i_flags |= S_NOQUOTA;
+ iput(inode);
+ return ERR_PTR(err);
+}
+#else
+static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
+ struct super_block *sb, struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
+{
+ return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
+}
+#endif /* CONFIG_TMPFS_QUOTA */
+
#ifdef CONFIG_USERFAULTFD
int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
@@ -2447,7 +2565,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
int ret;
pgoff_t max_off;
- if (!shmem_inode_acct_block(inode, 1)) {
+ if (shmem_inode_acct_block(inode, 1)) {
/*
* We may have got a page, returned -ENOENT triggering a retry,
* and now we find ourselves with -ENOMEM. Release the page, to
@@ -2529,12 +2647,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
if (ret)
goto out_delete_from_cache;
- spin_lock_irq(&info->lock);
- info->alloced++;
- inode->i_blocks += BLOCKS_PER_PAGE;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
-
+ shmem_recalc_inode(inode, 1, 0);
folio_unlock(folio);
return 0;
out_delete_from_cache:
@@ -2733,6 +2846,28 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval ? retval : error;
}
+static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto unlock;
+ ret = file_remove_privs(file);
+ if (ret)
+ goto unlock;
+ ret = file_update_time(file);
+ if (ret)
+ goto unlock;
+ ret = generic_perform_write(iocb, from);
+unlock:
+ inode_unlock(inode);
+ return ret;
+}
+
static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -3057,7 +3192,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
}
if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
- buf->f_ffree = sbinfo->free_inodes;
+ buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
}
/* else leave those fields 0 like simple_statfs */
@@ -3074,27 +3209,32 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
- int error = -ENOSPC;
+ int error;
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
- if (inode) {
- error = simple_acl_create(dir, inode);
- if (error)
- goto out_iput;
- error = security_inode_init_security(inode, dir,
- &dentry->d_name,
- shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP)
- goto out_iput;
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- error = 0;
- dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_ctime = dir->i_mtime = current_time(dir);
- inode_inc_iversion(dir);
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
- }
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
+ error = security_inode_init_security(inode, dir,
+ &dentry->d_name,
+ shmem_initxattrs, NULL);
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+
+ error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
+ if (error)
+ goto out_iput;
+
+ dir->i_size += BOGO_DIRENT_SIZE;
+ dir->i_mtime = inode_set_ctime_current(dir);
+ inode_inc_iversion(dir);
+ d_instantiate(dentry, inode);
+ dget(dentry); /* Extra count - pin the dentry in core */
return error;
+
out_iput:
iput(inode);
return error;
@@ -3105,20 +3245,26 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct file *file, umode_t mode)
{
struct inode *inode;
- int error = -ENOSPC;
+ int error;
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
- if (inode) {
- error = security_inode_init_security(inode, dir,
- NULL,
- shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP)
- goto out_iput;
- error = simple_acl_create(dir, inode);
- if (error)
- goto out_iput;
- d_tmpfile(file, inode);
+
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
+ goto err_out;
}
+
+ error = security_inode_init_security(inode, dir,
+ NULL,
+ shmem_initxattrs, NULL);
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
+ d_tmpfile(file, inode);
+
+err_out:
return finish_open_simple(file, error);
out_iput:
iput(inode);
@@ -3164,8 +3310,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
goto out;
}
+ ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
+ if (ret) {
+ if (inode->i_nlink)
+ shmem_free_inode(inode->i_sb, 0);
+ goto out;
+ }
+
dir->i_size += BOGO_DIRENT_SIZE;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
inode_inc_iversion(dir);
inc_nlink(inode);
ihold(inode); /* New dentry reference */
@@ -3180,10 +3334,13 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = d_inode(dentry);
if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
- shmem_free_inode(inode->i_sb);
+ shmem_free_inode(inode->i_sb, 0);
+
+ simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
dir->i_size -= BOGO_DIRENT_SIZE;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
inode_inc_iversion(dir);
drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - this does all the work */
@@ -3240,24 +3397,29 @@ static int shmem_rename2(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = S_ISDIR(inode->i_mode);
+ int error;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE)
- return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
+ return simple_offset_rename_exchange(old_dir, old_dentry,
+ new_dir, new_dentry);
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
if (flags & RENAME_WHITEOUT) {
- int error;
-
error = shmem_whiteout(idmap, old_dir, old_dentry);
if (error)
return error;
}
+ simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry);
+ error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry);
+ if (error)
+ return error;
+
if (d_really_is_positive(new_dentry)) {
(void) shmem_unlink(new_dir, new_dentry);
if (they_are_dirs) {
@@ -3271,9 +3433,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
old_dir->i_size -= BOGO_DIRENT_SIZE;
new_dir->i_size += BOGO_DIRENT_SIZE;
- old_dir->i_ctime = old_dir->i_mtime =
- new_dir->i_ctime = new_dir->i_mtime =
- inode->i_ctime = current_time(old_dir);
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
return 0;
@@ -3293,31 +3453,32 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
VM_NORESERVE);
- if (!inode)
- return -ENOSPC;
+
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
error = security_inode_init_security(inode, dir, &dentry->d_name,
shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+
+ error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
+ if (error)
+ goto out_iput;
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
inode->i_link = kmemdup(symname, len, GFP_KERNEL);
if (!inode->i_link) {
- iput(inode);
- return -ENOMEM;
+ error = -ENOMEM;
+ goto out_remove_offset;
}
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
- if (error) {
- iput(inode);
- return error;
- }
+ if (error)
+ goto out_remove_offset;
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
memcpy(folio_address(folio), symname, len);
@@ -3327,11 +3488,17 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
folio_put(folio);
}
dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry);
return 0;
+
+out_remove_offset:
+ simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
+out_iput:
+ iput(inode);
+ return error;
}
static void shmem_put_link(void *arg)
@@ -3399,7 +3566,7 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap,
(fa->flags & SHMEM_FL_USER_MODIFIABLE);
shmem_set_inode_flags(inode, info->fsflags);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
return 0;
}
@@ -3419,21 +3586,40 @@ static int shmem_initxattrs(struct inode *inode,
void *fs_info)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
const struct xattr *xattr;
struct simple_xattr *new_xattr;
+ size_t ispace = 0;
size_t len;
+ if (sbinfo->max_inodes) {
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ ispace += simple_xattr_space(xattr->name,
+ xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
+ }
+ if (ispace) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_ispace < ispace)
+ ispace = 0;
+ else
+ sbinfo->free_ispace -= ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ if (!ispace)
+ return -ENOSPC;
+ }
+ }
+
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
if (!new_xattr)
- return -ENOMEM;
+ break;
len = strlen(xattr->name) + 1;
new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!new_xattr->name) {
kvfree(new_xattr);
- return -ENOMEM;
+ break;
}
memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
@@ -3444,6 +3630,16 @@ static int shmem_initxattrs(struct inode *inode,
simple_xattr_add(&info->xattrs, new_xattr);
}
+ if (xattr->name != NULL) {
+ if (ispace) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_ispace += ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ }
+ simple_xattrs_free(&info->xattrs, NULL);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -3464,15 +3660,40 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
size_t size, int flags)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- int err;
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct simple_xattr *old_xattr;
+ size_t ispace = 0;
name = xattr_full_name(handler, name);
- err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
- if (!err) {
- inode->i_ctime = current_time(inode);
+ if (value && sbinfo->max_inodes) {
+ ispace = simple_xattr_space(name, size);
+ raw_spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_ispace < ispace)
+ ispace = 0;
+ else
+ sbinfo->free_ispace -= ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ if (!ispace)
+ return -ENOSPC;
+ }
+
+ old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags);
+ if (!IS_ERR(old_xattr)) {
+ ispace = 0;
+ if (old_xattr && sbinfo->max_inodes)
+ ispace = simple_xattr_space(old_xattr->name,
+ old_xattr->size);
+ simple_xattr_free(old_xattr);
+ old_xattr = NULL;
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
}
- return err;
+ if (ispace) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_ispace += ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ }
+ return PTR_ERR(old_xattr);
}
static const struct xattr_handler shmem_security_xattr_handler = {
@@ -3487,9 +3708,16 @@ static const struct xattr_handler shmem_trusted_xattr_handler = {
.set = shmem_xattr_handler_set,
};
+static const struct xattr_handler shmem_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = shmem_xattr_handler_get,
+ .set = shmem_xattr_handler_set,
+};
+
static const struct xattr_handler *shmem_xattr_handlers[] = {
&shmem_security_xattr_handler,
&shmem_trusted_xattr_handler,
+ &shmem_user_xattr_handler,
NULL
};
@@ -3502,6 +3730,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
static const struct inode_operations shmem_short_symlink_operations = {
.getattr = shmem_getattr,
+ .setattr = shmem_setattr,
.get_link = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3510,6 +3739,7 @@ static const struct inode_operations shmem_short_symlink_operations = {
static const struct inode_operations shmem_symlink_inode_operations = {
.getattr = shmem_getattr,
+ .setattr = shmem_setattr,
.get_link = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3609,6 +3839,13 @@ enum shmem_param {
Opt_inode32,
Opt_inode64,
Opt_noswap,
+ Opt_quota,
+ Opt_usrquota,
+ Opt_grpquota,
+ Opt_usrquota_block_hardlimit,
+ Opt_usrquota_inode_hardlimit,
+ Opt_grpquota_block_hardlimit,
+ Opt_grpquota_inode_hardlimit,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -3631,6 +3868,15 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_flag ("inode32", Opt_inode32),
fsparam_flag ("inode64", Opt_inode64),
fsparam_flag ("noswap", Opt_noswap),
+#ifdef CONFIG_TMPFS_QUOTA
+ fsparam_flag ("quota", Opt_quota),
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
+ fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
+ fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
+ fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
+#endif
{}
};
@@ -3641,6 +3887,8 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
unsigned long long size;
char *rest;
int opt;
+ kuid_t kuid;
+ kgid_t kgid;
opt = fs_parse(fc, shmem_fs_parameters, param, &result);
if (opt < 0)
@@ -3662,13 +3910,13 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_nr_blocks:
ctx->blocks = memparse(param->string, &rest);
- if (*rest || ctx->blocks > S64_MAX)
+ if (*rest || ctx->blocks > LONG_MAX)
goto bad_value;
ctx->seen |= SHMEM_SEEN_BLOCKS;
break;
case Opt_nr_inodes:
ctx->inodes = memparse(param->string, &rest);
- if (*rest)
+ if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
goto bad_value;
ctx->seen |= SHMEM_SEEN_INODES;
break;
@@ -3676,14 +3924,32 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
ctx->mode = result.uint_32 & 07777;
break;
case Opt_uid:
- ctx->uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(ctx->uid))
+ kuid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(kuid))
+ goto bad_value;
+
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fc->user_ns, kuid))
goto bad_value;
+
+ ctx->uid = kuid;
break;
case Opt_gid:
- ctx->gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(ctx->gid))
+ kgid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(kgid))
+ goto bad_value;
+
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fc->user_ns, kgid))
goto bad_value;
+
+ ctx->gid = kgid;
break;
case Opt_huge:
ctx->huge = result.uint_32;
@@ -3722,6 +3988,60 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
ctx->noswap = true;
ctx->seen |= SHMEM_SEEN_NOSWAP;
break;
+ case Opt_quota:
+ if (fc->user_ns != &init_user_ns)
+ return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
+ ctx->seen |= SHMEM_SEEN_QUOTA;
+ ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
+ break;
+ case Opt_usrquota:
+ if (fc->user_ns != &init_user_ns)
+ return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
+ ctx->seen |= SHMEM_SEEN_QUOTA;
+ ctx->quota_types |= QTYPE_MASK_USR;
+ break;
+ case Opt_grpquota:
+ if (fc->user_ns != &init_user_ns)
+ return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
+ ctx->seen |= SHMEM_SEEN_QUOTA;
+ ctx->quota_types |= QTYPE_MASK_GRP;
+ break;
+ case Opt_usrquota_block_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
+ return invalfc(fc,
+ "User quota block hardlimit too large.");
+ ctx->qlimits.usrquota_bhardlimit = size;
+ break;
+ case Opt_grpquota_block_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
+ return invalfc(fc,
+ "Group quota block hardlimit too large.");
+ ctx->qlimits.grpquota_bhardlimit = size;
+ break;
+ case Opt_usrquota_inode_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
+ return invalfc(fc,
+ "User quota inode hardlimit too large.");
+ ctx->qlimits.usrquota_ihardlimit = size;
+ break;
+ case Opt_grpquota_inode_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
+ return invalfc(fc,
+ "Group quota inode hardlimit too large.");
+ ctx->qlimits.grpquota_ihardlimit = size;
+ break;
}
return 0;
@@ -3777,21 +4097,17 @@ static int shmem_parse_options(struct fs_context *fc, void *data)
/*
* Reconfigure a shmem filesystem.
- *
- * Note that we disallow change from limited->unlimited blocks/inodes while any
- * are in use; but we must separately disallow unlimited->limited, because in
- * that case we have no record of how much is already in use.
*/
static int shmem_reconfigure(struct fs_context *fc)
{
struct shmem_options *ctx = fc->fs_private;
struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
- unsigned long inodes;
+ unsigned long used_isp;
struct mempolicy *mpol = NULL;
const char *err;
raw_spin_lock(&sbinfo->stat_lock);
- inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+ used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
if (!sbinfo->max_blocks) {
@@ -3809,7 +4125,7 @@ static int shmem_reconfigure(struct fs_context *fc)
err = "Cannot retroactively limit inodes";
goto out;
}
- if (ctx->inodes < inodes) {
+ if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
err = "Too few inodes for current use";
goto out;
}
@@ -3829,6 +4145,24 @@ static int shmem_reconfigure(struct fs_context *fc)
goto out;
}
+ if (ctx->seen & SHMEM_SEEN_QUOTA &&
+ !sb_any_quota_loaded(fc->root->d_sb)) {
+ err = "Cannot enable quota on remount";
+ goto out;
+ }
+
+#ifdef CONFIG_TMPFS_QUOTA
+#define CHANGED_LIMIT(name) \
+ (ctx->qlimits.name## hardlimit && \
+ (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))
+
+ if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
+ CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
+ err = "Cannot change global quota limit on remount";
+ goto out;
+ }
+#endif /* CONFIG_TMPFS_QUOTA */
+
if (ctx->seen & SHMEM_SEEN_HUGE)
sbinfo->huge = ctx->huge;
if (ctx->seen & SHMEM_SEEN_INUMS)
@@ -3837,7 +4171,7 @@ static int shmem_reconfigure(struct fs_context *fc)
sbinfo->max_blocks = ctx->blocks;
if (ctx->seen & SHMEM_SEEN_INODES) {
sbinfo->max_inodes = ctx->inodes;
- sbinfo->free_inodes = ctx->inodes - inodes;
+ sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
}
/*
@@ -3919,6 +4253,9 @@ static void shmem_put_super(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+#ifdef CONFIG_TMPFS_QUOTA
+ shmem_disable_quotas(sb);
+#endif
free_percpu(sbinfo->ino_batch);
percpu_counter_destroy(&sbinfo->used_blocks);
mpol_put(sbinfo->mpol);
@@ -3931,12 +4268,13 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
struct shmem_options *ctx = fc->fs_private;
struct inode *inode;
struct shmem_sb_info *sbinfo;
+ int error = -ENOMEM;
/* Round up to L1_CACHE_BYTES to resist false sharing */
sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
L1_CACHE_BYTES), GFP_KERNEL);
if (!sbinfo)
- return -ENOMEM;
+ return error;
sb->s_fs_info = sbinfo;
@@ -3963,7 +4301,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOUSER;
#endif
sbinfo->max_blocks = ctx->blocks;
- sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+ sbinfo->max_inodes = ctx->inodes;
+ sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
if (sb->s_flags & SB_KERNMOUNT) {
sbinfo->ino_batch = alloc_percpu(ino_t);
if (!sbinfo->ino_batch)
@@ -3997,10 +4336,27 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#endif
uuid_gen(&sb->s_uuid);
+#ifdef CONFIG_TMPFS_QUOTA
+ if (ctx->seen & SHMEM_SEEN_QUOTA) {
+ sb->dq_op = &shmem_quota_operations;
+ sb->s_qcop = &dquot_quotactl_sysfile_ops;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+
+ /* Copy the default limits from ctx into sbinfo */
+ memcpy(&sbinfo->qlimits, &ctx->qlimits,
+ sizeof(struct shmem_quota_limits));
+
+ if (shmem_enable_quotas(sb, ctx->quota_types))
+ goto failed;
+ }
+#endif /* CONFIG_TMPFS_QUOTA */
+
inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
VM_NORESERVE);
- if (!inode)
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
goto failed;
+ }
inode->i_uid = sbinfo->uid;
inode->i_gid = sbinfo->gid;
sb->s_root = d_make_root(inode);
@@ -4010,7 +4366,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
failed:
shmem_put_super(sb);
- return -ENOMEM;
+ return error;
}
static int shmem_get_tree(struct fs_context *fc)
@@ -4060,6 +4416,8 @@ static void shmem_destroy_inode(struct inode *inode)
{
if (S_ISREG(inode->i_mode))
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
+ if (S_ISDIR(inode->i_mode))
+ simple_offset_destroy(shmem_get_offset_ctx(inode));
}
static void shmem_init_inode(void *foo)
@@ -4103,12 +4461,12 @@ EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
- .open = generic_file_open,
+ .open = shmem_file_open,
.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
.read_iter = shmem_file_read_iter,
- .write_iter = generic_file_write_iter,
+ .write_iter = shmem_file_write_iter,
.fsync = noop_fsync,
.splice_read = shmem_file_splice_read,
.splice_write = iter_file_splice_write,
@@ -4140,6 +4498,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
.mknod = shmem_mknod,
.rename = shmem_rename2,
.tmpfile = shmem_tmpfile,
+ .get_offset_ctx = shmem_get_offset_ctx,
#endif
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -4171,6 +4530,9 @@ static const struct super_operations shmem_ops = {
.statfs = shmem_statfs,
.show_options = shmem_show_options,
#endif
+#ifdef CONFIG_TMPFS_QUOTA
+ .get_dquots = shmem_get_dquots,
+#endif
.evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
@@ -4224,7 +4586,7 @@ static struct file_system_type shmem_fs_type = {
#endif
.kill_sb = kill_litter_super,
#ifdef CONFIG_SHMEM
- .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
+ .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
#else
.fs_flags = FS_USERNS_MOUNT,
#endif
@@ -4236,6 +4598,14 @@ void __init shmem_init(void)
shmem_init_inodecache();
+#ifdef CONFIG_TMPFS_QUOTA
+ error = register_quota_format(&shmem_quota_format);
+ if (error < 0) {
+ pr_err("Could not register quota format\n");
+ goto out3;
+ }
+#endif
+
error = register_filesystem(&shmem_fs_type);
if (error) {
pr_err("Could not register tmpfs\n");
@@ -4260,6 +4630,10 @@ void __init shmem_init(void)
out1:
unregister_filesystem(&shmem_fs_type);
out2:
+#ifdef CONFIG_TMPFS_QUOTA
+ unregister_quota_format(&shmem_quota_format);
+out3:
+#endif
shmem_destroy_inodecache();
shm_mnt = ERR_PTR(error);
}
@@ -4379,10 +4753,16 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
#define shmem_vm_ops generic_file_vm_ops
#define shmem_anon_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
-#define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
+/*
+ * ramfs-backed stand-in for shmem_get_inode(): the inode comes from
+ * ramfs_get_inode(); @idmap and @flags are not used here. A NULL result is
+ * turned into ERR_PTR(-ENOSPC), so this never returns NULL and callers
+ * check the result with IS_ERR().
+ */
+static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
+{
+ struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
+ return inode ? inode : ERR_PTR(-ENOSPC);
+}
+
#endif /* CONFIG_SHMEM */
/* common code */
@@ -4407,9 +4787,10 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
- if (unlikely(!inode)) {
+
+ if (IS_ERR(inode)) {
shmem_unacct_size(flags, size);
- return ERR_PTR(-ENOSPC);
+ return ERR_CAST(inode);
}
inode->i_flags |= i_flags;
inode->i_size = size;
diff --git a/mm/shmem_quota.c b/mm/shmem_quota.c
new file mode 100644
index 000000000000..062d1c1097ae
--- /dev/null
+++ b/mm/shmem_quota.c
@@ -0,0 +1,350 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * In memory quota format relies on quota infrastructure to store dquot
+ * information for us. While conventional quota formats for file systems
+ * with persistent storage can load quota information into dquot from the
+ * storage on-demand and hence quota dquot shrinker can free any dquot
+ * that is not currently being used, it must be avoided here. Otherwise we
+ * can lose valuable information, user provided limits, because there is
+ * no persistent storage to load the information from afterwards.
+ *
+ * One piece of information that the in-memory quota format needs to keep
+ * track of is a sorted list of ids for each quota type. This is done by
+ * utilizing an rb tree whose root is stored in mem_dqinfo->dqi_priv for each
+ * quota type.
+ *
+ * This format can be used to support quota on file system without persistent
+ * storage such as tmpfs.
+ *
+ * Author: Lukas Czerner <lczerner@redhat.com>
+ * Carlos Maiolino <cmaiolino@redhat.com>
+ *
+ * Copyright (C) 2023 Red Hat, Inc.
+ */
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
+
+#include <linux/quotaops.h>
+#include <linux/quota.h>
+
+#ifdef CONFIG_TMPFS_QUOTA
+
+/*
+ * The following constants define the amount of time given a user
+ * before the soft limits are treated as hard limits (usually resulting
+ * in an allocation failure). The timer is started when the user crosses
+ * their soft limit, it is reset when they go below their soft limit.
+ */
+#define SHMEM_MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
+#define SHMEM_MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
+
+/*
+ * One node of the per-quota-type rb tree, ordered by @id. It holds the
+ * limits for that id: shmem_acquire_dquot() copies them into a dquot and
+ * shmem_release_dquot() copies them back (or drops the node).
+ */
+struct quota_id {
+ struct rb_node node; /* linkage in the tree rooted at dqi_priv */
+ qid_t id; /* search key */
+ qsize_t bhardlimit;
+ qsize_t bsoftlimit;
+ qsize_t ihardlimit;
+ qsize_t isoftlimit;
+};
+
+/*
+ * quota_format_ops.check_quota_file hook. @sb and @type are ignored and the
+ * function returns 1 unconditionally (presumably "file is valid" to the
+ * quota core - confirm against the quota_format_ops contract).
+ */
+static int shmem_check_quota_file(struct super_block *sb, int type)
+{
+ /* There is no real quota file, nothing to do */
+ return 1;
+}
+
+/*
+ * Nothing is read from disk: build the in-memory state for one quota type.
+ * An empty rb_root for the quota ids is allocated and parked in dqi_priv,
+ * then the maximum limits, grace periods and flags are filled in.
+ *
+ * Returns 0 on success or -ENOMEM if the rb_root cannot be allocated.
+ */
+static int shmem_read_file_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
+	struct rb_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+
+	/* Publish the result first so dqi_priv is NULL on failure */
+	info->dqi_priv = root;
+	if (!root)
+		return -ENOMEM;
+
+	info->dqi_flags = 0;
+	info->dqi_igrace = SHMEM_MAX_IQ_TIME;
+	info->dqi_bgrace = SHMEM_MAX_DQ_TIME;
+	info->dqi_max_ino_limit = SHMEM_QUOTA_MAX_INO_LIMIT;
+	info->dqi_max_spc_limit = SHMEM_QUOTA_MAX_SPC_LIMIT;
+
+	return 0;
+}
+
+/*
+ * quota_format_ops.write_file_info hook. @sb and @type are ignored; the
+ * function always returns 0.
+ */
+static int shmem_write_file_info(struct super_block *sb, int type)
+{
+ /* There is no real quota file, nothing to do */
+ return 0;
+}
+
+/*
+ * Undo shmem_read_file_info() for one quota type: release every quota_id
+ * still linked in the rb tree and then the rb_root itself. Always returns 0.
+ */
+static int shmem_free_file_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
+	struct rb_root *root = info->dqi_priv;
+	struct rb_node *cur, *next;
+
+	/* Detach the tree from the info before taking it apart */
+	info->dqi_priv = NULL;
+
+	for (cur = rb_first(root); cur; cur = next) {
+		struct quota_id *entry = rb_entry(cur, struct quota_id, node);
+
+		/* Pick the successor before this node is unlinked and freed */
+		next = rb_next(cur);
+
+		rb_erase(cur, root);
+		kfree(entry);
+	}
+
+	kfree(root);
+	return 0;
+}
+
+/*
+ * Find the smallest id present in the rb tree for qid->type that is greater
+ * than or equal to *qid, and store it back into *qid.
+ *
+ * Returns 0 on success, -ESRCH if quota of this type is not active, and
+ * -ENOENT if the tree holds no id at or above the requested one.
+ */
+static int shmem_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, qid->type);
+	qid_t id = from_kqid(&init_user_ns, *qid);
+	struct quota_info *dqopt = sb_dqopt(sb);
+	struct quota_id *entry = NULL;
+	struct rb_node *node;
+	int ret = 0;
+
+	if (!sb_has_quota_active(sb, qid->type))
+		return -ESRCH;
+
+	down_read(&dqopt->dqio_sem);
+	/*
+	 * Sample the root only with dqio_sem held and after the active check:
+	 * writers relink the tree under that semaphore, and dqi_priv is not
+	 * touched at all on the -ESRCH path.
+	 */
+	node = ((struct rb_root *)info->dqi_priv)->rb_node;
+	while (node) {
+		entry = rb_entry(node, struct quota_id, node);
+
+		if (id < entry->id)
+			node = node->rb_left;
+		else if (id > entry->id)
+			node = node->rb_right;
+		else
+			goto got_next_id;
+	}
+
+	/* Empty tree, there is no next id */
+	if (!entry) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	/*
+	 * entry is the last node visited by the search. If its id is below the
+	 * requested one the answer is its in-order successor, otherwise entry
+	 * itself is the next id.
+	 */
+	if (id > entry->id) {
+		node = rb_next(&entry->node);
+		if (!node) {
+			ret = -ENOENT;
+			goto out_unlock;
+		}
+		entry = rb_entry(node, struct quota_id, node);
+	}
+
+got_next_id:
+	*qid = make_kqid(&init_user_ns, qid->type, entry->id);
+out_unlock:
+	up_read(&dqopt->dqio_sem);
+	return ret;
+}
+
+/*
+ * Load dquot with limits from existing entry, or create the new entry if
+ * it does not exist.
+ *
+ * A newly created entry starts with the hard limits stored in
+ * sbinfo->qlimits for its quota type; its soft limits stay zero. On success
+ * DQ_ACTIVE_B is set on the dquot, and DQ_FAKE_B as well when all four
+ * limits are zero.
+ *
+ * Returns 0 on success or -ENOMEM if a new entry cannot be allocated.
+ */
+static int shmem_acquire_dquot(struct dquot *dquot)
+{
+	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
+	struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info;
+	struct rb_node *parent = NULL, *new_node = NULL;
+	struct quota_id *new_entry, *entry;
+	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
+	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+	struct rb_node **n;
+	int ret = 0;
+
+	mutex_lock(&dquot->dq_lock);
+
+	down_write(&dqopt->dqio_sem);
+	/* Touch the tree root only with dqio_sem held, like every other walker */
+	n = &((struct rb_root *)info->dqi_priv)->rb_node;
+	while (*n) {
+		parent = *n;
+		entry = rb_entry(parent, struct quota_id, node);
+
+		if (id < entry->id)
+			n = &(*n)->rb_left;
+		else if (id > entry->id)
+			n = &(*n)->rb_right;
+		else
+			goto found;
+	}
+
+	/* We don't have entry for this id yet, create it */
+	new_entry = kzalloc(sizeof(struct quota_id), GFP_NOFS);
+	if (!new_entry) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	new_entry->id = id;
+	if (dquot->dq_id.type == USRQUOTA) {
+		new_entry->bhardlimit = sbinfo->qlimits.usrquota_bhardlimit;
+		new_entry->ihardlimit = sbinfo->qlimits.usrquota_ihardlimit;
+	} else if (dquot->dq_id.type == GRPQUOTA) {
+		new_entry->bhardlimit = sbinfo->qlimits.grpquota_bhardlimit;
+		new_entry->ihardlimit = sbinfo->qlimits.grpquota_ihardlimit;
+	}
+
+	/* parent/n still describe the empty slot where the search ended */
+	new_node = &new_entry->node;
+	rb_link_node(new_node, parent, n);
+	rb_insert_color(new_node, (struct rb_root *)info->dqi_priv);
+	entry = new_entry;
+
+found:
+	/* Load the stored limits from the tree */
+	spin_lock(&dquot->dq_dqb_lock);
+	dquot->dq_dqb.dqb_bhardlimit = entry->bhardlimit;
+	dquot->dq_dqb.dqb_bsoftlimit = entry->bsoftlimit;
+	dquot->dq_dqb.dqb_ihardlimit = entry->ihardlimit;
+	dquot->dq_dqb.dqb_isoftlimit = entry->isoftlimit;
+
+	if (!dquot->dq_dqb.dqb_bhardlimit &&
+	    !dquot->dq_dqb.dqb_bsoftlimit &&
+	    !dquot->dq_dqb.dqb_ihardlimit &&
+	    !dquot->dq_dqb.dqb_isoftlimit)
+		set_bit(DQ_FAKE_B, &dquot->dq_flags);
+	spin_unlock(&dquot->dq_dqb_lock);
+
+	/* Make sure flags update is visible after dquot has been filled */
+	smp_mb__before_atomic();
+	set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+out_unlock:
+	up_write(&dqopt->dqio_sem);
+	mutex_unlock(&dquot->dq_lock);
+	return ret;
+}
+
+/*
+ * Tell whether a dquot holds nothing worth keeping in the rb tree: either it
+ * is flagged DQ_FAKE_B, or it has no space and no inodes charged and its
+ * hard limits still equal the values in sbinfo->qlimits for its quota type.
+ */
+static bool shmem_is_empty_dquot(struct dquot *dquot)
+{
+	struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info;
+	/*
+	 * Start from zero so that a quota type other than USRQUOTA/GRPQUOTA
+	 * compares against a defined value instead of stack garbage.
+	 */
+	qsize_t bhardlimit = 0;
+	qsize_t ihardlimit = 0;
+
+	if (dquot->dq_id.type == USRQUOTA) {
+		bhardlimit = sbinfo->qlimits.usrquota_bhardlimit;
+		ihardlimit = sbinfo->qlimits.usrquota_ihardlimit;
+	} else if (dquot->dq_id.type == GRPQUOTA) {
+		bhardlimit = sbinfo->qlimits.grpquota_bhardlimit;
+		ihardlimit = sbinfo->qlimits.grpquota_ihardlimit;
+	}
+
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
+	    (dquot->dq_dqb.dqb_curspace == 0 &&
+	     dquot->dq_dqb.dqb_curinodes == 0 &&
+	     dquot->dq_dqb.dqb_bhardlimit == bhardlimit &&
+	     dquot->dq_dqb.dqb_ihardlimit == ihardlimit))
+		return true;
+
+	return false;
+}
+/*
+ * Store limits from dquot in the tree unless the dquot is empty (see
+ * shmem_is_empty_dquot()). If it is empty, remove the id from the tree
+ * since there is no useful information in there.
+ *
+ * Returns 0, including when the dquot turned out to be busy and nothing was
+ * done, or -ENOENT if the id unexpectedly has no entry in the tree.
+ */
+static int shmem_release_dquot(struct dquot *dquot)
+{
+	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
+	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
+	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+	struct quota_id *entry = NULL;
+	struct rb_node *node;
+
+	mutex_lock(&dquot->dq_lock);
+	/* Check whether we are not racing with some other dqget() */
+	if (dquot_is_busy(dquot))
+		goto out_dqlock;
+
+	down_write(&dqopt->dqio_sem);
+	/*
+	 * Fetch the root node only with dqio_sem held. Inserts and erases
+	 * rebalance the tree under that semaphore, so a root sampled earlier
+	 * may no longer lead to our entry and would trip the warning below.
+	 */
+	node = ((struct rb_root *)info->dqi_priv)->rb_node;
+	while (node) {
+		entry = rb_entry(node, struct quota_id, node);
+
+		if (id < entry->id)
+			node = node->rb_left;
+		else if (id > entry->id)
+			node = node->rb_right;
+		else
+			goto found;
+	}
+
+	/* We should always find the entry in the rb tree */
+	WARN_ONCE(1, "quota id %u from dquot %p, not in rb tree!\n", id, dquot);
+	up_write(&dqopt->dqio_sem);
+	mutex_unlock(&dquot->dq_lock);
+	return -ENOENT;
+
+found:
+	if (shmem_is_empty_dquot(dquot)) {
+		/* Remove entry from the tree */
+		rb_erase(&entry->node, info->dqi_priv);
+		kfree(entry);
+	} else {
+		/* Store the limits in the tree */
+		spin_lock(&dquot->dq_dqb_lock);
+		entry->bhardlimit = dquot->dq_dqb.dqb_bhardlimit;
+		entry->bsoftlimit = dquot->dq_dqb.dqb_bsoftlimit;
+		entry->ihardlimit = dquot->dq_dqb.dqb_ihardlimit;
+		entry->isoftlimit = dquot->dq_dqb.dqb_isoftlimit;
+		spin_unlock(&dquot->dq_dqb_lock);
+	}
+
+	clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+	up_write(&dqopt->dqio_sem);
+
+out_dqlock:
+	mutex_unlock(&dquot->dq_lock);
+	return 0;
+}
+
+/*
+ * dquot_operations.mark_dirty hook. Does nothing with @dquot and always
+ * returns 0.
+ */
+static int shmem_mark_dquot_dirty(struct dquot *dquot)
+{
+ return 0;
+}
+
+/*
+ * dquot_operations.write_info hook. @sb and @type are ignored; always
+ * returns 0.
+ */
+static int shmem_dquot_write_info(struct super_block *sb, int type)
+{
+ return 0;
+}
+
+/* Format-level hooks: set up and tear down the per-type rb tree state */
+static const struct quota_format_ops shmem_format_ops = {
+ .check_quota_file = shmem_check_quota_file,
+ .read_file_info = shmem_read_file_info,
+ .write_file_info = shmem_write_file_info,
+ .free_file_info = shmem_free_file_info,
+};
+
+/* Quota format descriptor that ties QFMT_SHMEM to the hooks above */
+struct quota_format_type shmem_quota_format = {
+ .qf_fmt_id = QFMT_SHMEM,
+ .qf_ops = &shmem_format_ops,
+ .qf_owner = THIS_MODULE
+};
+
+/*
+ * Per-dquot hooks: acquire/release move limits between a dquot and the rb
+ * tree, allocation and destruction use the generic dquot helpers, and the
+ * write-back hooks are the no-op stubs defined in this file.
+ */
+const struct dquot_operations shmem_quota_operations = {
+ .acquire_dquot = shmem_acquire_dquot,
+ .release_dquot = shmem_release_dquot,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
+ .write_info = shmem_dquot_write_info,
+ .mark_dirty = shmem_mark_dquot_dirty,
+ .get_next_id = shmem_get_next_id,
+};
+#endif /* CONFIG_TMPFS_QUOTA */
diff --git a/mm/slab.c b/mm/slab.c
index 88194391d553..9ad3d0f2d1a5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1670,7 +1670,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
if (freelist_size > KMALLOC_MAX_CACHE_SIZE) {
freelist_cache_size = PAGE_SIZE << get_order(freelist_size);
} else {
- freelist_cache = kmalloc_slab(freelist_size, 0u);
+ freelist_cache = kmalloc_slab(freelist_size, 0u, _RET_IP_);
if (!freelist_cache)
continue;
freelist_cache_size = freelist_cache->size;
diff --git a/mm/slab.h b/mm/slab.h
index 9c0e09d0f81f..799a315695c6 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -282,7 +282,7 @@ void setup_kmalloc_cache_index_table(void);
void create_kmalloc_caches(slab_flags_t);
/* Find the kmalloc slab corresponding for a certain size */
-struct kmem_cache *kmalloc_slab(size_t, gfp_t);
+struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller);
void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
int node, size_t orig_size,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d1555ea2981a..cd71f9581e67 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -678,6 +678,11 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init =
{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ };
EXPORT_SYMBOL(kmalloc_caches);
+#ifdef CONFIG_RANDOM_KMALLOC_CACHES
+unsigned long random_kmalloc_seed __ro_after_init;
+EXPORT_SYMBOL(random_kmalloc_seed);
+#endif
+
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -720,7 +725,7 @@ static inline unsigned int size_index_elem(unsigned int bytes)
* Find the kmem_cache structure that serves a given size of
* allocation
*/
-struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
+struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller)
{
unsigned int index;
@@ -735,7 +740,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
index = fls(size - 1);
}
- return kmalloc_caches[kmalloc_type(flags)][index];
+ return kmalloc_caches[kmalloc_type(flags, caller)][index];
}
size_t kmalloc_size_roundup(size_t size)
@@ -752,8 +757,11 @@ size_t kmalloc_size_roundup(size_t size)
if (size > KMALLOC_MAX_CACHE_SIZE)
return PAGE_SIZE << get_order(size);
- /* The flags don't matter since size_index is common to all. */
- c = kmalloc_slab(size, GFP_KERNEL);
+ /*
+ * The flags don't matter since size_index is common to all.
+ * Neither does the caller for just getting ->object_size.
+ */
+ c = kmalloc_slab(size, GFP_KERNEL, 0);
return c ? c->object_size : 0;
}
EXPORT_SYMBOL(kmalloc_size_roundup);
@@ -776,12 +784,35 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
#define KMALLOC_RCL_NAME(sz)
#endif
+#ifdef CONFIG_RANDOM_KMALLOC_CACHES
+#define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
+#define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
+#define KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 1] = "kmalloc-rnd-01-" #sz,
+#define KMA_RAND_2(sz) KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 2] = "kmalloc-rnd-02-" #sz,
+#define KMA_RAND_3(sz) KMA_RAND_2(sz) .name[KMALLOC_RANDOM_START + 3] = "kmalloc-rnd-03-" #sz,
+#define KMA_RAND_4(sz) KMA_RAND_3(sz) .name[KMALLOC_RANDOM_START + 4] = "kmalloc-rnd-04-" #sz,
+#define KMA_RAND_5(sz) KMA_RAND_4(sz) .name[KMALLOC_RANDOM_START + 5] = "kmalloc-rnd-05-" #sz,
+#define KMA_RAND_6(sz) KMA_RAND_5(sz) .name[KMALLOC_RANDOM_START + 6] = "kmalloc-rnd-06-" #sz,
+#define KMA_RAND_7(sz) KMA_RAND_6(sz) .name[KMALLOC_RANDOM_START + 7] = "kmalloc-rnd-07-" #sz,
+#define KMA_RAND_8(sz) KMA_RAND_7(sz) .name[KMALLOC_RANDOM_START + 8] = "kmalloc-rnd-08-" #sz,
+#define KMA_RAND_9(sz) KMA_RAND_8(sz) .name[KMALLOC_RANDOM_START + 9] = "kmalloc-rnd-09-" #sz,
+#define KMA_RAND_10(sz) KMA_RAND_9(sz) .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz,
+#define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz,
+#define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz,
+#define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz,
+#define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz,
+#define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz,
+#else // CONFIG_RANDOM_KMALLOC_CACHES
+#define KMALLOC_RANDOM_NAME(N, sz)
+#endif
+
#define INIT_KMALLOC_INFO(__size, __short_size) \
{ \
.name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
KMALLOC_RCL_NAME(__short_size) \
KMALLOC_CGROUP_NAME(__short_size) \
KMALLOC_DMA_NAME(__short_size) \
+ KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size) \
.size = __size, \
}
@@ -864,10 +895,9 @@ void __init setup_kmalloc_cache_index_table(void)
static unsigned int __kmalloc_minalign(void)
{
-#ifdef CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC
- if (io_tlb_default_mem.nslabs)
+ if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
+ is_swiotlb_allocated())
return ARCH_KMALLOC_MINALIGN;
-#endif
return dma_get_cache_alignment();
}
@@ -890,6 +920,11 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
flags |= SLAB_CACHE_DMA;
}
+#ifdef CONFIG_RANDOM_KMALLOC_CACHES
+ if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END)
+ flags |= SLAB_NO_MERGE;
+#endif
+
/*
* If CONFIG_MEMCG_KMEM is enabled, disable cache merging for
* KMALLOC_NORMAL caches.
@@ -941,6 +976,9 @@ void __init create_kmalloc_caches(slab_flags_t flags)
new_kmalloc_cache(2, type, flags);
}
}
+#ifdef CONFIG_RANDOM_KMALLOC_CACHES
+ random_kmalloc_seed = get_random_u64();
+#endif
/* Kmalloc array is now usable */
slab_state = UP;
@@ -976,7 +1014,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller
return ret;
}
- s = kmalloc_slab(size, flags);
+ s = kmalloc_slab(size, flags, caller);
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
diff --git a/mm/slub.c b/mm/slub.c
index e3b5d5c0eb3a..f7940048138c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -361,43 +361,51 @@ static struct workqueue_struct *flushwq;
*******************************************************************/
/*
+ * freeptr_t represents a SLUB freelist pointer, which might be encoded
+ * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
+ */
+typedef struct { unsigned long v; } freeptr_t;
+
+/*
* Returns freelist pointer (ptr). With hardening, this is obfuscated
* with an XOR of the address where the pointer is held and a per-cache
* random number.
*/
-static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
- unsigned long ptr_addr)
+static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s,
+ void *ptr, unsigned long ptr_addr)
{
+ unsigned long encoded;
+
#ifdef CONFIG_SLAB_FREELIST_HARDENED
- /*
- * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
- * Normally, this doesn't cause any issues, as both set_freepointer()
- * and get_freepointer() are called with a pointer with the same tag.
- * However, there are some issues with CONFIG_SLUB_DEBUG code. For
- * example, when __free_slub() iterates over objects in a cache, it
- * passes untagged pointers to check_object(). check_object() in turns
- * calls get_freepointer() with an untagged pointer, which causes the
- * freepointer to be restored incorrectly.
- */
- return (void *)((unsigned long)ptr ^ s->random ^
- swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
+ encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr);
#else
- return ptr;
+ encoded = (unsigned long)ptr;
#endif
+ return (freeptr_t){.v = encoded};
}
-/* Returns the freelist pointer recorded at location ptr_addr. */
-static inline void *freelist_dereference(const struct kmem_cache *s,
- void *ptr_addr)
+static inline void *freelist_ptr_decode(const struct kmem_cache *s,
+ freeptr_t ptr, unsigned long ptr_addr)
{
- return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
- (unsigned long)ptr_addr);
+ void *decoded;
+
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr));
+#else
+ decoded = (void *)ptr.v;
+#endif
+ return decoded;
}
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
+ unsigned long ptr_addr;
+ freeptr_t p;
+
object = kasan_reset_tag(object);
- return freelist_dereference(s, object + s->offset);
+ ptr_addr = (unsigned long)object + s->offset;
+ p = *(freeptr_t *)(ptr_addr);
+ return freelist_ptr_decode(s, p, ptr_addr);
}
#ifndef CONFIG_SLUB_TINY
@@ -421,15 +429,15 @@ __no_kmsan_checks
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
unsigned long freepointer_addr;
- void *p;
+ freeptr_t p;
if (!debug_pagealloc_enabled_static())
return get_freepointer(s, object);
object = kasan_reset_tag(object);
freepointer_addr = (unsigned long)object + s->offset;
- copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
- return freelist_ptr(s, p, freepointer_addr);
+ copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
+ return freelist_ptr_decode(s, p, freepointer_addr);
}
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
@@ -441,7 +449,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
#endif
freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
- *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
+ *(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr);
}
/* Loop over all objects in a slab */
diff --git a/mm/truncate.c b/mm/truncate.c
index bd4fafd67f95..8e3aa9e8618e 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -655,11 +655,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
folio_lock(folio);
- VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
- if (folio->mapping != mapping) {
+ if (unlikely(folio->mapping != mapping)) {
folio_unlock(folio);
continue;
}
+ VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
folio_wait_writeback(folio);
if (folio_mapped(folio))
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0fc69efa4f1f..96d9eae5c7cc 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -86,7 +86,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
if (page_in_cache && !vm_shared)
writable = false;
if (writable)
- _dst_pte = pte_mkwrite(_dst_pte);
+ _dst_pte = pte_mkwrite(_dst_pte, dst_vma);
if (flags & MFILL_ATOMIC_WP)
_dst_pte = pte_mkuffd_wp(_dst_pte);
diff --git a/mm/util.c b/mm/util.c
index f31e2ca62cfa..f08b655da917 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -396,7 +396,10 @@ static int mmap_is_legacy(struct rlimit *rlim_stack)
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (rlim_stack->rlim_cur == RLIM_INFINITY)
+ /* On parisc the stack always grows up - so a unlimited stack should
+ * not be an indicator to use the legacy memory layout. */
+ if (rlim_stack->rlim_cur == RLIM_INFINITY &&
+ !IS_ENABLED(CONFIG_STACK_GROWSUP))
return 1;
return sysctl_legacy_va_layout;
@@ -540,7 +543,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
+ ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
&uf);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index b52644771cc4..22c6689d9302 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -244,6 +244,14 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
if (mem_cgroup_disabled())
return;
+ /*
+ * The in-kernel users only care about the reclaim efficiency
+ * for this @memcg rather than the whole subtree, and there
+ * isn't and won't be any in-kernel user in a legacy cgroup.
+ */
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !tree)
+ return;
+
vmpr = memcg_to_vmpressure(memcg);
/*