summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c3
-rw-r--r--mm/filemap.c181
-rw-r--r--mm/kasan/hw_tags.c8
-rw-r--r--mm/kasan/kasan.h22
-rw-r--r--mm/kfence/kfence_test.c13
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memblock.c10
-rw-r--r--mm/memcontrol.c26
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/mmap.c6
-rw-r--r--mm/nommu.c3
-rw-r--r--mm/page-writeback.c2
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c41
-rw-r--r--mm/shmem.c20
-rw-r--r--mm/swap_slots.c4
-rw-r--r--mm/truncate.c9
-rw-r--r--mm/util.c19
-rw-r--r--mm/vmstat.c12
19 files changed, 254 insertions, 131 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6122c78ce914..4a9d4e27d0d9 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -818,6 +818,7 @@ struct backing_dev_info *bdi_alloc(int node_id)
bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
bdi->ra_pages = VM_READAHEAD_PAGES;
bdi->io_pages = VM_READAHEAD_PAGES;
+ timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
return bdi;
}
EXPORT_SYMBOL(bdi_alloc);
@@ -939,6 +940,8 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
+ del_timer_sync(&bdi->laptop_mode_wb_timer);
+
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
diff --git a/mm/filemap.c b/mm/filemap.c
index 4926f16ec52d..dae481293b5d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -76,8 +76,9 @@
* ->swap_lock (exclusive_swap_page, others)
* ->i_pages lock
*
- * ->i_mutex
- * ->i_mmap_rwsem (truncate->unmap_mapping_range)
+ * ->i_rwsem
+ * ->invalidate_lock (acquired by fs in truncate path)
+ * ->i_mmap_rwsem (truncate->unmap_mapping_range)
*
* ->mmap_lock
* ->i_mmap_rwsem
@@ -85,9 +86,10 @@
* ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_lock
- * ->lock_page (access_process_vm)
+ * ->invalidate_lock (filemap_fault)
+ * ->lock_page (filemap_fault, access_process_vm)
*
- * ->i_mutex (generic_perform_write)
+ * ->i_rwsem (generic_perform_write)
* ->mmap_lock (fault_in_pages_readable->do_page_fault)
*
* bdi->wb.list_lock
@@ -376,6 +378,32 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
}
/**
+ * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
+ * @mapping: address space structure to write
+ * @wbc: the writeback_control controlling the writeout
+ *
+ * Call writepages on the mapping using the provided wbc to control the
+ * writeout.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_fdatawrite_wbc(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ int ret;
+
+ if (!mapping_can_writeback(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ wbc_attach_fdatawrite_inode(wbc, mapping->host);
+ ret = do_writepages(mapping, wbc);
+ wbc_detach_inode(wbc);
+ return ret;
+}
+EXPORT_SYMBOL(filemap_fdatawrite_wbc);
+
+/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
* @start: offset in bytes where the range starts
@@ -395,7 +423,6 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode)
{
- int ret;
struct writeback_control wbc = {
.sync_mode = sync_mode,
.nr_to_write = LONG_MAX,
@@ -403,14 +430,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_can_writeback(mapping) ||
- !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
-
- wbc_attach_fdatawrite_inode(&wbc, mapping->host);
- ret = do_writepages(mapping, &wbc);
- wbc_detach_inode(&wbc);
- return ret;
+ return filemap_fdatawrite_wbc(mapping, &wbc);
}
static inline int __filemap_fdatawrite(struct address_space *mapping,
@@ -1005,6 +1025,44 @@ EXPORT_SYMBOL(__page_cache_alloc);
#endif
/*
+ * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
+ *
+ * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
+ *
+ * @mapping1: the first mapping to lock
+ * @mapping2: the second mapping to lock
+ */
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+ struct address_space *mapping2)
+{
+ if (mapping1 > mapping2)
+ swap(mapping1, mapping2);
+ if (mapping1)
+ down_write(&mapping1->invalidate_lock);
+ if (mapping2 && mapping1 != mapping2)
+ down_write_nested(&mapping2->invalidate_lock, 1);
+}
+EXPORT_SYMBOL(filemap_invalidate_lock_two);
+
+/*
+ * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
+ *
+ * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
+ *
+ * @mapping1: the first mapping to unlock
+ * @mapping2: the second mapping to unlock
+ */
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ struct address_space *mapping2)
+{
+ if (mapping1)
+ up_write(&mapping1->invalidate_lock);
+ if (mapping2 && mapping1 != mapping2)
+ up_write(&mapping2->invalidate_lock);
+}
+EXPORT_SYMBOL(filemap_invalidate_unlock_two);
+
+/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
* waitqueues where the bucket discipline is to maintain all
@@ -2365,20 +2423,30 @@ static int filemap_update_page(struct kiocb *iocb,
{
int error;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!filemap_invalidate_trylock_shared(mapping))
+ return -EAGAIN;
+ } else {
+ filemap_invalidate_lock_shared(mapping);
+ }
+
if (!trylock_page(page)) {
+ error = -EAGAIN;
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
- return -EAGAIN;
+ goto unlock_mapping;
if (!(iocb->ki_flags & IOCB_WAITQ)) {
+ filemap_invalidate_unlock_shared(mapping);
put_and_wait_on_page_locked(page, TASK_KILLABLE);
return AOP_TRUNCATED_PAGE;
}
error = __lock_page_async(page, iocb->ki_waitq);
if (error)
- return error;
+ goto unlock_mapping;
}
+ error = AOP_TRUNCATED_PAGE;
if (!page->mapping)
- goto truncated;
+ goto unlock;
error = 0;
if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
@@ -2389,15 +2457,13 @@ static int filemap_update_page(struct kiocb *iocb,
goto unlock;
error = filemap_read_page(iocb->ki_filp, mapping, page);
- if (error == AOP_TRUNCATED_PAGE)
- put_page(page);
- return error;
-truncated:
- unlock_page(page);
- put_page(page);
- return AOP_TRUNCATED_PAGE;
+ goto unlock_mapping;
unlock:
unlock_page(page);
+unlock_mapping:
+ filemap_invalidate_unlock_shared(mapping);
+ if (error == AOP_TRUNCATED_PAGE)
+ put_page(page);
return error;
}
@@ -2412,6 +2478,19 @@ static int filemap_create_page(struct file *file,
if (!page)
return -ENOMEM;
+ /*
+ * Protect against truncate / hole punch. Grabbing invalidate_lock here
+ * assures we cannot instantiate and bring uptodate new pagecache pages
+ * after evicting page cache during truncate and before actually
+ * freeing blocks. Note that we could release invalidate_lock after
+ * inserting the page into page cache as the locked page would then be
+ * enough to synchronize with hole punching. But there are code paths
+ * such as filemap_update_page() filling in partially uptodate pages or
+ * ->readpages() that need to hold invalidate_lock while mapping blocks
+ * for IO so let's hold the lock here as well to keep locking rules
+ * simple.
+ */
+ filemap_invalidate_lock_shared(mapping);
error = add_to_page_cache_lru(page, mapping, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
@@ -2423,9 +2502,11 @@ static int filemap_create_page(struct file *file,
if (error)
goto error;
+ filemap_invalidate_unlock_shared(mapping);
pagevec_add(pvec, page);
return 0;
error:
+ filemap_invalidate_unlock_shared(mapping);
put_page(page);
return error;
}
@@ -2964,6 +3045,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
pgoff_t max_off;
struct page *page;
vm_fault_t ret = 0;
+ bool mapping_locked = false;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off))
@@ -2973,25 +3055,39 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
* Do we have something in the page cache already?
*/
page = find_get_page(mapping, offset);
- if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+ if (likely(page)) {
/*
- * We found the page, so try async readahead before
- * waiting for the lock.
+ * We found the page, so try async readahead before waiting for
+ * the lock.
*/
- fpin = do_async_mmap_readahead(vmf, page);
- } else if (!page) {
+ if (!(vmf->flags & FAULT_FLAG_TRIED))
+ fpin = do_async_mmap_readahead(vmf, page);
+ if (unlikely(!PageUptodate(page))) {
+ filemap_invalidate_lock_shared(mapping);
+ mapping_locked = true;
+ }
+ } else {
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
fpin = do_sync_mmap_readahead(vmf);
retry_find:
+ /*
+ * See comment in filemap_create_page() why we need
+ * invalidate_lock
+ */
+ if (!mapping_locked) {
+ filemap_invalidate_lock_shared(mapping);
+ mapping_locked = true;
+ }
page = pagecache_get_page(mapping, offset,
FGP_CREAT|FGP_FOR_MMAP,
vmf->gfp_mask);
if (!page) {
if (fpin)
goto out_retry;
+ filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_OOM;
}
}
@@ -3011,8 +3107,20 @@ retry_find:
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
*/
- if (unlikely(!PageUptodate(page)))
+ if (unlikely(!PageUptodate(page))) {
+ /*
+ * The page was in cache and uptodate and now it is not.
+ * Strange but possible since we didn't hold the page lock all
+ * the time. Let's drop everything get the invalidate lock and
+ * try again.
+ */
+ if (!mapping_locked) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
goto page_not_uptodate;
+ }
/*
* We've made it this far and we had to drop our mmap_lock, now is the
@@ -3023,6 +3131,8 @@ retry_find:
unlock_page(page);
goto out_retry;
}
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
/*
* Found the page and have a reference on it.
@@ -3053,6 +3163,7 @@ page_not_uptodate:
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
+ filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_SIGBUS;
@@ -3064,6 +3175,8 @@ out_retry:
*/
if (page)
put_page(page);
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
if (fpin)
fput(fpin);
return ret | VM_FAULT_RETRY;
@@ -3434,6 +3547,8 @@ out:
*
* If the page does not get brought uptodate, return -EIO.
*
+ * The function expects mapping->invalidate_lock to be already held.
+ *
* Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page(struct address_space *mapping,
@@ -3457,6 +3572,8 @@ EXPORT_SYMBOL(read_cache_page);
*
* If the page does not get brought uptodate, return -EIO.
*
+ * The function expects mapping->invalidate_lock to be already held.
+ *
* Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page_gfp(struct address_space *mapping,
@@ -3701,12 +3818,12 @@ EXPORT_SYMBOL(generic_perform_write);
* modification times and calls proper subroutines depending on whether we
* do direct IO or a standard buffered write.
*
- * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * It expects i_rwsem to be grabbed unless we work on a block device or similar
* object which does not need locking at all.
*
* This function does *not* take care of syncing data in case of O_SYNC write.
* A caller has to handle it. This is mainly due to the fact that we want to
- * avoid syncing under i_mutex.
+ * avoid syncing under i_rwsem.
*
* Return:
* * number of bytes written, even for truncated writes
@@ -3794,7 +3911,7 @@ EXPORT_SYMBOL(__generic_file_write_iter);
*
* This is a wrapper around __generic_file_write_iter() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
- * and acquires i_mutex as needed.
+ * and acquires i_rwsem as needed.
* Return:
* * negative error code if no data has been written at all of
* vfs_fsync_range() failed for a synchronous write
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 51903639e55f..05d1e9460e2e 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -115,8 +115,6 @@ void kasan_init_hw_tags_cpu(void)
if (kasan_arg == KASAN_ARG_OFF)
return;
- hw_init_tags(KASAN_TAG_MAX);
-
/*
* Enable async mode only when explicitly requested through
* the command line.
@@ -207,12 +205,6 @@ void kasan_free_pages(struct page *page, unsigned int order)
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-void kasan_set_tagging_report_once(bool state)
-{
- hw_set_tagging_report_once(state);
-}
-EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once);
-
void kasan_enable_tagging_sync(void)
{
hw_enable_tagging_sync();
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index fa02c88b6948..8bf568a80eb8 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -3,6 +3,7 @@
#define __MM_KASAN_KASAN_H
#include <linux/kasan.h>
+#include <linux/kasan-tags.h>
#include <linux/kfence.h>
#include <linux/stackdepot.h>
@@ -50,16 +51,6 @@ extern bool kasan_flag_async __ro_after_init;
#define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT)
-#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */
-#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */
-#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */
-
-#ifdef CONFIG_KASAN_HW_TAGS
-#define KASAN_TAG_MIN 0xF0 /* minimum value for random tags */
-#else
-#define KASAN_TAG_MIN 0x00 /* minimum value for random tags */
-#endif
-
#ifdef CONFIG_KASAN_GENERIC
#define KASAN_FREE_PAGE 0xFF /* page was freed */
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
@@ -298,12 +289,6 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#ifndef arch_enable_tagging_async
#define arch_enable_tagging_async()
#endif
-#ifndef arch_init_tags
-#define arch_init_tags(max_tag)
-#endif
-#ifndef arch_set_tagging_report_once
-#define arch_set_tagging_report_once(state)
-#endif
#ifndef arch_force_async_tag_fault
#define arch_force_async_tag_fault()
#endif
@@ -319,8 +304,6 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_enable_tagging_sync() arch_enable_tagging_sync()
#define hw_enable_tagging_async() arch_enable_tagging_async()
-#define hw_init_tags(max_tag) arch_init_tags(max_tag)
-#define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state)
#define hw_force_async_tag_fault() arch_force_async_tag_fault()
#define hw_get_random_tag() arch_get_random_tag()
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
@@ -331,19 +314,16 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_enable_tagging_sync()
#define hw_enable_tagging_async()
-#define hw_set_tagging_report_once(state)
#endif /* CONFIG_KASAN_HW_TAGS */
#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-void kasan_set_tagging_report_once(bool state);
void kasan_enable_tagging_sync(void);
void kasan_force_async_fault(void);
#else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
-static inline void kasan_set_tagging_report_once(bool state) { }
static inline void kasan_enable_tagging_sync(void) { }
static inline void kasan_force_async_fault(void) { }
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 942cbc16ad26..eb6307c199ea 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -23,8 +23,15 @@
#include <linux/tracepoint.h>
#include <trace/events/printk.h>
+#include <asm/kfence.h>
+
#include "kfence.h"
+/* May be overridden by <asm/kfence.h>. */
+#ifndef arch_kfence_test_address
+#define arch_kfence_test_address(addr) (addr)
+#endif
+
/* Report as observed from console. */
static struct {
spinlock_t lock;
@@ -82,6 +89,7 @@ static const char *get_access_type(const struct expect_report *r)
/* Check observed report matches information in @r. */
static bool report_matches(const struct expect_report *r)
{
+ unsigned long addr = (unsigned long)r->addr;
bool ret = false;
unsigned long flags;
typeof(observed.lines) expect;
@@ -131,22 +139,25 @@ static bool report_matches(const struct expect_report *r)
switch (r->type) {
case KFENCE_ERROR_OOB:
cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r));
+ addr = arch_kfence_test_address(addr);
break;
case KFENCE_ERROR_UAF:
cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r));
+ addr = arch_kfence_test_address(addr);
break;
case KFENCE_ERROR_CORRUPTION:
cur += scnprintf(cur, end - cur, "Corrupted memory at");
break;
case KFENCE_ERROR_INVALID:
cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r));
+ addr = arch_kfence_test_address(addr);
break;
case KFENCE_ERROR_INVALID_FREE:
cur += scnprintf(cur, end - cur, "Invalid free of");
break;
}
- cur += scnprintf(cur, end - cur, " 0x%p", (void *)r->addr);
+ cur += scnprintf(cur, end - cur, " 0x%p", (void *)addr);
spin_lock_irqsave(&observed.lock, flags);
if (!report_available())
diff --git a/mm/madvise.c b/mm/madvise.c
index 4a15a83031f6..0734db8d53a7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -912,7 +912,7 @@ static long madvise_remove(struct vm_area_struct *vma,
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
/*
- * Filesystem's fallocate may need to take i_mutex. We need to
+ * Filesystem's fallocate may need to take i_rwsem. We need to
* explicitly grab a reference because the vma (and hence the
* vma's reference to the file) can go away as soon as we drop
* mmap_lock.
diff --git a/mm/memblock.c b/mm/memblock.c
index e6b4654f9dfd..0ab5a749bfa6 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -665,6 +665,11 @@ repeat:
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
{
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] nid=%d %pS\n", __func__,
+ &base, &end, nid, (void *)_RET_IP_);
+
return memblock_add_range(&memblock.memory, base, size, nid, 0);
}
@@ -1668,6 +1673,11 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
if (!size)
return;
+ if (memblock.memory.cnt <= 1) {
+ pr_warn("%s: No memory registered yet\n", __func__);
+ return;
+ }
+
ret = memblock_isolate_range(&memblock.memory, base, size,
&start_rgn, &end_rgn);
if (ret)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 896f0f403c52..b762215d73eb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -941,7 +941,7 @@ static __always_inline bool memcg_kmem_bypass(void)
return false;
/* Memcg to charge can't be determined. */
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
return true;
return false;
@@ -7027,14 +7027,14 @@ void mem_cgroup_sk_free(struct sock *sk)
* mem_cgroup_charge_skmem - charge socket memory
* @memcg: memcg to charge
* @nr_pages: number of pages to charge
+ * @gfp_mask: reclaim mode
*
* Charges @nr_pages to @memcg. Returns %true if the charge fit within
- * @memcg's configured limit, %false if the charge had to be forced.
+ * @memcg's configured limit, %false if it doesn't.
*/
-bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
+ gfp_t gfp_mask)
{
- gfp_t gfp_mask = GFP_KERNEL;
-
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
struct page_counter *fail;
@@ -7042,21 +7042,19 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
memcg->tcpmem_pressure = 0;
return true;
}
- page_counter_charge(&memcg->tcpmem, nr_pages);
memcg->tcpmem_pressure = 1;
+ if (gfp_mask & __GFP_NOFAIL) {
+ page_counter_charge(&memcg->tcpmem, nr_pages);
+ return true;
+ }
return false;
}
- /* Don't block in the packet receive path */
- if (in_softirq())
- gfp_mask = GFP_NOWAIT;
-
- mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
-
- if (try_charge(memcg, gfp_mask, nr_pages) == 0)
+ if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
+ mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
return true;
+ }
- try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
return false;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 517789b03961..54879c339024 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -865,7 +865,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Truncation is a bit tricky. Enable it per file system for now.
*
- * Open: to take i_mutex or not for this? Right now we don't.
+ * Open: to take i_rwsem or not for this? Right now we don't.
*/
ret = truncate_error_page(p, pfn, mapping);
out:
diff --git a/mm/mmap.c b/mm/mmap.c
index 52fed230dc21..dce46105e3df 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1518,12 +1518,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
return -EACCES;
- /*
- * Make sure there are no mandatory locks on the file.
- */
- if (locks_verify_locked(file))
- return -EAGAIN;
-
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
diff --git a/mm/nommu.c b/mm/nommu.c
index 3a93d4054810..9d0ad98f838c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -826,9 +826,6 @@ static int validate_mmap_request(struct file *file,
(file->f_mode & FMODE_WRITE))
return -EACCES;
- if (locks_verify_locked(file))
- return -EAGAIN;
-
if (!(capabilities & NOMMU_MAP_DIRECT))
return -ENODEV;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c3b00c6f30ce..4812a17b288c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2018,7 +2018,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
return ret;
}
-#ifdef CONFIG_BLOCK
void laptop_mode_timer_fn(struct timer_list *t)
{
struct backing_dev_info *backing_dev_info =
@@ -2053,7 +2052,6 @@ void laptop_sync_completion(void)
rcu_read_unlock();
}
-#endif
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
diff --git a/mm/readahead.c b/mm/readahead.c
index d589f147f4c2..41b75d76d36e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -192,6 +192,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
*/
unsigned int nofs = memalloc_nofs_save();
+ filemap_invalidate_lock_shared(mapping);
/*
* Preallocate as many pages as we will need.
*/
@@ -236,6 +237,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
* will then handle the error.
*/
read_pages(ractl, &page_pool, false);
+ filemap_invalidate_unlock_shared(mapping);
memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
diff --git a/mm/rmap.c b/mm/rmap.c
index b9eb5c12f3fe..2d29a57d29e8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -20,28 +20,29 @@
/*
* Lock ordering in mm:
*
- * inode->i_mutex (while writing or truncating, not reading or faulting)
+ * inode->i_rwsem (while writing or truncating, not reading or faulting)
* mm->mmap_lock
- * page->flags PG_locked (lock_page) * (see huegtlbfs below)
- * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
- * mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
- * anon_vma->rwsem
- * mm->page_table_lock or pte_lock
- * swap_lock (in swap_duplicate, swap_info_get)
- * mmlist_lock (in mmput, drain_mmlist and others)
- * mapping->private_lock (in __set_page_dirty_buffers)
- * lock_page_memcg move_lock (in __set_page_dirty_buffers)
- * i_pages lock (widely used)
- * lruvec->lru_lock (in lock_page_lruvec_irq)
- * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- * sb_lock (within inode_lock in fs/fs-writeback.c)
- * i_pages lock (widely used, in set_page_dirty,
- * in arch-dependent flush_dcache_mmap_lock,
- * within bdi.wb->list_lock in __sync_single_inode)
+ * mapping->invalidate_lock (in filemap_fault)
+ * page->flags PG_locked (lock_page) * (see hugetlbfs below)
+ * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ * mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * anon_vma->rwsem
+ * mm->page_table_lock or pte_lock
+ * swap_lock (in swap_duplicate, swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
+ * mapping->private_lock (in __set_page_dirty_buffers)
+ * lock_page_memcg move_lock (in __set_page_dirty_buffers)
+ * i_pages lock (widely used)
+ * lruvec->lru_lock (in lock_page_lruvec_irq)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ * sb_lock (within inode_lock in fs/fs-writeback.c)
+ * i_pages lock (widely used, in set_page_dirty,
+ * in arch-dependent flush_dcache_mmap_lock,
+ * within bdi.wb->list_lock in __sync_single_inode)
*
- * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
*
diff --git a/mm/shmem.c b/mm/shmem.c
index 21c29f7e3894..88742953532c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -95,7 +95,7 @@ static struct vfsmount *shm_mnt;
/*
* shmem_fallocate communicates with shmem_fault or shmem_writepage via
- * inode->i_private (with i_mutex making sure that it has only one user at
+ * inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
struct shmem_falloc {
@@ -799,7 +799,7 @@ static int shmem_free_swap(struct address_space *mapping,
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given offsets are swapped out.
*
- * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
@@ -831,7 +831,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given vma is swapped out.
*
- * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
@@ -1095,7 +1095,7 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
return -EPERM;
@@ -2054,7 +2054,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
/*
* Trinity finds that probing a hole which tmpfs is punching can
* prevent the hole-punch from ever completing: which in turn
- * locks writers out with its hold on i_mutex. So refrain from
+ * locks writers out with its hold on i_rwsem. So refrain from
* faulting pages into the hole while it's being punched. Although
* shmem_undo_range() does remove the additions, it may be unable to
* keep up, as each new page needs its own unmap_mapping_range() call,
@@ -2065,7 +2065,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
* we just need to make racing faults a rare case.
*
* The implementation below would be much simpler if we just used a
- * standard mutex or completion: but we cannot take i_mutex in fault,
+ * standard mutex or completion: but we cannot take i_rwsem in fault,
* and bloating every shmem inode for this unlikely case would be sad.
*/
if (unlikely(inode->i_private)) {
@@ -2457,7 +2457,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_SHIFT;
- /* i_mutex is held by caller */
+ /* i_rwsem is held by caller */
if (unlikely(info->seals & (F_SEAL_GROW |
F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
@@ -2557,7 +2557,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
/*
* We must evaluate after, since reads (unlike writes)
- * are called without i_mutex protection against truncate
+ * are called without i_rwsem protection against truncate
*/
nr = PAGE_SIZE;
i_size = i_size_read(inode);
@@ -2627,7 +2627,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
return -ENXIO;
inode_lock(inode);
- /* We're holding i_mutex so we can access i_size directly */
+ /* We're holding i_rwsem so we can access i_size directly */
offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
if (offset >= 0)
offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
@@ -2656,7 +2656,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
error = -EPERM;
goto out;
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index a66f3e0ec973..16f706c55d92 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -70,9 +70,9 @@ void disable_swap_slots_cache_lock(void)
swap_slot_cache_enabled = false;
if (swap_slot_cache_initialized) {
/* serialize with cpu hotplug operations */
- get_online_cpus();
+ cpus_read_lock();
__drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
- put_online_cpus();
+ cpus_read_unlock();
}
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 787b35f2cdc1..714eaf19821d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -412,7 +412,8 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
*
- * Called under (and serialised by) inode->i_mutex.
+ * Called under (and serialised by) inode->i_rwsem and
+ * mapping->invalidate_lock.
*
* Note: When this function returns, there can be a page in the process of
* deletion (inside __delete_from_page_cache()) in the specified range. Thus
@@ -429,7 +430,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
* truncate_inode_pages_final - truncate *all* pages before inode dies
* @mapping: mapping to truncate
*
- * Called under (and serialized by) inode->i_mutex.
+ * Called under (and serialized by) inode->i_rwsem.
*
* Filesystems have to use this in the .evict_inode path to inform the
* VM that this is the final truncate and the inode is going away.
@@ -746,7 +747,7 @@ EXPORT_SYMBOL(truncate_pagecache);
* setattr function when ATTR_SIZE is passed in.
*
* Must be called with a lock serializing truncates and writes (generally
- * i_mutex but e.g. xfs uses a different lock) and before all filesystem
+ * i_rwsem but e.g. xfs uses a different lock) and before all filesystem
* specific block truncation has been performed.
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
@@ -775,7 +776,7 @@ EXPORT_SYMBOL(truncate_setsize);
*
* The function must be called after i_size is updated so that page fault
* coming after we unlock the page will already see the new i_size.
- * The function must be called while we still hold i_mutex - this not only
+ * The function must be called while we still hold i_rwsem - this not only
* makes sure i_size is stable but also that userspace cannot observe new
* i_size value before we are prepared to store mmap writes at new inode size.
*/
diff --git a/mm/util.c b/mm/util.c
index 9043d03750a7..499b6b5767ed 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -593,6 +593,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (ret || size <= PAGE_SIZE)
return ret;
+ /* Don't even allow crazy sizes */
+ if (WARN_ON_ONCE(size > INT_MAX))
+ return NULL;
+
return __vmalloc_node(size, 1, flags, node,
__builtin_return_address(0));
}
@@ -635,6 +639,21 @@ void kvfree_sensitive(const void *addr, size_t len)
}
EXPORT_SYMBOL(kvfree_sensitive);
+void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+{
+ void *newp;
+
+ if (oldsize >= newsize)
+ return (void *)p;
+ newp = kvmalloc(newsize, flags);
+ if (!newp)
+ return NULL;
+ memcpy(newp, p, oldsize);
+ kvfree(p);
+ return newp;
+}
+EXPORT_SYMBOL(kvrealloc);
+
static inline void *__page_rmapping(struct page *page)
{
unsigned long mapping;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 13ff25d0d96a..0885a34197b7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -129,9 +129,9 @@ static void sum_vm_events(unsigned long *ret)
*/
void all_vm_events(unsigned long *ret)
{
- get_online_cpus();
+ cpus_read_lock();
sum_vm_events(ret);
- put_online_cpus();
+ cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(all_vm_events);
@@ -1939,7 +1939,7 @@ static void vmstat_shepherd(struct work_struct *w)
{
int cpu;
- get_online_cpus();
+ cpus_read_lock();
/* Check processors whose vmstat worker threads have been disabled */
for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
@@ -1949,7 +1949,7 @@ static void vmstat_shepherd(struct work_struct *w)
cond_resched();
}
- put_online_cpus();
+ cpus_read_unlock();
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
@@ -2028,9 +2028,9 @@ void __init init_mm_internals(void)
if (ret < 0)
pr_err("vmstat: failed to register 'online' hotplug state\n");
- get_online_cpus();
+ cpus_read_lock();
init_cpu_node_state();
- put_online_cpus();
+ cpus_read_unlock();
start_shepherd_timer();
#endif