From 96c84dde362a3e4ff67a12eaac2ea6e88e963c07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:35:30 -0700 Subject: mm: don't include in Not required at all, and having this causes a huge kernel rebuild as soon as something in dax.h changes. Link: https://lkml.kernel.org/r/20210921082253.1859794-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Naoya Horiguchi Reviewed-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/memory-failure.c') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index bdbbb32211a5..8376cfe0efce 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include -- cgit From ba9eb3cef9e699e259f9ceefdbcd3ee83d3529e2 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Fri, 5 Nov 2021 13:41:01 -0700 Subject: mm/memory_failure: constify static mm_walk_ops The only usage of hwp_walk_ops is to pass its address to walk_page_range() which takes a pointer to const mm_walk_ops as argument. Make it const to allow the compiler to put it in read-only memory. Link: https://lkml.kernel.org/r/20211014075042.17174-3-rikard.falkeborn@gmail.com Signed-off-by: Rikard Falkeborn Acked-by: Naoya Horiguchi Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory-failure.c') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8376cfe0efce..54a5d28ea601 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -674,7 +674,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, #define hwpoison_hugetlb_range NULL #endif -static struct mm_walk_ops hwp_walk_ops = { +static const struct mm_walk_ops hwp_walk_ops = { .pmd_entry = hwpoison_pte_range, .hugetlb_entry = hwpoison_hugetlb_range, }; -- cgit From dd0f230a0a80ff396c7ce587f16429f2a8131344 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 5 Nov 2021 13:41:07 -0700 Subject: mm: hwpoison: refactor refcount check handling Memory failure will report failure if the page still has extra pinned refcount other than from hwpoison after the handler is done. Actually the check is not necessary for all handlers, so move the check into specific handlers. This would make the following keeping shmem page in page cache patch easier. There may be expected extra pin for some cases, for example, when the page is dirty and in swapcache. Link: https://lkml.kernel.org/r/20211020210755.23964-5-shy828301@gmail.com Signed-off-by: Yang Shi Signed-off-by: Naoya Horiguchi Suggested-by: Naoya Horiguchi Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 93 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 29 deletions(-) (limited to 'mm/memory-failure.c') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 54a5d28ea601..bbd433e96790 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -807,12 +807,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn, return ret; } +struct page_state { + unsigned long mask; + unsigned long res; + enum mf_action_page_type type; + + /* Callback ->action() has to unlock the relevant page inside it. */ + int (*action)(struct page_state *ps, struct page *p); +}; + +/* + * Return true if page is still referenced by others, otherwise return + * false. + * + * The extra_pins is true when one extra refcount is expected. + */ +static bool has_extra_refcount(struct page_state *ps, struct page *p, + bool extra_pins) +{ + int count = page_count(p) - 1; + + if (extra_pins) + count -= 1; + + if (count > 0) { + pr_err("Memory failure: %#lx: %s still referenced by %d users\n", + page_to_pfn(p), action_page_types[ps->type], count); + return true; + } + + return false; +} + /* * Error hit kernel page. * Do nothing, try to be lucky and not touch this instead. For a few cases we * could be more sophisticated. */ -static int me_kernel(struct page *p, unsigned long pfn) +static int me_kernel(struct page_state *ps, struct page *p) { unlock_page(p); return MF_IGNORED; @@ -821,9 +853,9 @@ static int me_kernel(struct page *p, unsigned long pfn) /* * Page in unknown state. Do nothing. */ -static int me_unknown(struct page *p, unsigned long pfn) +static int me_unknown(struct page_state *ps, struct page *p) { - pr_err("Memory failure: %#lx: Unknown page state\n", pfn); + pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p)); unlock_page(p); return MF_FAILED; } @@ -831,7 +863,7 @@ static int me_unknown(struct page *p, unsigned long pfn) /* * Clean (or cleaned) page cache page. */ -static int me_pagecache_clean(struct page *p, unsigned long pfn) +static int me_pagecache_clean(struct page_state *ps, struct page *p) { int ret; struct address_space *mapping; @@ -868,9 +900,13 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_rwsem or not for this? Right now we don't. */ - ret = truncate_error_page(p, pfn, mapping); + ret = truncate_error_page(p, page_to_pfn(p), mapping); out: unlock_page(p); + + if (has_extra_refcount(ps, p, false)) + ret = MF_FAILED; + return ret; } @@ -879,7 +915,7 @@ out: * Issues: when the error hit a hole page the error is not properly * propagated. */ -static int me_pagecache_dirty(struct page *p, unsigned long pfn) +static int me_pagecache_dirty(struct page_state *ps, struct page *p) { struct address_space *mapping = page_mapping(p); @@ -923,7 +959,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) mapping_set_error(mapping, -EIO); } - return me_pagecache_clean(p, pfn); + return me_pagecache_clean(ps, p); } /* @@ -945,9 +981,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) * Clean swap cache pages can be directly isolated. A later page fault will * bring in the known good data from disk. */ -static int me_swapcache_dirty(struct page *p, unsigned long pfn) +static int me_swapcache_dirty(struct page_state *ps, struct page *p) { int ret; + bool extra_pins = false; ClearPageDirty(p); /* Trigger EIO in shmem: */ @@ -955,10 +992,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; unlock_page(p); + + if (ret == MF_DELAYED) + extra_pins = true; + + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + return ret; } -static int me_swapcache_clean(struct page *p, unsigned long pfn) +static int me_swapcache_clean(struct page_state *ps, struct page *p) { int ret; @@ -966,6 +1010,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; unlock_page(p); + + if (has_extra_refcount(ps, p, false)) + ret = MF_FAILED; + return ret; } @@ -975,7 +1023,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) * - Error on hugepage is contained in hugepage unit (not in raw page unit.) * To narrow down kill region to one page, we need to break up pmd. */ -static int me_huge_page(struct page *p, unsigned long pfn) +static int me_huge_page(struct page_state *ps, struct page *p) { int res; struct page *hpage = compound_head(p); @@ -986,7 +1034,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) mapping = page_mapping(hpage); if (mapping) { - res = truncate_error_page(hpage, pfn, mapping); + res = truncate_error_page(hpage, page_to_pfn(p), mapping); unlock_page(hpage); } else { res = MF_FAILED; @@ -1004,6 +1052,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) } } + if (has_extra_refcount(ps, p, false)) + res = MF_FAILED; + return res; } @@ -1029,14 +1080,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) -static struct page_state { - unsigned long mask; - unsigned long res; - enum mf_action_page_type type; - - /* Callback ->action() has to unlock the relevant page inside it. */ - int (*action)(struct page *p, unsigned long pfn); -} error_states[] = { +static struct page_state error_states[] = { { reserved, reserved, MF_MSG_KERNEL, me_kernel }, /* * free pages are specially detected outside this table: @@ -1096,19 +1140,10 @@ static int page_action(struct page_state *ps, struct page *p, unsigned long pfn) { int result; - int count; /* page p should be unlocked after returning from ps->action(). */ - result = ps->action(p, pfn); + result = ps->action(ps, p); - count = page_count(p) - 1; - if (ps->action == me_swapcache_dirty && result == MF_DELAYED) - count--; - if (count > 0) { - pr_err("Memory failure: %#lx: %s still referenced by %d users\n", - pfn, action_page_types[ps->type], count); - result = MF_FAILED; - } action_result(pfn, ps->type, result); /* Could do more checks here if page looks ok */ -- cgit From b9d02f1bdd98f38e6e5ecacc9786a8f58f3f8b2c Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 5 Nov 2021 13:41:10 -0700 Subject: mm: shmem: don't truncate page if memory failure happens The current behavior of memory failure is to truncate the page cache regardless of dirty or clean. If the page is dirty the later access will get the obsolete data from disk without any notification to the users. This may cause silent data loss. It is even worse for shmem since shmem is in-memory filesystem, truncating page cache means discarding data blocks. The later read would return all zero. The right approach is to keep the corrupted page in page cache, any later access would return error for syscalls or SIGBUS for page fault, until the file is truncated, hole punched or removed. The regular storage backed filesystems would be more complicated so this patch is focused on shmem. This also unblock the support for soft offlining shmem THP. [arnd@arndb.de: fix uninitialized variable use in me_pagecache_clean()] Link: https://lkml.kernel.org/r/20211022064748.4173718-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20211020210755.23964-6-shy828301@gmail.com Signed-off-by: Yang Shi Signed-off-by: Arnd Bergmann Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'mm/memory-failure.c') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index bbd433e96790..952a41d568c4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -867,6 +868,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) { int ret; struct address_space *mapping; + bool extra_pins; delete_from_lru_cache(p); @@ -895,18 +897,24 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) goto out; } + /* + * The shmem page is kept in page cache instead of truncating + * so is expected to have an extra refcount after error-handling. + */ + extra_pins = shmem_mapping(mapping); + /* * Truncation is a bit tricky. Enable it per file system for now. * * Open: to take i_rwsem or not for this? Right now we don't. */ ret = truncate_error_page(p, page_to_pfn(p), mapping); + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + out: unlock_page(p); - if (has_extra_refcount(ps, p, false)) - ret = MF_FAILED; - return ret; } -- cgit From 4966455d9100236fd6dd72b0cd00818435fdb25d Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 5 Nov 2021 13:41:14 -0700 Subject: mm: hwpoison: handle non-anonymous THP correctly Currently hwpoison doesn't handle non-anonymous THP, but since v4.8 THP support for tmpfs and read-only file cache has been added. They could be offlined by split THP, just like anonymous THP. Link: https://lkml.kernel.org/r/20211020210755.23964-7-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Naoya Horiguchi Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm/memory-failure.c') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 952a41d568c4..f38b7b42a508 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1444,14 +1444,11 @@ static int identify_page_state(unsigned long pfn, struct page *p, static int try_to_split_thp_page(struct page *page, const char *msg) { lock_page(page); - if (!PageAnon(page) || unlikely(split_huge_page(page))) { + if (unlikely(split_huge_page(page))) { unsigned long pfn = page_to_pfn(page); unlock_page(page); - if (!PageAnon(page)) - pr_info("%s: %#lx: non anonymous thp\n", msg, pfn); - else - pr_info("%s: %#lx: thp split failed\n", msg, pfn); + pr_info("%s: %#lx: thp split failed\n", msg, pfn); put_page(page); return -EBUSY; } -- cgit