summaryrefslogtreecommitdiff
path: root/mm/memory-failure.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r--mm/memory-failure.c330
1 files changed, 221 insertions, 109 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b864c2eff641..9a7a228ad04a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -33,6 +33,9 @@
* are rare we hope to get away with this. This avoids impacting the core
* VM.
*/
+
+#define pr_fmt(fmt) "Memory failure: " fmt
+
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -252,7 +255,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
short addr_lsb = tk->size_shift;
int ret = 0;
- pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+ pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
if ((flags & MF_ACTION_REQUIRED) && (t == current))
@@ -270,7 +273,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
addr_lsb, t); /* synchronous? */
if (ret < 0)
- pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
+ pr_info("Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
return ret;
}
@@ -297,10 +300,9 @@ void shake_page(struct page *p)
}
EXPORT_SYMBOL_GPL(shake_page);
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
- struct vm_area_struct *vma)
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+ unsigned long address)
{
- unsigned long address = vma_address(page, vma);
unsigned long ret = 0;
pgd_t *pgd;
p4d_t *p4d;
@@ -340,23 +342,33 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ *
+ * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
+ * In other cases, such as anonymous and file-backed pages, the address to be
+ * killed can be calculated from @p itself.
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma,
- struct list_head *to_kill)
+ pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
+ struct list_head *to_kill)
{
struct to_kill *tk;
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
- pr_err("Memory failure: Out of memory while machine check handling\n");
+ pr_err("Out of memory while machine check handling\n");
return;
}
tk->addr = page_address_in_vma(p, vma);
- if (is_zone_device_page(p))
- tk->size_shift = dev_pagemap_mapping_shift(p, vma);
- else
+ if (is_zone_device_page(p)) {
+ /*
+ * Since page->mapping is not used for fsdax, we need to
+ * calculate the address based on the vma.
+ */
+ if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+ tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+ tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+ } else
tk->size_shift = page_shift(compound_head(p));
/*
@@ -370,7 +382,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* has a mapping for the page.
*/
if (tk->addr == -EFAULT) {
- pr_info("Memory failure: Unable to find user space address %lx in %s\n",
+ pr_info("Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
} else if (tk->size_shift == 0) {
kfree(tk);
@@ -403,7 +415,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
* signal and then access the memory. Just kill it.
*/
if (fail || tk->addr == -EFAULT) {
- pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
tk->tsk, PIDTYPE_PID);
@@ -416,7 +428,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
* process anyways.
*/
else if (kill_proc(tk, pfn, flags) < 0)
- pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
}
put_task_struct(tk->tsk);
@@ -505,7 +517,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, 0, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -541,12 +553,40 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, 0, vma, to_kill);
+ }
+ }
+ read_unlock(&tasklist_lock);
+ i_mmap_unlock_read(mapping);
+}
+
+#ifdef CONFIG_FS_DAX
+/*
+ * Collect processes when the error hits a fsdax page.
+ *
+ * Walks every process and takes the task returned by
+ * task_early_kill(tsk, true); processes for which it returns NULL are
+ * skipped. For each vma yielded by vma_interval_tree_foreach() over
+ * @mapping->i_mmap for the range [@pgoff, @pgoff] whose vm_mm is that
+ * task's mm, the task is queued on @to_kill through add_to_kill(), with
+ * @pgoff passed along as the fsdax page offset.
+ *
+ * The whole walk runs with the i_mmap read lock of @mapping and
+ * tasklist_lock (read side) held; both are dropped before returning.
+ */
+static void collect_procs_fsdax(struct page *page,
+ struct address_space *mapping, pgoff_t pgoff,
+ struct list_head *to_kill)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+
+ i_mmap_lock_read(mapping);
+ read_lock(&tasklist_lock);
+ for_each_process(tsk) {
+ struct task_struct *t = task_early_kill(tsk, true);
+
+ if (!t)
+ continue;
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, pgoff, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
i_mmap_unlock_read(mapping);
}
+#endif /* CONFIG_FS_DAX */
/*
* Collect the processes who have the corrupted page mapped to kill.
@@ -779,12 +819,10 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
int err = mapping->a_ops->error_remove_page(mapping, p);
if (err != 0) {
- pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
- pfn, err);
+ pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_info("Memory failure: %#lx: failed to release buffers\n",
- pfn);
+ pr_info("%#lx: failed to release buffers\n", pfn);
} else {
ret = MF_RECOVERED;
}
@@ -796,8 +834,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
if (invalidate_inode_page(p))
ret = MF_RECOVERED;
else
- pr_info("Memory failure: %#lx: Failed to invalidate\n",
- pfn);
+ pr_info("%#lx: Failed to invalidate\n", pfn);
}
return ret;
@@ -827,7 +864,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p,
count -= 1;
if (count > 0) {
- pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+ pr_err("%#lx: %s still referenced by %d users\n",
page_to_pfn(p), action_page_types[ps->type], count);
return true;
}
@@ -851,7 +888,7 @@ static int me_kernel(struct page_state *ps, struct page *p)
*/
static int me_unknown(struct page_state *ps, struct page *p)
{
- pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
+ pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
unlock_page(p);
return MF_FAILED;
}
@@ -1007,12 +1044,13 @@ static int me_swapcache_dirty(struct page_state *ps, struct page *p)
static int me_swapcache_clean(struct page_state *ps, struct page *p)
{
+ struct folio *folio = page_folio(p);
int ret;
- delete_from_swap_cache(p);
+ delete_from_swap_cache(folio);
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
- unlock_page(p);
+ folio_unlock(folio);
if (has_extra_refcount(ps, p, false))
ret = MF_FAILED;
@@ -1135,7 +1173,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type,
trace_memory_failure_event(pfn, type, result);
num_poisoned_pages_inc();
- pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
+ pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
}
@@ -1210,8 +1248,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags)
if (head == compound_head(page))
return 1;
- pr_info("Memory failure: %#lx cannot catch tail\n",
- page_to_pfn(page));
+ pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
put_page(head);
}
@@ -1274,7 +1311,7 @@ try_again:
}
out:
if (ret == -EIO)
- pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p));
+ pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
return ret;
}
@@ -1373,13 +1410,12 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
return true;
if (PageKsm(p)) {
- pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
+ pr_err("%#lx: can't handle KSM pages.\n", pfn);
return false;
}
if (PageSwapCache(p)) {
- pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
- pfn);
+ pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
ttu |= TTU_IGNORE_HWPOISON;
}
@@ -1397,7 +1433,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
- pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
+ pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
@@ -1426,14 +1462,14 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
i_mmap_unlock_write(mapping);
} else
- pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
+ pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
} else {
try_to_unmap(folio, ttu);
}
unmap_success = !page_mapped(hpage);
if (!unmap_success)
- pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
+ pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
/*
@@ -1498,6 +1534,134 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
return 0;
}
+/*
+ * Unmap the poisoned range from the user mappings of @mapping and then
+ * signal every task queued on @to_kill.
+ *
+ * @to_kill: list of struct to_kill entries collected for the poisoned page
+ * @pfn:     pfn of the poisoned page, forwarded to kill_procs()
+ * @mapping: address_space passed to unmap_mapping_range()
+ * @index:   page offset of the poisoned page within @mapping
+ * @flags:   MF_* flags; (flags & MF_MUST_KILL) is handed to kill_procs()
+ *           as its forcekill argument
+ *
+ * Nothing is unmapped when no entry on @to_kill has a non-zero size_shift,
+ * but kill_procs() is still called.
+ */
+static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
+		struct address_space *mapping, pgoff_t index, int flags)
+{
+	struct to_kill *tk;
+	unsigned long size = 0;
+
+	list_for_each_entry(tk, to_kill, nd)
+		if (tk->size_shift)
+			size = max(size, 1UL << tk->size_shift);
+
+	if (size) {
+		/*
+		 * Unmap the largest mapping to avoid breaking up device-dax
+		 * mappings which are constant size. The actual size of the
+		 * mapping being torn down is communicated in siginfo, see
+		 * kill_proc()
+		 *
+		 * Both operands are widened to loff_t first: pgoff_t is an
+		 * unsigned long, so on 32-bit kernels shifting @index by
+		 * PAGE_SHIFT would drop the high bits of the file offset, and
+		 * an unsigned long mask would clear the upper half of it.
+		 */
+		loff_t start = ((loff_t)index << PAGE_SHIFT) &
+			       ~((loff_t)size - 1);
+
+		unmap_mapping_range(mapping, start, size, 0);
+	}
+
+	kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
+}
+
+static int mf_generic_kill_procs(unsigned long long pfn, int flags,
+ struct dev_pagemap *pgmap)
+{
+ struct page *page = pfn_to_page(pfn);
+ LIST_HEAD(to_kill);
+ dax_entry_t cookie;
+ int rc = 0;
+
+ /*
+ * Generic memory-failure handling for the page at @pfn: lock its dax
+ * entry, mark the page HWPoison, collect the tasks that map it, then
+ * unmap the range and kill those tasks.
+ *
+ * Returns 0 when the kill path ran, -EBUSY when dax_lock_page()
+ * gives back a zero cookie, -EOPNOTSUPP when hwpoison_filter()
+ * returns non-zero, and -ENXIO when @pgmap->type is
+ * MEMORY_DEVICE_PRIVATE or MEMORY_DEVICE_COHERENT.
+ *
+ * Pages instantiated by device-dax (not filesystem-dax)
+ * may be compound pages.
+ */
+ page = compound_head(page);
+
+ /*
+ * Prevent the inode from being freed while we are interrogating
+ * the address_space, typically this would be handled by
+ * lock_page(), but dax pages do not use the page lock. This
+ * also prevents changes to the mapping of this pfn until
+ * poison signaling is complete.
+ */
+ cookie = dax_lock_page(page);
+ if (!cookie)
+ return -EBUSY;
+
+ if (hwpoison_filter(page)) {
+ rc = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
+ /*
+ * TODO: Handle device pages which may need coordination
+ * with device-side memory.
+ *
+ * Until then these types bail out with -ENXIO and the page is
+ * not marked HWPoison.
+ */
+ rc = -ENXIO;
+ goto unlock;
+ default:
+ break;
+ }
+
+ /*
+ * Use this flag as an indication that the dax page has been
+ * remapped UC to prevent speculative consumption of poison.
+ */
+ SetPageHWPoison(page);
+
+ /*
+ * Unlike System-RAM there is no possibility to swap in a
+ * different physical page at a given virtual address, so all
+ * userspace consumption of ZONE_DEVICE memory necessitates
+ * SIGBUS (i.e. MF_MUST_KILL)
+ */
+ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+ collect_procs(page, &to_kill, true);
+
+ unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
+unlock:
+ dax_unlock_page(page, cookie);
+ return rc;
+}
+
+#ifdef CONFIG_FS_DAX
+/**
+ * mf_dax_kill_procs - Collect and kill processes who are using this file range
+ * @mapping: address_space of the file in use
+ * @index: start pgoff of the range within the file
+ * @count: length of the range, in units of PAGE_SIZE
+ * @mf_flags: memory failure flags
+ *
+ * For every page offset in [@index, @index + @count) the mapping entry is
+ * locked with dax_lock_mapping_entry(). If that yields a page, the page is
+ * marked HWPoison, the tasks mapping it are collected with
+ * collect_procs_fsdax() and then unmapped and killed via unmap_and_kill().
+ * Offsets for which no page is returned are skipped. MF_ACTION_REQUIRED and
+ * MF_MUST_KILL are always OR-ed into @mf_flags.
+ *
+ * Return: 0 once the whole range has been walked, or -EBUSY as soon as a
+ * mapping entry cannot be locked (offsets handled before that point are not
+ * rolled back).
+ */
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+ unsigned long count, int mf_flags)
+{
+ LIST_HEAD(to_kill);
+ dax_entry_t cookie;
+ struct page *page;
+ size_t end = index + count;
+
+ mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+ for (; index < end; index++) {
+ page = NULL;
+ cookie = dax_lock_mapping_entry(mapping, index, &page);
+ if (!cookie)
+ return -EBUSY;
+ if (!page)
+ goto unlock;
+
+ SetPageHWPoison(page);
+
+ collect_procs_fsdax(page, mapping, index, &to_kill);
+ unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
+ index, mf_flags);
+unlock:
+ dax_unlock_mapping_entry(mapping, index, cookie);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
+#endif /* CONFIG_FS_DAX */
+
/*
* Called from hugetlb code with hugetlb_lock held.
*
@@ -1566,7 +1730,7 @@ retry:
*hugetlb = 0;
return 0;
} else if (res == -EHWPOISON) {
- pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn);
+ pr_err("%#lx: already hardware poisoned\n", pfn);
if (flags & MF_ACTION_REQUIRED) {
head = compound_head(p);
res = kill_accessing_process(current, page_to_pfn(head), flags);
@@ -1633,23 +1797,20 @@ out:
unlock_page(head);
return res;
}
+
#else
static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
{
return 0;
}
-#endif
+
+#endif /* CONFIG_HUGETLB_PAGE */
static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
struct dev_pagemap *pgmap)
{
struct page *page = pfn_to_page(pfn);
- unsigned long size = 0;
- struct to_kill *tk;
- LIST_HEAD(tokill);
- int rc = -EBUSY;
- loff_t start;
- dax_entry_t cookie;
+ int rc = -ENXIO;
if (flags & MF_COUNT_INCREASED)
/*
@@ -1658,73 +1819,24 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
put_page(page);
/* device metadata space is not recoverable */
- if (!pgmap_pfn_valid(pgmap, pfn)) {
- rc = -ENXIO;
+ if (!pgmap_pfn_valid(pgmap, pfn))
goto out;
- }
/*
- * Pages instantiated by device-dax (not filesystem-dax)
- * may be compound pages.
+ * Call driver's implementation to handle the memory failure, otherwise
+ * fall back to generic handler.
*/
- page = compound_head(page);
-
- /*
- * Prevent the inode from being freed while we are interrogating
- * the address_space, typically this would be handled by
- * lock_page(), but dax pages do not use the page lock. This
- * also prevents changes to the mapping of this pfn until
- * poison signaling is complete.
- */
- cookie = dax_lock_page(page);
- if (!cookie)
- goto out;
-
- if (hwpoison_filter(page)) {
- rc = -EOPNOTSUPP;
- goto unlock;
- }
-
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ if (pgmap->ops->memory_failure) {
+ rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
/*
- * TODO: Handle HMM pages which may need coordination
- * with device-side memory.
+ * Fall back to generic handler too if operation is not
+ * supported inside the driver/device/filesystem.
*/
- goto unlock;
+ if (rc != -EOPNOTSUPP)
+ goto out;
}
- /*
- * Use this flag as an indication that the dax page has been
- * remapped UC to prevent speculative consumption of poison.
- */
- SetPageHWPoison(page);
-
- /*
- * Unlike System-RAM there is no possibility to swap in a
- * different physical page at a given virtual address, so all
- * userspace consumption of ZONE_DEVICE memory necessitates
- * SIGBUS (i.e. MF_MUST_KILL)
- */
- flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
- collect_procs(page, &tokill, true);
-
- list_for_each_entry(tk, &tokill, nd)
- if (tk->size_shift)
- size = max(size, 1UL << tk->size_shift);
- if (size) {
- /*
- * Unmap the largest mapping to avoid breaking up
- * device-dax mappings which are constant size. The
- * actual size of the mapping being torn down is
- * communicated in siginfo, see kill_proc()
- */
- start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, size, 0);
- }
- kill_procs(&tokill, true, false, pfn, flags);
- rc = 0;
-unlock:
- dax_unlock_page(page, cookie);
+ rc = mf_generic_kill_procs(pfn, flags, pgmap);
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
@@ -1787,8 +1899,7 @@ int memory_failure(unsigned long pfn, int flags)
goto unlock_mutex;
}
}
- pr_err("Memory failure: %#lx: memory outside kernel control\n",
- pfn);
+ pr_err("%#lx: memory outside kernel control\n", pfn);
res = -ENXIO;
goto unlock_mutex;
}
@@ -1799,8 +1910,7 @@ try_again:
goto unlock_mutex;
if (TestSetPageHWPoison(p)) {
- pr_err("Memory failure: %#lx: already hardware poisoned\n",
- pfn);
+ pr_err("%#lx: already hardware poisoned\n", pfn);
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, pfn, flags);
@@ -2016,7 +2126,7 @@ void memory_failure_queue(unsigned long pfn, int flags)
if (kfifo_put(&mf_cpu->fifo, entry))
schedule_work_on(smp_processor_id(), &mf_cpu->work);
else
- pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+ pr_err("buffer overflow when queuing memory failure at %#lx\n",
pfn);
spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
put_cpu_var(memory_failure_cpu);
@@ -2073,6 +2183,8 @@ static int __init memory_failure_init(void)
}
core_initcall(memory_failure_init);
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
#define unpoison_pr_info(fmt, pfn, rs) \
({ \
if (__ratelimit(rs)) \
@@ -2178,7 +2290,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
bool lru = PageLRU(page);
if (PageHuge(page)) {
- isolated = isolate_huge_page(page, pagelist);
+ isolated = !isolate_hugetlb(page, pagelist);
} else {
if (lru)
isolated = !isolate_lru_page(page);