diff options
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r-- | mm/memory-failure.c | 651 |
1 files changed, 342 insertions, 309 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9349948f1abf..b91a33fb6c69 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -68,6 +68,8 @@ static int sysctl_memory_failure_early_kill __read_mostly; static int sysctl_memory_failure_recovery __read_mostly = 1; +static int sysctl_enable_soft_offline __read_mostly = 1; + atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; @@ -98,7 +100,7 @@ static ssize_t _name##_show(struct device *dev, \ { \ struct memory_failure_stats *mf_stats = \ &NODE_DATA(dev->id)->mf_stats; \ - return sprintf(buf, "%lu\n", mf_stats->_name); \ + return sysfs_emit(buf, "%lu\n", mf_stats->_name); \ } \ static DEVICE_ATTR_RO(_name) @@ -122,7 +124,7 @@ const struct attribute_group memory_failure_attr_group = { .attrs = memory_failure_attr, }; -static struct ctl_table memory_failure_table[] = { +static const struct ctl_table memory_failure_table[] = { { .procname = "memory_failure_early_kill", .data = &sysctl_memory_failure_early_kill, @@ -141,7 +143,15 @@ static struct ctl_table memory_failure_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } + { + .procname = "enable_soft_offline", + .data = &sysctl_enable_soft_offline, + .maxlen = sizeof(sysctl_enable_soft_offline), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + } }; /* @@ -154,11 +164,23 @@ static int __page_handle_poison(struct page *page) { int ret; - zone_pcp_disable(page_zone(page)); - ret = dissolve_free_huge_page(page); - if (!ret) + /* + * zone_pcp_disable() can't be used here. It will + * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold + * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap + * optimization is enabled. This will break current lock dependency + * chain and leads to deadlock. + * Disabling pcp before dissolving the page was a deterministic + * approach because we made sure that those pages cannot end up in any + * PCP list. Draining PCP lists expels those pages to the buddy system, + * but nothing guarantees that those pages do not get back to a PCP + * queue if we need to refill those. + */ + ret = dissolve_free_hugetlb_folio(page_folio(page)); + if (!ret) { + drain_all_pages(page_zone(page)); ret = take_page_off_buddy(page); - zone_pcp_enable(page_zone(page)); + } return ret; } @@ -167,8 +189,8 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo { if (hugepage_or_freepage) { /* - * Doing this check for free pages is also fine since dissolve_free_huge_page - * returns 0 for non-hugetlb pages as well. + * Doing this check for free pages is also fine since + * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well. */ if (__page_handle_poison(page) <= 0) /* @@ -205,6 +227,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); static int hwpoison_filter_dev(struct page *p) { + struct folio *folio = page_folio(p); struct address_space *mapping; dev_t dev; @@ -212,7 +235,7 @@ static int hwpoison_filter_dev(struct page *p) hwpoison_filter_dev_minor == ~0U) return 0; - mapping = page_mapping(p); + mapping = folio_mapping(folio); if (mapping == NULL || mapping->host == NULL) return -EINVAL; @@ -282,6 +305,7 @@ int hwpoison_filter(struct page *p) return 0; } +EXPORT_SYMBOL_GPL(hwpoison_filter); #else int hwpoison_filter(struct page *p) { @@ -289,8 +313,6 @@ int hwpoison_filter(struct page *p) } #endif -EXPORT_SYMBOL_GPL(hwpoison_filter); - /* * Kill all processes that have a poisoned page mapped and then isolate * the page. @@ -332,7 +354,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) int ret = 0; pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", - pfn, t->comm, t->pid); + pfn, t->comm, task_pid_nr(t)); if ((flags & MF_ACTION_REQUIRED) && (t == current)) ret = force_sig_mceerr(BUS_MCEERR_AR, @@ -343,14 +365,12 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) * PF_MCE_EARLY set. * Don't use force here, it's convenient if the signal * can be temporarily blocked. - * This could cause a loop when the user sets SIGBUS - * to SIG_IGN, but hopefully no one will do that? */ ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, addr_lsb, t); if (ret < 0) pr_info("Error sending signal to %s:%d: %d\n", - t->comm, t->pid, ret); + t->comm, task_pid_nr(t), ret); return ret; } @@ -358,20 +378,25 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) * Unknown page type encountered. Try to check whether it can turn PageLRU by * lru_add_drain_all. */ -void shake_page(struct page *p) +void shake_folio(struct folio *folio) { - if (PageHuge(p)) + if (folio_test_hugetlb(folio)) return; /* * TODO: Could shrink slab caches here if a lightweight range-based * shrinker will be available. */ - if (PageSlab(p)) + if (folio_test_slab(folio)) return; lru_add_drain_all(); } -EXPORT_SYMBOL_GPL(shake_page); +EXPORT_SYMBOL_GPL(shake_folio); + +static void shake_page(struct page *page) +{ + shake_folio(page_folio(page)); +} static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, unsigned long address) @@ -394,18 +419,18 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; - if (pud_devmap(*pud)) + if (pud_trans_huge(*pud)) return PUD_SHIFT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; - if (pmd_devmap(*pmd)) + if (pmd_trans_huge(*pmd)) return PMD_SHIFT; pte = pte_offset_map(pmd, address); if (!pte) return 0; ptent = ptep_get(pte); - if (pte_present(ptent) && pte_devmap(ptent)) + if (pte_present(ptent)) ret = PAGE_SHIFT; pte_unmap(pte); return ret; @@ -416,21 +441,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * not much we can do. We just print a message and ignore otherwise. */ -#define FSDAX_INVALID_PGOFF ULONG_MAX - /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. - * - * Note: @fsdax_pgoff is used only when @p is a fsdax page and a - * filesystem with a memory failure handler has claimed the - * memory_failure event. In all other cases, page->index and - * page->mapping are sufficient for mapping the page back to its - * corresponding user virtual address. */ -static void __add_to_kill(struct task_struct *tsk, struct page *p, +static void __add_to_kill(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr, pgoff_t fsdax_pgoff) + unsigned long addr) { struct to_kill *tk; @@ -440,13 +457,11 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, return; } - tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma); - if (is_zone_device_page(p)) { - if (fsdax_pgoff != FSDAX_INVALID_PGOFF) - tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma); + tk->addr = addr; + if (is_zone_device_page(p)) tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); - } else - tk->size_shift = page_shift(compound_head(p)); + else + tk->size_shift = folio_shift(page_folio(p)); /* * Send SIGKILL if "tk->addr == -EFAULT". Also, as @@ -471,11 +486,13 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, list_add_tail(&tk->nd, to_kill); } -static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, - struct list_head *to_kill) +static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p, + struct vm_area_struct *vma, struct list_head *to_kill, + unsigned long addr) { - __add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF); + if (addr == -EFAULT) + return; + __add_to_kill(tsk, p, vma, to_kill, addr); } #ifdef CONFIG_KSM @@ -491,12 +508,13 @@ static bool task_in_to_kill_list(struct list_head *to_kill, return false; } -void add_to_kill_ksm(struct task_struct *tsk, struct page *p, + +void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr) + unsigned long addr) { if (!task_in_to_kill_list(to_kill, tsk)) - __add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF); + __add_to_kill(tsk, p, vma, to_kill, addr); } #endif /* @@ -504,24 +522,17 @@ void add_to_kill_ksm(struct task_struct *tsk, struct page *p, * * Only do anything when FORCEKILL is set, otherwise just free the * list (this is used for clean pages which do not need killing) - * Also when FAIL is set do a force kill because something went - * wrong earlier. */ -static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, +static void kill_procs(struct list_head *to_kill, int forcekill, unsigned long pfn, int flags) { struct to_kill *tk, *next; list_for_each_entry_safe(tk, next, to_kill, nd) { if (forcekill) { - /* - * In case something went wrong with munmapping - * make sure the process doesn't catch the - * signal and then access the memory. Just kill it. - */ - if (fail || tk->addr == -EFAULT) { + if (tk->addr == -EFAULT) { pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", - pfn, tk->tsk->comm, tk->tsk->pid); + pfn, tk->tsk->comm, task_pid_nr(tk->tsk)); do_send_sig_info(SIGKILL, SEND_SIG_PRIV, tk->tsk, PIDTYPE_PID); } @@ -534,7 +545,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, */ else if (kill_proc(tk, pfn, flags) < 0) pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n", - pfn, tk->tsk->comm, tk->tsk->pid); + pfn, tk->tsk->comm, task_pid_nr(tk->tsk)); } list_del(&tk->nd); put_task_struct(tk->tsk); @@ -595,10 +606,10 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) /* * Collect processes when the error hit an anonymous page. */ -static void collect_procs_anon(struct folio *folio, struct page *page, - struct list_head *to_kill, int force_early) +static void collect_procs_anon(const struct folio *folio, + const struct page *page, struct list_head *to_kill, + int force_early) { - struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; pgoff_t pgoff; @@ -607,11 +618,13 @@ static void collect_procs_anon(struct folio *folio, struct page *page, if (av == NULL) /* Not actually mapped anymore */ return; - pgoff = page_to_pgoff(page); + pgoff = page_pgoff(folio, page); rcu_read_lock(); for_each_process(tsk) { + struct vm_area_struct *vma; struct anon_vma_chain *vmac; struct task_struct *t = task_early_kill(tsk, force_early); + unsigned long addr; if (!t) continue; @@ -620,9 +633,8 @@ static void collect_procs_anon(struct folio *folio, struct page *page, vma = vmac->vma; if (vma->vm_mm != t->mm) continue; - if (!page_mapped_in_vma(page, vma)) - continue; - add_to_kill_anon_file(t, page, vma, to_kill); + addr = page_mapped_in_vma(page, vma); + add_to_kill_anon_file(t, page, vma, to_kill, addr); } } rcu_read_unlock(); @@ -632,8 +644,9 @@ static void collect_procs_anon(struct folio *folio, struct page *page, /* * Collect processes when the error hit a file mapped page. */ -static void collect_procs_file(struct folio *folio, struct page *page, - struct list_head *to_kill, int force_early) +static void collect_procs_file(const struct folio *folio, + const struct page *page, struct list_head *to_kill, + int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -642,9 +655,10 @@ static void collect_procs_file(struct folio *folio, struct page *page, i_mmap_lock_read(mapping); rcu_read_lock(); - pgoff = page_to_pgoff(page); + pgoff = page_pgoff(folio, page); for_each_process(tsk) { struct task_struct *t = task_early_kill(tsk, force_early); + unsigned long addr; if (!t) continue; @@ -657,8 +671,10 @@ static void collect_procs_file(struct folio *folio, struct page *page, * Assume applications who requested early kill want * to be informed of all such data corruptions. */ - if (vma->vm_mm == t->mm) - add_to_kill_anon_file(t, page, vma, to_kill); + if (vma->vm_mm != t->mm) + continue; + addr = page_address_in_vma(folio, page, vma); + add_to_kill_anon_file(t, page, vma, to_kill, addr); } } rcu_read_unlock(); @@ -666,17 +682,18 @@ static void collect_procs_file(struct folio *folio, struct page *page, } #ifdef CONFIG_FS_DAX -static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, +static void add_to_kill_fsdax(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, pgoff_t pgoff) { - __add_to_kill(tsk, p, vma, to_kill, 0, pgoff); + unsigned long addr = vma_address(vma, pgoff, 1); + __add_to_kill(tsk, p, vma, to_kill, addr); } /* * Collect processes when the error hit a fsdax page. */ -static void collect_procs_fsdax(struct page *page, +static void collect_procs_fsdax(const struct page *page, struct address_space *mapping, pgoff_t pgoff, struct list_head *to_kill, bool pre_remove) { @@ -710,14 +727,14 @@ static void collect_procs_fsdax(struct page *page, /* * Collect the processes who have the corrupted page mapped to kill. */ -static void collect_procs(struct folio *folio, struct page *page, +static void collect_procs(const struct folio *folio, const struct page *page, struct list_head *tokill, int force_early) { if (!folio->mapping) return; - if (unlikely(PageKsm(page))) - collect_procs_ksm(page, tokill, force_early); - else if (PageAnon(page)) + if (unlikely(folio_test_ksm(folio))) + collect_procs_ksm(folio, page, tokill, force_early); + else if (folio_test_anon(folio)) collect_procs_anon(folio, page, tokill, force_early); else collect_procs_file(folio, page, tokill, force_early); @@ -820,7 +837,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, struct mm_walk *walk) { struct hwpoison_walk *hwp = walk->private; - pte_t pte = huge_ptep_get(ptep); + pte_t pte = huge_ptep_get(walk->mm, addr, ptep); struct hstate *h = hstate_vma(walk->vma); return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), @@ -864,14 +881,41 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, mmap_read_lock(p->mm); ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops, (void *)&priv); + /* + * ret = 1 when CMCI wins, regardless of whether try_to_unmap() + * succeeds or fails, then kill the process with SIGBUS. + * ret = 0 when poison page is a clean page and it's dropped, no + * SIGBUS is needed. + */ if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); - else - ret = 0; mmap_read_unlock(p->mm); - return ret > 0 ? -EHWPOISON : -EFAULT; + + return ret > 0 ? -EHWPOISON : 0; } +/* + * MF_IGNORED - The m-f() handler marks the page as PG_hwpoisoned'ed. + * But it could not do more to isolate the page from being accessed again, + * nor does it kill the process. This is extremely rare and one of the + * potential causes is that the page state has been changed due to + * underlying race condition. This is the most severe outcomes. + * + * MF_FAILED - The m-f() handler marks the page as PG_hwpoisoned'ed. + * It should have killed the process, but it can't isolate the page, + * due to conditions such as extra pin, unmap failure, etc. Accessing + * the page again may trigger another MCE and the process will be killed + * by the m-f() handler immediately. + * + * MF_DELAYED - The m-f() handler marks the page as PG_hwpoisoned'ed. + * The page is unmapped, and is removed from the LRU or file mapping. + * An attempt to access the page again will trigger page fault and the + * PF handler will kill the process. + * + * MF_RECOVERED - The m-f() handler marks the page as PG_hwpoisoned'ed. + * The page has been completely isolated, that is, unmapped, taken out of + * the buddy system, or hole-punnched out of the file mapping. + */ static const char *action_name[] = { [MF_IGNORED] = "Ignored", [MF_FAILED] = "Failed", @@ -882,10 +926,9 @@ static const char *action_name[] = { static const char * const action_page_types[] = { [MF_MSG_KERNEL] = "reserved kernel page", [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", - [MF_MSG_SLAB] = "kernel slab page", - [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", + [MF_MSG_GET_HWPOISON] = "get hwpoison page", [MF_MSG_UNMAP_FAILED] = "unmapping failed page", [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", @@ -899,6 +942,7 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", + [MF_MSG_ALREADY_POISONED] = "already poisoned", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1006,12 +1050,13 @@ static int me_kernel(struct page_state *ps, struct page *p) /* * Page in unknown state. Do nothing. + * This is a catch-all in case we fail to make sense of the page state. */ static int me_unknown(struct page_state *ps, struct page *p) { pr_err("%#lx: Unknown page state\n", page_to_pfn(p)); unlock_page(p); - return MF_FAILED; + return MF_IGNORED; } /* @@ -1077,9 +1122,9 @@ out: */ static int me_pagecache_dirty(struct page_state *ps, struct page *p) { - struct address_space *mapping = page_mapping(p); + struct folio *folio = page_folio(p); + struct address_space *mapping = folio_mapping(folio); - SetPageError(p); /* TBD: print more information about the file. */ if (mapping) { /* @@ -1087,34 +1132,6 @@ static int me_pagecache_dirty(struct page_state *ps, struct page *p) * who check the mapping. * This way the application knows that something went * wrong with its dirty file data. - * - * There's one open issue: - * - * The EIO will be only reported on the next IO - * operation and then cleared through the IO map. - * Normally Linux has two mechanisms to pass IO error - * first through the AS_EIO flag in the address space - * and then through the PageError flag in the page. - * Since we drop pages on memory failure handling the - * only mechanism open to use is through AS_AIO. - * - * This has the disadvantage that it gets cleared on - * the first operation that returns an error, while - * the PageError bit is more sticky and only cleared - * when the page is reread or dropped. If an - * application assumes it will always get error on - * fsync, but does other operations on the fd before - * and the page is dropped between then the error - * will not be properly reported. - * - * This can already happen even without hwpoisoned - * pages: first on metadata IO errors (which only - * report through AS_EIO) or when the page is dropped - * at the wrong time. - * - * So right now we assume that the application DTRT on - * the first EIO, but we're not worse than other parts - * of the kernel. */ mapping_set_error(mapping, -EIO); } @@ -1126,7 +1143,7 @@ static int me_pagecache_dirty(struct page_state *ps, struct page *p) * Clean and dirty swap cache. * * Dirty swap cache page is tricky to handle. The page could live both in page - * cache and swap cache(ie. page is freshly swapped in). So it could be + * table and swap cache(ie. page is freshly swapped in). So it could be * referenced concurrently by 2 types of PTEs: * normal PTEs and swap PTEs. We try to handle them consistently by calling * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs, @@ -1206,7 +1223,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) * subpages. */ folio_put(folio); - if (__page_handle_poison(p) >= 0) { + if (__page_handle_poison(p) > 0) { page_ref_inc(p); res = MF_RECOVERED; } else { @@ -1239,7 +1256,6 @@ static int me_huge_page(struct page_state *ps, struct page *p) #define mlock (1UL << PG_mlocked) #define lru (1UL << PG_lru) #define head (1UL << PG_head) -#define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) static struct page_state error_states[] = { @@ -1249,13 +1265,6 @@ static struct page_state error_states[] = { * PG_buddy pages only make a small fraction of all free pages. */ - /* - * Could in theory check if slab page is free or if we can drop - * currently unused objects without touching them. But just - * treat it as standard kernel for now. - */ - { slab, slab, MF_MSG_SLAB, me_kernel }, - { head, head, MF_MSG_HUGE, me_huge_page }, { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, @@ -1282,7 +1291,6 @@ static struct page_state error_states[] = { #undef mlock #undef lru #undef head -#undef slab #undef reserved static void update_per_node_mf_stats(unsigned long pfn, @@ -1423,6 +1431,8 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags) return 0; } +#define GET_PAGE_MAX_RETRY_NUM 3 + static int get_any_page(struct page *p, unsigned long flags) { int ret = 0, pass = 0; @@ -1437,12 +1447,12 @@ try_again: if (!ret) { if (page_count(p)) { /* We raced with an allocation, retry. */ - if (pass++ < 3) + if (pass++ < GET_PAGE_MAX_RETRY_NUM) goto try_again; ret = -EBUSY; } else if (!PageHuge(p) && !is_free_buddy_page(p)) { /* We raced with put_page, retry. */ - if (pass++ < 3) + if (pass++ < GET_PAGE_MAX_RETRY_NUM) goto try_again; ret = -EIO; } @@ -1468,7 +1478,7 @@ try_again: * A page we cannot handle. Check whether we can turn * it into something we can handle. */ - if (pass++ < 3) { + if (pass++ < GET_PAGE_MAX_RETRY_NUM) { put_page(p); shake_page(p); count_increased = false; @@ -1530,7 +1540,7 @@ static int __get_unpoison_page(struct page *page) * the given page has PG_hwpoison. So it's never reused for other page * allocations, and __get_unpoison_page() never races with them. * - * Return: 0 on failure, + * Return: 0 on failure or free buddy (hugetlb) page, * 1 on success for in-use pages in a well-defined state, * -EIO for pages on which we can not handle memory errors, * -EBUSY when get_hwpoison_page() has raced with page lifecycle @@ -1551,38 +1561,12 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) return ret; } -/* - * Do all that is necessary to remove user space mappings. Unmap - * the pages and send SIGBUS to the processes if the data was dirty. - */ -static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, - int flags, struct page *hpage) +int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { - struct folio *folio = page_folio(hpage); enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; struct address_space *mapping; - LIST_HEAD(tokill); - bool unmap_success; - int forcekill; - bool mlocked = PageMlocked(hpage); - - /* - * Here we are interested only in user-mapped pages, so skip any - * other types of pages. - */ - if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p)) - return true; - if (!(PageLRU(hpage) || PageHuge(p))) - return true; - /* - * This check implies we don't kill processes if their pages - * are in the swap cache early. Those are always late kills. - */ - if (!page_mapped(p)) - return true; - - if (PageSwapCache(p)) { + if (folio_test_swapcache(folio)) { pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); ttu &= ~TTU_HWPOISON; } @@ -1593,11 +1577,11 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * XXX: the dirty test could be racy: set_page_dirty() may not always * be called inside page lock (it's recommended but not enforced). */ - mapping = page_mapping(hpage); - if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && + mapping = folio_mapping(folio); + if (!must_kill && !folio_test_dirty(folio) && mapping && mapping_can_writeback(mapping)) { - if (page_mkclean(hpage)) { - SetPageDirty(hpage); + if (folio_mkclean(folio)) { + folio_set_dirty(folio); } else { ttu &= ~TTU_HWPOISON; pr_info("%#lx: corrupted page was clean: dropped without side effects\n", @@ -1605,42 +1589,77 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, } } - /* - * First collect all the processes that have the page - * mapped in dirty form. This has to be done before try_to_unmap, - * because ttu takes the rmap data structures down. - */ - collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - - if (PageHuge(hpage) && !PageAnon(hpage)) { + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* - * For hugetlb pages in shared mappings, try_to_unmap + * For hugetlb folios in shared mappings, try_to_unmap * could potentially call huge_pmd_unshare. Because of * this, take semaphore in write mode here and set * TTU_RMAP_LOCKED to indicate we have taken the lock * at this higher level. */ - mapping = hugetlb_page_mapping_lock_write(hpage); - if (mapping) { - try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); - } else - pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn); + mapping = hugetlb_folio_mapping_lock_write(folio); + if (!mapping) { + pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n", + folio_pfn(folio)); + return -EBUSY; + } + + try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); } else { try_to_unmap(folio, ttu); } - unmap_success = !page_mapped(p); + return folio_mapped(folio) ? -EBUSY : 0; +} + +/* + * Do all that is necessary to remove user space mappings. Unmap + * the pages and send SIGBUS to the processes if the data was dirty. + */ +static bool hwpoison_user_mappings(struct folio *folio, struct page *p, + unsigned long pfn, int flags) +{ + LIST_HEAD(tokill); + bool unmap_success; + int forcekill; + bool mlocked = folio_test_mlocked(folio); + + /* + * Here we are interested only in user-mapped pages, so skip any + * other types of pages. + */ + if (folio_test_reserved(folio) || folio_test_slab(folio) || + folio_test_pgtable(folio) || folio_test_offline(folio)) + return true; + if (!(folio_test_lru(folio) || folio_test_hugetlb(folio))) + return true; + + /* + * This check implies we don't kill processes if their pages + * are in the swap cache early. Those are always late kills. + */ + if (!folio_mapped(folio)) + return true; + + /* + * First collect all the processes that have the page + * mapped in dirty form. This has to be done before try_to_unmap, + * because ttu takes the rmap data structures down. + */ + collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); + + unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL); if (!unmap_success) - pr_err("%#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(p)); + pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n", + pfn, folio_mapcount(folio)); /* * try_to_unmap() might put mlocked page in lru cache, so call * shake_page() again to ensure that it's flushed. */ if (mlocked) - shake_page(hpage); + shake_folio(folio); /* * Now that the dirty bit has been propagated to the @@ -1652,9 +1671,9 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) || + forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) || !unmap_success; - kill_procs(&tokill, forcekill, !unmap_success, pfn, flags); + kill_procs(&tokill, forcekill, pfn, flags); return unmap_success; } @@ -1682,7 +1701,12 @@ static int identify_page_state(unsigned long pfn, struct page *p, return page_action(ps, p, pfn); } -static int try_to_split_thp_page(struct page *page) +/* + * When 'release' is 'false', it means that if thp split has failed, + * there is still more to do, hence the page refcount we took earlier + * is still needed. + */ +static int try_to_split_thp_page(struct page *page, bool release) { int ret; @@ -1690,7 +1714,7 @@ static int try_to_split_thp_page(struct page *page) ret = split_huge_page(page); unlock_page(page); - if (unlikely(ret)) + if (ret && release) put_page(page); return ret; @@ -1718,7 +1742,7 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, unmap_mapping_range(mapping, start, size, 0); } - kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags); + kill_procs(to_kill, flags & MF_MUST_KILL, pfn, flags); } /* @@ -1906,7 +1930,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) { struct llist_head *head; struct raw_hwp_page *raw_hwp; - struct raw_hwp_page *p, *next; + struct raw_hwp_page *p; int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0; /* @@ -1917,7 +1941,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return -EHWPOISON; head = raw_hwp_list_head(folio); - llist_for_each_entry_safe(p, next, head->first, node) { + llist_for_each_entry(p, head->first, node) { if (p->page == page) return -EHWPOISON; } @@ -2056,6 +2080,7 @@ retry: if (flags & MF_ACTION_REQUIRED) { folio = page_folio(p); res = kill_accessing_process(current, folio_pfn(folio), flags); + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); } return res; } else if (res == -EBUSY) { @@ -2063,7 +2088,7 @@ retry: flags |= MF_NO_RETRY; goto retry; } - return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); + return action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED); } folio = page_folio(p); @@ -2085,7 +2110,7 @@ retry: */ if (res == 0) { folio_unlock(folio); - if (__page_handle_poison(p) >= 0) { + if (__page_handle_poison(p) > 0) { page_ref_inc(p); res = MF_RECOVERED; } else { @@ -2096,9 +2121,9 @@ retry: page_flags = folio->flags; - if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) { + if (!hwpoison_user_mappings(folio, p, pfn, flags)) { folio_unlock(folio); - return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); + return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_FAILED); } return identify_page_state(pfn, p, page_flags); @@ -2119,14 +2144,10 @@ static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag) /* Drop the extra refcount in case we come from madvise() */ static void put_ref_page(unsigned long pfn, int flags) { - struct page *page; - if (!(flags & MF_COUNT_INCREASED)) return; - page = pfn_to_page(pfn); - if (page) - put_page(page); + put_page(pfn_to_page(pfn)); } static int memory_failure_dev_pagemap(unsigned long pfn, int flags, @@ -2161,6 +2182,22 @@ out: return rc; } +/* + * The calling condition is as such: thp split failed, page might have + * been RDMA pinned, not much can be done for recovery. + * But a SIGBUS should be delivered with vaddr provided so that the user + * application has a chance to recover. Also, application processes' + * election for MCE early killed will be honored. + */ +static void kill_procs_now(struct page *p, unsigned long pfn, int flags, + struct folio *folio) +{ + LIST_HEAD(tokill); + + collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); + kill_procs(&tokill, true, pfn, flags); +} + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -2178,14 +2215,18 @@ out: * Must run in process context (e.g. a work queue) with interrupts * enabled and no spinlocks held. * - * Return: 0 for successfully handled the memory error, - * -EOPNOTSUPP for hwpoison_filter() filtered the error event, - * < 0(except -EOPNOTSUPP) on failure. + * Return: + * 0 - success, + * -ENXIO - memory not managed by the kernel + * -EOPNOTSUPP - hwpoison_filter() filtered the error event, + * -EHWPOISON - the page was already poisoned, potentially + * kill process, + * other negative values - failure. */ int memory_failure(unsigned long pfn, int flags) { struct page *p; - struct page *hpage; + struct folio *folio; struct dev_pagemap *pgmap; int res = 0; unsigned long page_flags; @@ -2232,6 +2273,7 @@ try_again: res = kill_accessing_process(current, pfn, flags); if (flags & MF_COUNT_INCREASED) put_page(p); + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); goto unlock_mutex; } @@ -2268,13 +2310,25 @@ try_again: } goto unlock_mutex; } else if (res < 0) { - res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); + res = action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED); goto unlock_mutex; } } - hpage = compound_head(p); - if (PageTransHuge(hpage)) { + folio = page_folio(p); + + /* filter pages that are protected from hwpoison test by users */ + folio_lock(folio); + if (hwpoison_filter(p)) { + ClearPageHWPoison(p); + folio_unlock(folio); + folio_put(folio); + res = -EOPNOTSUPP; + goto unlock_mutex; + } + folio_unlock(folio); + + if (folio_test_large(folio)) { /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. @@ -2288,12 +2342,16 @@ try_again: * or unhandlable page. The refcount is bumped iff the * page is a valid handlable page. */ - SetPageHasHWPoisoned(hpage); - if (try_to_split_thp_page(p) < 0) { - res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); + folio_set_has_hwpoisoned(folio); + if (try_to_split_thp_page(p, false) < 0) { + res = -EHWPOISON; + kill_procs_now(p, pfn, flags, folio); + put_page(p); + action_result(pfn, MF_MSG_UNSPLIT_THP, MF_FAILED); goto unlock_mutex; } VM_BUG_ON_PAGE(!page_count(p), p); + folio = page_folio(p); } /* @@ -2304,28 +2362,16 @@ try_again: * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - shake_page(p); + shake_folio(folio); - lock_page(p); + folio_lock(folio); /* * We're only intended to deal with the non-Compound page here. - * However, the page could have changed compound pages due to - * race window. If this happens, we could try again to hopefully - * handle the page next round. + * The page cannot become compound pages again as folio has been + * splited and extra refcnt is held. */ - if (PageCompound(p)) { - if (retry) { - ClearPageHWPoison(p); - unlock_page(p); - put_page(p); - flags &= ~MF_COUNT_INCREASED; - retry = false; - goto try_again; - } - res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); - goto unlock_page; - } + WARN_ON(folio_test_large(folio)); /* * We use page flags to determine what action should be taken, but @@ -2334,43 +2380,36 @@ try_again: * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page * status correctly, we save a copy of the page flags at this time. */ - page_flags = p->flags; - - if (hwpoison_filter(p)) { - ClearPageHWPoison(p); - unlock_page(p); - put_page(p); - res = -EOPNOTSUPP; - goto unlock_mutex; - } + page_flags = folio->flags; /* - * __munlock_folio() may clear a writeback page's LRU flag without - * page_lock. We need wait writeback completion for this page or it - * may trigger vfs BUG while evict inode. + * __munlock_folio() may clear a writeback folio's LRU flag without + * the folio lock. We need to wait for writeback completion for this + * folio or it may trigger a vfs BUG while evicting inode. */ - if (!PageLRU(p) && !PageWriteback(p)) + if (!folio_test_lru(folio) && !folio_test_writeback(folio)) goto identify_page_state; /* * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - wait_on_page_writeback(p); + folio_wait_writeback(folio); /* * Now take care of user space mappings. * Abort on fail: __filemap_remove_folio() assumes unmapped page. */ - if (!hwpoison_user_mappings(p, pfn, flags, p)) { - res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); + if (!hwpoison_user_mappings(folio, p, pfn, flags)) { + res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_FAILED); goto unlock_page; } /* * Torn down by someone else? */ - if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + if (folio_test_lru(folio) && !folio_test_swapcache(folio) && + folio->mapping == NULL) { res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); goto unlock_page; } @@ -2380,7 +2419,7 @@ identify_page_state: mutex_unlock(&mf_mutex); return res; unlock_page: - unlock_page(p); + folio_unlock(folio); unlock_mutex: mutex_unlock(&mf_mutex); return res; @@ -2398,7 +2437,7 @@ struct memory_failure_entry { struct memory_failure_cpu { DECLARE_KFIFO(fifo, struct memory_failure_entry, MEMORY_FAILURE_FIFO_SIZE); - spinlock_t lock; + raw_spinlock_t lock; struct work_struct work; }; @@ -2424,20 +2463,22 @@ void memory_failure_queue(unsigned long pfn, int flags) { struct memory_failure_cpu *mf_cpu; unsigned long proc_flags; + bool buffer_overflow; struct memory_failure_entry entry = { .pfn = pfn, .flags = flags, }; mf_cpu = &get_cpu_var(memory_failure_cpu); - spin_lock_irqsave(&mf_cpu->lock, proc_flags); - if (kfifo_put(&mf_cpu->fifo, entry)) + raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags); + buffer_overflow = !kfifo_put(&mf_cpu->fifo, entry); + if (!buffer_overflow) schedule_work_on(smp_processor_id(), &mf_cpu->work); - else + raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + put_cpu_var(memory_failure_cpu); + if (buffer_overflow) pr_err("buffer overflow when queuing memory failure at %#lx\n", pfn); - spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); - put_cpu_var(memory_failure_cpu); } EXPORT_SYMBOL_GPL(memory_failure_queue); @@ -2450,9 +2491,9 @@ static void memory_failure_work_func(struct work_struct *work) mf_cpu = container_of(work, struct memory_failure_cpu, work); for (;;) { - spin_lock_irqsave(&mf_cpu->lock, proc_flags); + raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags); gotten = kfifo_get(&mf_cpu->fifo, &entry); - spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); if (!gotten) break; if (entry.flags & MF_SOFT_OFFLINE) @@ -2482,7 +2523,7 @@ static int __init memory_failure_init(void) for_each_possible_cpu(cpu) { mf_cpu = &per_cpu(memory_failure_cpu, cpu); - spin_lock_init(&mf_cpu->lock); + raw_spin_lock_init(&mf_cpu->lock); INIT_KFIFO(mf_cpu->fifo); INIT_WORK(&mf_cpu->work, memory_failure_work_func); } @@ -2494,7 +2535,7 @@ static int __init memory_failure_init(void) core_initcall(memory_failure_init); #undef pr_fmt -#define pr_fmt(fmt) "" fmt +#define pr_fmt(fmt) "Unpoison: " fmt #define unpoison_pr_info(fmt, pfn, rs) \ ({ \ if (__ratelimit(rs)) \ @@ -2518,7 +2559,7 @@ int unpoison_memory(unsigned long pfn) struct folio *folio; struct page *p; int ret = -EBUSY, ghp; - unsigned long count = 1; + unsigned long count; bool huge = false; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -2532,47 +2573,50 @@ int unpoison_memory(unsigned long pfn) mutex_lock(&mf_mutex); if (hw_memory_failure) { - unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n", + unpoison_pr_info("%#lx: disabled after HW memory failure\n", + pfn, &unpoison_rs); + ret = -EOPNOTSUPP; + goto unlock_mutex; + } + + if (is_huge_zero_folio(folio)) { + unpoison_pr_info("%#lx: huge zero page is not supported\n", pfn, &unpoison_rs); ret = -EOPNOTSUPP; goto unlock_mutex; } if (!PageHWPoison(p)) { - unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", + unpoison_pr_info("%#lx: page was already unpoisoned\n", pfn, &unpoison_rs); goto unlock_mutex; } if (folio_ref_count(folio) > 1) { - unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", + unpoison_pr_info("%#lx: someone grabs the hwpoison page\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (folio_test_slab(folio) || PageTable(&folio->page) || - folio_test_reserved(folio) || PageOffline(&folio->page)) + if (folio_test_slab(folio) || folio_test_pgtable(folio) || + folio_test_reserved(folio) || folio_test_offline(folio)) goto unlock_mutex; - /* - * Note that folio->_mapcount is overloaded in SLAB, so the simple test - * in folio_mapped() has to be done after folio_test_slab() is checked. - */ if (folio_mapped(folio)) { - unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", + unpoison_pr_info("%#lx: someone maps the hwpoison page\n", pfn, &unpoison_rs); goto unlock_mutex; } if (folio_mapping(folio)) { - unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", + unpoison_pr_info("%#lx: the hwpoison page has non-NULL mapping\n", pfn, &unpoison_rs); goto unlock_mutex; } ghp = get_hwpoison_page(p, MF_UNPOISON); if (!ghp) { - if (PageHuge(p)) { + if (folio_test_hugetlb(folio)) { huge = true; count = folio_free_raw_hwp(folio, false); if (count == 0) @@ -2584,11 +2628,11 @@ int unpoison_memory(unsigned long pfn) ret = put_page_back_buddy(p) ? 0 : -EBUSY; } else { ret = ghp; - unpoison_pr_info("Unpoison: failed to grab page %#lx\n", + unpoison_pr_info("%#lx: failed to grab page\n", pfn, &unpoison_rs); } } else { - if (PageHuge(p)) { + if (folio_test_hugetlb(folio)) { huge = true; count = folio_free_raw_hwp(folio, false); if (count == 0) { @@ -2609,46 +2653,15 @@ unlock_mutex: if (!ret) { if (!huge) num_poisoned_pages_sub(pfn, 1); - unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + unpoison_pr_info("%#lx: software-unpoisoned page\n", page_to_pfn(p), &unpoison_rs); } return ret; } EXPORT_SYMBOL(unpoison_memory); -static bool mf_isolate_folio(struct folio *folio, struct list_head *pagelist) -{ - bool isolated = false; - - if (folio_test_hugetlb(folio)) { - isolated = isolate_hugetlb(folio, pagelist); - } else { - bool lru = !__folio_test_movable(folio); - - if (lru) - isolated = folio_isolate_lru(folio); - else - isolated = isolate_movable_page(&folio->page, - ISOLATE_UNEVICTABLE); - - if (isolated) { - list_add(&folio->lru, pagelist); - if (lru) - node_stat_add_folio(folio, NR_ISOLATED_ANON + - folio_is_file_lru(folio)); - } - } - - /* - * If we succeed to isolate the folio, we grabbed another refcount on - * the folio, so we can safely drop the one we got from get_any_page(). - * If we failed to isolate the folio, it means that we cannot go further - * and we will return an error, so drop the reference we got from - * get_any_page() as well. - */ - folio_put(folio); - return isolated; -} +#undef pr_fmt +#define pr_fmt(fmt) "Soft offline: " fmt /* * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages. @@ -2662,15 +2675,17 @@ static int soft_offline_in_use_page(struct page *page) struct folio *folio = page_folio(page); char const *msg_page[] = {"page", "hugepage"}; bool huge = folio_test_hugetlb(folio); + bool isolated; LIST_HEAD(pagelist); struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .reason = MR_MEMORY_FAILURE, }; if (!huge && folio_test_large(folio)) { - if (try_to_split_thp_page(page)) { - pr_info("soft offline: %#lx: thp split failed\n", pfn); + if (try_to_split_thp_page(page, true)) { + pr_info("%#lx: thp split failed\n", pfn); return -EBUSY; } folio = page_folio(page); @@ -2682,7 +2697,7 @@ static int soft_offline_in_use_page(struct page *page) if (PageHWPoison(page)) { folio_unlock(folio); folio_put(folio); - pr_info("soft offline: %#lx page already poisoned\n", pfn); + pr_info("%#lx: page already poisoned\n", pfn); return 0; } @@ -2695,12 +2710,23 @@ static int soft_offline_in_use_page(struct page *page) folio_unlock(folio); if (ret) { - pr_info("soft_offline: %#lx: invalidated\n", pfn); + pr_info("%#lx: invalidated\n", pfn); page_handle_poison(page, false, true); return 0; } - if (mf_isolate_folio(folio, &pagelist)) { + isolated = isolate_folio_to_list(folio, &pagelist); + + /* + * If we succeed to isolate the folio, we grabbed another refcount on + * the folio, so we can safely drop the one we got from get_any_page(). + * If we failed to isolate the folio, it means that we cannot go further + * and we will return an error, so drop the reference we got from + * get_any_page() as well. + */ + folio_put(folio); + + if (isolated) { ret = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); if (!ret) { @@ -2712,13 +2738,13 @@ static int soft_offline_in_use_page(struct page *page) if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n", + pr_info("%#lx: %s migration failed %ld, type %pGp\n", pfn, msg_page[huge], ret, &page->flags); if (ret > 0) ret = -EBUSY; } } else { - pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n", + pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n", pfn, msg_page[huge], page_count(page), &page->flags); ret = -EBUSY; } @@ -2730,8 +2756,9 @@ static int soft_offline_in_use_page(struct page *page) * @pfn: pfn to soft-offline * @flags: flags. Same as memory_failure(). * - * Returns 0 on success - * -EOPNOTSUPP for hwpoison_filter() filtered the error event + * Returns 0 on success, + * -EOPNOTSUPP for hwpoison_filter() filtered the error event, or + * disabled by /proc/sys/vm/enable_soft_offline, * < 0 otherwise negated errno. * * Soft offline a page, by migration or invalidation, @@ -2767,10 +2794,16 @@ int soft_offline_page(unsigned long pfn, int flags) return -EIO; } + if (!sysctl_enable_soft_offline) { + pr_info_once("disabled by /proc/sys/vm/enable_soft_offline\n"); + put_ref_page(pfn, flags); + return -EOPNOTSUPP; + } + mutex_lock(&mf_mutex); if (PageHWPoison(page)) { - pr_info("%s: %#lx page already poisoned\n", __func__, pfn); + pr_info("%#lx: page already poisoned\n", pfn); put_ref_page(pfn, flags); mutex_unlock(&mf_mutex); return 0; |