Diffstat (limited to 'mm/memory-failure.c')
| -rw-r--r-- | mm/memory-failure.c | 465 |
1 file changed, 288 insertions, 177 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a7b8ccd29b6f..fbc5a01260c8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -38,6 +38,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/memory-failure.h>
 #include <linux/page-flags.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
@@ -50,7 +51,7 @@
 #include <linux/backing-dev.h>
 #include <linux/migrate.h>
 #include <linux/slab.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/hugetlb.h>
 #include <linux/memory_hotplug.h>
 #include <linux/mm_inline.h>
@@ -60,9 +61,12 @@
 #include <linux/pagewalk.h>
 #include <linux/shmem_fs.h>
 #include <linux/sysctl.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/memory-failure.h>
+
 #include "swap.h"
 #include "internal.h"
-#include "ras/ras_event.h"
 
 static int sysctl_memory_failure_early_kill __read_mostly;
@@ -124,7 +128,7 @@ const struct attribute_group memory_failure_attr_group = {
 	.attrs = memory_failure_attr,
 };
 
-static struct ctl_table memory_failure_table[] = {
+static const struct ctl_table memory_failure_table[] = {
 	{
 		.procname = "memory_failure_early_kill",
 		.data = &sysctl_memory_failure_early_kill,
@@ -154,6 +158,10 @@ static struct ctl_table memory_failure_table[] = {
 	}
 };
 
+static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
+
+static DEFINE_MUTEX(pfn_space_lock);
+
 /*
  * Return values:
  * 1: the page is dissolved (if needed) and taken off from buddy,
@@ -212,106 +220,34 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
 	return true;
 }
 
-#if IS_ENABLED(CONFIG_HWPOISON_INJECT)
+static hwpoison_filter_func_t __rcu *hwpoison_filter_func __read_mostly;
 
-u32 hwpoison_filter_enable = 0;
-u32 hwpoison_filter_dev_major = ~0U;
-u32 hwpoison_filter_dev_minor = ~0U;
-u64 hwpoison_filter_flags_mask;
-u64 hwpoison_filter_flags_value;
-EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
-EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
-EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
-EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
-EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
-
-static int hwpoison_filter_dev(struct page *p)
+void hwpoison_filter_register(hwpoison_filter_func_t *filter)
 {
-	struct folio *folio = page_folio(p);
-	struct address_space *mapping;
-	dev_t dev;
-
-	if (hwpoison_filter_dev_major == ~0U &&
-	    hwpoison_filter_dev_minor == ~0U)
-		return 0;
-
-	mapping = folio_mapping(folio);
-	if (mapping == NULL || mapping->host == NULL)
-		return -EINVAL;
-
-	dev = mapping->host->i_sb->s_dev;
-	if (hwpoison_filter_dev_major != ~0U &&
-	    hwpoison_filter_dev_major != MAJOR(dev))
-		return -EINVAL;
-	if (hwpoison_filter_dev_minor != ~0U &&
-	    hwpoison_filter_dev_minor != MINOR(dev))
-		return -EINVAL;
-
-	return 0;
+	rcu_assign_pointer(hwpoison_filter_func, filter);
 }
+EXPORT_SYMBOL_GPL(hwpoison_filter_register);
 
-static int hwpoison_filter_flags(struct page *p)
+void hwpoison_filter_unregister(void)
 {
-	if (!hwpoison_filter_flags_mask)
-		return 0;
-
-	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
-				    hwpoison_filter_flags_value)
-		return 0;
-	else
-		return -EINVAL;
+	RCU_INIT_POINTER(hwpoison_filter_func, NULL);
+	synchronize_rcu();
 }
+EXPORT_SYMBOL_GPL(hwpoison_filter_unregister);
 
-/*
- * This allows stress tests to limit test scope to a collection of tasks
- * by putting them under some memcg. This prevents killing unrelated/important
- * processes such as /sbin/init. Note that the target task may share clean
- * pages with init (eg. libc text), which is harmless. If the target task
- * share _dirty_ pages with another task B, the test scheme must make sure B
- * is also included in the memcg. At last, due to race conditions this filter
- * can only guarantee that the page either belongs to the memcg tasks, or is
- * a freed page.
- */
-#ifdef CONFIG_MEMCG
-u64 hwpoison_filter_memcg;
-EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
-static int hwpoison_filter_task(struct page *p)
+static int hwpoison_filter(struct page *p)
 {
-	if (!hwpoison_filter_memcg)
-		return 0;
-
-	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
-		return -EINVAL;
-
-	return 0;
-}
-#else
-static int hwpoison_filter_task(struct page *p) { return 0; }
-#endif
-
-int hwpoison_filter(struct page *p)
-{
-	if (!hwpoison_filter_enable)
-		return 0;
-
-	if (hwpoison_filter_dev(p))
-		return -EINVAL;
-
-	if (hwpoison_filter_flags(p))
-		return -EINVAL;
+	int ret = 0;
+	hwpoison_filter_func_t *filter;
 
-	if (hwpoison_filter_task(p))
-		return -EINVAL;
+	rcu_read_lock();
+	filter = rcu_dereference(hwpoison_filter_func);
+	if (filter)
+		ret = filter(p);
+	rcu_read_unlock();
 
-	return 0;
-}
-EXPORT_SYMBOL_GPL(hwpoison_filter);
-#else
-int hwpoison_filter(struct page *p)
-{
-	return 0;
+	return ret;
 }
-#endif
 
 /*
  * Kill all processes that have a poisoned page mapped and then isolate
@@ -419,18 +355,18 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
 	pud = pud_offset(p4d, address);
 	if (!pud_present(*pud))
 		return 0;
-	if (pud_devmap(*pud))
+	if (pud_trans_huge(*pud))
 		return PUD_SHIFT;
 	pmd = pmd_offset(pud, address);
 	if (!pmd_present(*pmd))
 		return 0;
-	if (pmd_devmap(*pmd))
+	if (pmd_trans_huge(*pmd))
 		return PMD_SHIFT;
 	pte = pte_offset_map(pmd, address);
 	if (!pte)
 		return 0;
 	ptent = ptep_get(pte);
-	if (pte_present(ptent) && pte_devmap(ptent))
+	if (pte_present(ptent))
 		ret = PAGE_SHIFT;
 	pte_unmap(pte);
 	return ret;
@@ -760,10 +696,10 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
 	if (pte_present(pte)) {
 		pfn = pte_pfn(pte);
 	} else {
-		swp_entry_t swp = pte_to_swp_entry(pte);
+		const softleaf_t entry = softleaf_from_pte(pte);
 
-		if (is_hwpoison_entry(swp))
-			pfn = swp_offset_pfn(swp);
+		if (softleaf_is_hwpoison(entry))
+			pfn = softleaf_to_pfn(entry);
 	}
 
 	if (!pfn || pfn != poisoned_pfn)
@@ -837,19 +773,33 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
 			    struct mm_walk *walk)
 {
 	struct hwpoison_walk *hwp = walk->private;
-	pte_t pte = huge_ptep_get(walk->mm, addr, ptep);
 	struct hstate *h = hstate_vma(walk->vma);
+	spinlock_t *ptl;
+	pte_t pte;
+	int ret;
 
-	return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
-				      hwp->pfn, &hwp->tk);
+	ptl = huge_pte_lock(h, walk->mm, ptep);
+	pte = huge_ptep_get(walk->mm, addr, ptep);
+	ret = check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+				     hwp->pfn, &hwp->tk);
+	spin_unlock(ptl);
+	return ret;
 }
 #else
 #define hwpoison_hugetlb_range	NULL
 #endif
 
+static int hwpoison_test_walk(unsigned long start, unsigned long end,
+			      struct mm_walk *walk)
+{
+	/* We also want to consider pages mapped into VM_PFNMAP. */
+	return 0;
+}
+
 static const struct mm_walk_ops hwpoison_walk_ops = {
 	.pmd_entry = hwpoison_pte_range,
 	.hugetlb_entry = hwpoison_hugetlb_range,
+	.test_walk = hwpoison_test_walk,
 	.walk_lock = PGWALK_RDLOCK,
 };
 
@@ -881,12 +831,17 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
 	mmap_read_lock(p->mm);
 	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
 			      (void *)&priv);
+	/*
+	 * ret = 1 when CMCI wins, regardless of whether try_to_unmap()
+	 * succeeds or fails, then kill the process with SIGBUS.
+	 * ret = 0 when poison page is a clean page and it's dropped, no
+	 * SIGBUS is needed.
+	 */
 	if (ret == 1 && priv.tk.addr)
 		kill_proc(&priv.tk, pfn, flags);
-	else
-		ret = 0;
 	mmap_read_unlock(p->mm);
-	return ret > 0 ? -EHWPOISON : -EFAULT;
+
+	return ret > 0 ? -EHWPOISON : 0;
 }
 
 /*
@@ -937,7 +892,8 @@ static const char * const action_page_types[] = {
 	[MF_MSG_BUDDY]			= "free buddy page",
 	[MF_MSG_DAX]			= "dax page",
 	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
-	[MF_MSG_ALREADY_POISONED]	= "already poisoned",
+	[MF_MSG_ALREADY_POISONED]	= "already poisoned page",
+	[MF_MSG_PFN_MAP]		= "non struct page pfn",
 	[MF_MSG_UNKNOWN]		= "unknown page",
 };
 
@@ -1180,7 +1136,7 @@ static int me_swapcache_clean(struct page_state *ps, struct page *p)
 	struct folio *folio = page_folio(p);
 	int ret;
 
-	delete_from_swap_cache(folio);
+	swap_cache_del_folio(folio);
 	ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_RECOVERED;
 	folio_unlock(folio);
 
@@ -1330,9 +1286,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
 {
 	trace_memory_failure_event(pfn, type, result);
 
-	num_poisoned_pages_inc(pfn);
-
-	update_per_node_mf_stats(pfn, result);
+	if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) {
+		num_poisoned_pages_inc(pfn);
+		update_per_node_mf_stats(pfn, result);
+	}
 
 	pr_err("%#lx: recovery action for %s: %s\n",
 	       pfn, action_page_types[type], action_name[result]);
@@ -1383,8 +1340,8 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
 	if (PageSlab(page))
 		return false;
 
-	/* Soft offline could migrate non-LRU movable pages */
-	if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
+	/* Soft offline could migrate movable_ops pages */
+	if ((flags & MF_SOFT_OFFLINE) && page_has_movable_ops(page))
 		return true;
 
 	return PageLRU(page) || is_free_buddy_page(page);
@@ -1556,11 +1513,39 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
 	return ret;
 }
 
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+/*
+ * The caller must guarantee the folio isn't large folio, except hugetlb.
+ * try_to_unmap() can't handle it.
+ */
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
 {
-	if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
-		struct address_space *mapping;
+	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
+	struct address_space *mapping;
+
+	if (folio_test_swapcache(folio)) {
+		pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
+		ttu &= ~TTU_HWPOISON;
+	}
+
+	/*
+	 * Propagate the dirty bit from PTEs to struct page first, because we
+	 * need this to decide if we should kill or just drop the page.
+	 * XXX: the dirty test could be racy: set_page_dirty() may not always
+	 * be called inside page lock (it's recommended but not enforced).
+	 */
+	mapping = folio_mapping(folio);
+	if (!must_kill && !folio_test_dirty(folio) && mapping &&
+	    mapping_can_writeback(mapping)) {
+		if (folio_mkclean(folio)) {
+			folio_set_dirty(folio);
+		} else {
+			ttu &= ~TTU_HWPOISON;
+			pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
+				pfn);
+		}
+	}
 
+	if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
 		/*
 		 * For hugetlb folios in shared mappings, try_to_unmap
 		 * could potentially call huge_pmd_unshare. Because of
@@ -1572,7 +1557,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
 		if (!mapping) {
 			pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
 				folio_pfn(folio));
-			return;
+			return -EBUSY;
 		}
 
 		try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
@@ -1580,6 +1565,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
 	} else {
 		try_to_unmap(folio, ttu);
 	}
+
+	return folio_mapped(folio) ? -EBUSY : 0;
 }
 
 /*
@@ -1589,8 +1576,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
 static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
 		unsigned long pfn, int flags)
 {
-	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
-	struct address_space *mapping;
 	LIST_HEAD(tokill);
 	bool unmap_success;
 	int forcekill;
@@ -1613,29 +1598,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
 	if (!folio_mapped(folio))
 		return true;
 
-	if (folio_test_swapcache(folio)) {
-		pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
-		ttu &= ~TTU_HWPOISON;
-	}
-
-	/*
-	 * Propagate the dirty bit from PTEs to struct page first, because we
-	 * need this to decide if we should kill or just drop the page.
-	 * XXX: the dirty test could be racy: set_page_dirty() may not always
-	 * be called inside page lock (it's recommended but not enforced).
-	 */
-	mapping = folio_mapping(folio);
-	if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
-	    mapping_can_writeback(mapping)) {
-		if (folio_mkclean(folio)) {
-			folio_set_dirty(folio);
-		} else {
-			ttu &= ~TTU_HWPOISON;
-			pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
-				pfn);
-		}
-	}
-
 	/*
 	 * First collect all the processes that have the page
 	 * mapped in dirty form. This has to be done before try_to_unmap,
@@ -1643,9 +1605,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
 	 */
 	collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
 
-	unmap_poisoned_folio(folio, ttu);
-
-	unmap_success = !folio_mapped(folio);
+	unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
 	if (!unmap_success)
 		pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
 		       pfn, folio_mapcount(folio));
@@ -1685,10 +1645,10 @@ static int identify_page_state(unsigned long pfn, struct page *p,
 	 * carried out only if the first check can't determine the page status.
 	 */
 	for (ps = error_states;; ps++)
-		if ((p->flags & ps->mask) == ps->res)
+		if ((p->flags.f & ps->mask) == ps->res)
			break;
 
-	page_flags |= (p->flags & (1UL << PG_dirty));
+	page_flags |= (p->flags.f & (1UL << PG_dirty));
 
 	if (!ps->mask)
 		for (ps = error_states;; ps++)
@@ -1702,12 +1662,13 @@ static int identify_page_state(unsigned long pfn, struct page *p,
  * there is still more to do, hence the page refcount we took earlier
  * is still needed.
  */
-static int try_to_split_thp_page(struct page *page, bool release)
+static int try_to_split_thp_page(struct page *page, unsigned int new_order,
+		bool release)
 {
 	int ret;
 
 	lock_page(page);
-	ret = split_huge_page(page);
+	ret = split_huge_page_to_order(page, new_order);
 	unlock_page(page);
 
 	if (ret && release)
@@ -2072,12 +2033,11 @@ retry:
 		*hugetlb = 0;
 		return 0;
 	} else if (res == -EHWPOISON) {
-		pr_err("%#lx: already hardware poisoned\n", pfn);
 		if (flags & MF_ACTION_REQUIRED) {
 			folio = page_folio(p);
 			res = kill_accessing_process(current, folio_pfn(folio), flags);
-			action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
 		}
+		action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
 		return res;
 	} else if (res == -EBUSY) {
 		if (!(flags & MF_NO_RETRY)) {
@@ -2115,7 +2075,7 @@ retry:
 		return action_result(pfn, MF_MSG_FREE_HUGE, res);
 	}
 
-	page_flags = folio->flags;
+	page_flags = folio->flags.f;
 
 	if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
 		folio_unlock(folio);
@@ -2190,8 +2150,140 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
 {
 	LIST_HEAD(tokill);
 
+	folio_lock(folio);
 	collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
+	folio_unlock(folio);
+
 	kill_procs(&tokill, true, pfn, flags);
 }
+
+int register_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	guard(mutex)(&pfn_space_lock);
+
+	if (interval_tree_iter_first(&pfn_space_itree,
+				     pfn_space->node.start,
+				     pfn_space->node.last))
+		return -EBUSY;
+
+	interval_tree_insert(&pfn_space->node, &pfn_space_itree);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_pfn_address_space);
+
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	guard(mutex)(&pfn_space_lock);
+
+	if (interval_tree_iter_first(&pfn_space_itree,
+				     pfn_space->node.start,
+				     pfn_space->node.last))
+		interval_tree_remove(&pfn_space->node, &pfn_space_itree);
+}
+EXPORT_SYMBOL_GPL(unregister_pfn_address_space);
+
+static void add_to_kill_pfn(struct task_struct *tsk,
+			    struct vm_area_struct *vma,
+			    struct list_head *to_kill,
+			    unsigned long pfn)
+{
+	struct to_kill *tk;
+
+	tk = kmalloc(sizeof(*tk), GFP_ATOMIC);
+	if (!tk) {
+		pr_info("Unable to kill proc %d\n", tsk->pid);
+		return;
+	}
+
+	/* Check for pgoff not backed by struct page */
+	tk->addr = vma_address(vma, pfn, 1);
+	tk->size_shift = PAGE_SHIFT;
+
+	if (tk->addr == -EFAULT)
+		pr_info("Unable to find address %lx in %s\n",
+			pfn, tsk->comm);
+
+	get_task_struct(tsk);
+	tk->tsk = tsk;
+	list_add_tail(&tk->nd, to_kill);
+}
+
+/*
+ * Collect processes when the error hit a PFN not backed by struct page.
+ */
+static void collect_procs_pfn(struct address_space *mapping,
+			      unsigned long pfn, struct list_head *to_kill)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+
+	i_mmap_lock_read(mapping);
+	rcu_read_lock();
+	for_each_process(tsk) {
+		struct task_struct *t = tsk;
+
+		t = task_early_kill(tsk, true);
+		if (!t)
+			continue;
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) {
+			if (vma->vm_mm == t->mm)
+				add_to_kill_pfn(t, vma, to_kill, pfn);
+		}
+	}
+	rcu_read_unlock();
+	i_mmap_unlock_read(mapping);
+}
+
+/**
+ * memory_failure_pfn - Handle memory failure on a page not backed by
+ * struct page.
+ * @pfn: Page Number of the corrupted page
+ * @flags: fine tune action taken
+ *
+ * Return:
+ *   0      - success,
+ *   -EBUSY - Page PFN does not belong to any address space mapping.
+ */
+static int memory_failure_pfn(unsigned long pfn, int flags)
+{
+	struct interval_tree_node *node;
+	LIST_HEAD(tokill);
+
+	scoped_guard(mutex, &pfn_space_lock) {
+		bool mf_handled = false;
+
+		/*
+		 * Modules registers with MM the address space mapping to
+		 * the device memory they manage. Iterate to identify
+		 * exactly which address space has mapped to this failing
+		 * PFN.
+		 */
+		for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node;
+		     node = interval_tree_iter_next(node, pfn, pfn)) {
+			struct pfn_address_space *pfn_space =
+				container_of(node, struct pfn_address_space, node);
+
+			collect_procs_pfn(pfn_space->mapping, pfn, &tokill);
+
+			mf_handled = true;
+		}
+
+		if (!mf_handled)
+			return action_result(pfn, MF_MSG_PFN_MAP, MF_IGNORED);
+	}
+
+	/*
+	 * Unlike System-RAM there is no possibility to swap in a different
+	 * physical page at a given virtual address, so all userspace
+	 * consumption of direct PFN memory necessitates SIGBUS (i.e.
+	 * MF_MUST_KILL)
+	 */
+	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+	kill_procs(&tokill, true, pfn, flags);
+
+	return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED);
+}
+
 /**
@@ -2211,9 +2303,13 @@
  * Must run in process context (e.g. a work queue) with interrupts
  * enabled and no spinlocks held.
  *
- * Return: 0 for successfully handled the memory error,
- *         -EOPNOTSUPP for hwpoison_filter() filtered the error event,
- *         < 0(except -EOPNOTSUPP) on failure.
+ * Return:
+ *   0 - success,
+ *   -ENXIO - memory not managed by the kernel
+ *   -EOPNOTSUPP - hwpoison_filter() filtered the error event,
+ *   -EHWPOISON - the page was already poisoned, potentially
+ *                kill process,
+ *   other negative values - failure.
  */
 int memory_failure(unsigned long pfn, int flags)
 {
@@ -2239,8 +2335,16 @@ int memory_failure(unsigned long pfn, int flags)
 	if (res == 0)
 		goto unlock_mutex;
 
+	if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) {
+		/*
+		 * The PFN is not backed by struct page.
+		 */
+		res = memory_failure_pfn(pfn, flags);
+		goto unlock_mutex;
+	}
+
 	if (pfn_valid(pfn)) {
-		pgmap = get_dev_pagemap(pfn, NULL);
+		pgmap = get_dev_pagemap(pfn);
 		put_ref_page(pfn, flags);
 		if (pgmap) {
 			res = memory_failure_dev_pagemap(pfn, flags,
@@ -2259,7 +2363,6 @@ try_again:
 		goto unlock_mutex;
 
 	if (TestSetPageHWPoison(p)) {
-		pr_err("%#lx: already hardware poisoned\n", pfn);
 		res = -EHWPOISON;
 		if (flags & MF_ACTION_REQUIRED)
 			res = kill_accessing_process(current, pfn, flags);
@@ -2321,6 +2424,9 @@ try_again:
 		folio_unlock(folio);
 
 		if (folio_test_large(folio)) {
+			const int new_order = min_order_for_split(folio);
+			int err;
+
 			/*
 			 * The flag must be set after the refcount is bumped
 			 * otherwise it may race with THP split.
@@ -2335,7 +2441,16 @@ try_again:
 			 * page is a valid handlable page.
 			 */
 			folio_set_has_hwpoisoned(folio);
-			if (try_to_split_thp_page(p, false) < 0) {
+			err = try_to_split_thp_page(p, new_order, /* release= */ false);
+			/*
+			 * If splitting a folio to order-0 fails, kill the process.
+			 * Split the folio regardless to minimize unusable pages.
+			 * Because the memory failure code cannot handle large
+			 * folios, this split is always treated as if it failed.
+			 */
+			if (err || new_order) {
+				/* get folio again in case the original one is split */
+				folio = page_folio(p);
 				res = -EHWPOISON;
 				kill_procs_now(p, pfn, flags, folio);
 				put_page(p);
@@ -2372,7 +2487,7 @@ try_again:
 	 * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
 	 * status correctly, we save a copy of the page flags at this time.
 	 */
-	page_flags = folio->flags;
+	page_flags = folio->flags.f;
 
 	/*
 	 * __munlock_folio() may clear a writeback folio's LRU flag without
@@ -2495,19 +2610,6 @@ static void memory_failure_work_func(struct work_struct *work)
 	}
 }
 
-/*
- * Process memory_failure work queued on the specified CPU.
- * Used to avoid return-to-userspace racing with the memory_failure workqueue.
- */
-void memory_failure_queue_kick(int cpu)
-{
-	struct memory_failure_cpu *mf_cpu;
-
-	mf_cpu = &per_cpu(memory_failure_cpu, cpu);
-	cancel_work_sync(&mf_cpu->work);
-	memory_failure_work_func(&mf_cpu->work);
-}
-
 static int __init memory_failure_init(void)
 {
 	struct memory_failure_cpu *mf_cpu;
@@ -2556,10 +2658,9 @@ int unpoison_memory(unsigned long pfn)
 	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
 					DEFAULT_RATELIMIT_BURST);
 
-	if (!pfn_valid(pfn))
-		return -ENXIO;
-
-	p = pfn_to_page(pfn);
+	p = pfn_to_online_page(pfn);
+	if (!p)
+		return -EIO;
 	folio = page_folio(p);
 
 	mutex_lock(&mf_mutex);
@@ -2676,7 +2777,17 @@ static int soft_offline_in_use_page(struct page *page)
 	};
 
 	if (!huge && folio_test_large(folio)) {
-		if (try_to_split_thp_page(page, true)) {
+		const int new_order = min_order_for_split(folio);
+
+		/*
+		 * If new_order (target split order) is not 0, do not split the
+		 * folio at all to retain the still accessible large folio.
+		 * NOTE: if minimizing the number of soft offline pages is
+		 * preferred, split it to non-zero new_order like it is done in
+		 * memory_failure().
+		 */
+		if (new_order || try_to_split_thp_page(page, /* new_order= */ 0,
+						       /* release= */ true)) {
 			pr_info("%#lx: thp split failed\n", pfn);
 			return -EBUSY;
 		}
@@ -2731,13 +2842,13 @@ static int soft_offline_in_use_page(struct page *page)
 			putback_movable_pages(&pagelist);
 
 			pr_info("%#lx: %s migration failed %ld, type %pGp\n",
-				pfn, msg_page[huge], ret, &page->flags);
+				pfn, msg_page[huge], ret, &page->flags.f);
 			if (ret > 0)
 				ret = -EBUSY;
 		}
 	} else {
 		pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n",
-			pfn, msg_page[huge], page_count(page), &page->flags);
+			pfn, msg_page[huge], page_count(page), &page->flags.f);
 		ret = -EBUSY;
 	}
 	return ret;
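With this change the hwpoison filter is no longer a set of exported tunables (hwpoison_filter_enable, hwpoison_filter_dev_major/minor, hwpoison_filter_flags_mask/value, hwpoison_filter_memcg); it is a single callback registered at runtime. A minimal sketch of how an injection module might plug in follows; the callback typedef (an int function taking a struct page *, nonzero meaning "filter this event out") and its home in <linux/memory-failure.h> are assumptions read off the hunks above, not confirmed by this diff alone:

	#include <linux/module.h>
	#include <linux/memory-failure.h>

	/* Hypothetical filter: nonzero return suppresses handling of this page. */
	static int my_hwpoison_filter(struct page *p)
	{
		return 0;	/* accept every event in this sketch */
	}

	static int __init my_filter_init(void)
	{
		hwpoison_filter_register(my_hwpoison_filter);
		return 0;
	}

	static void __exit my_filter_exit(void)
	{
		/*
		 * hwpoison_filter_unregister() ends with synchronize_rcu(),
		 * so no CPU can still be inside the callback once the module
		 * text is freed.
		 */
		hwpoison_filter_unregister();
	}

	module_init(my_filter_init);
	module_exit(my_filter_exit);
	MODULE_LICENSE("GPL");

Note that there is a single filter slot: a second hwpoison_filter_register() call silently replaces the first, so cooperating injectors would have to serialize among themselves.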

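register_pfn_address_space() is the companion interface for device memory mapped with VM_PFNMAP, where no struct page exists: a driver publishes the PFN range it owns together with the address_space backing its mappings, and errors in that range are then routed to memory_failure_pfn(). A driver-side sketch follows; the field names mirror the accesses visible in the diff (node.start, node.last, ->mapping), while the surrounding function names and bind/unbind flow are purely illustrative:

	#include <linux/memory-failure.h>
	#include <linux/pfn.h>
	#include <linux/fs.h>

	static struct pfn_address_space my_pfn_space;

	/* Hypothetical bind path: claim the PFN range of a device BAR. */
	static int my_driver_bind(struct inode *inode, phys_addr_t bar_base,
				  size_t bar_size)
	{
		my_pfn_space.node.start = PHYS_PFN(bar_base);
		my_pfn_space.node.last = PHYS_PFN(bar_base + bar_size) - 1;
		/* i_mmap of this mapping is walked to find the VMAs to kill. */
		my_pfn_space.mapping = inode->i_mapping;

		/* Returns -EBUSY if the range overlaps one already registered. */
		return register_pfn_address_space(&my_pfn_space);
	}

	static void my_driver_unbind(void)
	{
		unregister_pfn_address_space(&my_pfn_space);
	}

Unlike page-backed memory, a poisoned PFN-map page cannot be replaced under the mapping, which is why memory_failure_pfn() forces MF_ACTION_REQUIRED | MF_MUST_KILL and delivers SIGBUS instead of attempting recovery.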