Diffstat (limited to 'mm')
-rw-r--r--   mm/Makefile               |    2
-rw-r--r--   mm/bootmem_info.c         |    7
-rw-r--r--   mm/filemap.c              | 1036
-rw-r--r--   mm/folio-compat.c         |   11
-rw-r--r--   mm/huge_memory.c          |   18
-rw-r--r--   mm/internal.h             |   14
-rw-r--r--   mm/kasan/common.c         |   27
-rw-r--r--   mm/kasan/generic.c        |    8
-rw-r--r--   mm/kasan/kasan.h          |    1
-rw-r--r--   mm/kasan/quarantine.c     |    2
-rw-r--r--   mm/kasan/report.c         |   13
-rw-r--r--   mm/kasan/report_tags.c    |   10
-rw-r--r--   mm/kfence/core.c          |   17
-rw-r--r--   mm/kfence/kfence_test.c   |    6
-rw-r--r--   mm/khugepaged.c           |   12
-rw-r--r--   mm/memcontrol.c           |   55
-rw-r--r--   mm/memory-failure.c       |   19
-rw-r--r--   mm/memory.c               |   49
-rw-r--r--   mm/memremap.c             |   57
-rw-r--r--   mm/migrate.c              |   29
-rw-r--r--   mm/page-writeback.c       |    6
-rw-r--r--   mm/readahead.c            |   24
-rw-r--r--   mm/shmem.c                |  174
-rw-r--r--   mm/slab.c                 |  456
-rw-r--r--   mm/slab.h                 |  302
-rw-r--r--   mm/slab_common.c          |   14
-rw-r--r--   mm/slob.c                 |   62
-rw-r--r--   mm/slub.c                 | 1177
-rw-r--r--   mm/sparse.c               |    2
-rw-r--r--   mm/swap.c                 |   26
-rw-r--r--   mm/truncate.c             |  304
-rw-r--r--   mm/usercopy.c             |   13
-rw-r--r--   mm/zsmalloc.c             |   18
33 files changed, 2098 insertions, 1873 deletions
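
The mm/filemap.c changes below convert the page-cache lookup and read paths from struct page and struct pagevec to struct folio and struct folio_batch: find_get_entries(), find_lock_entries() and filemap_get_read_batch() now fill a folio_batch, and callers drop references with folio_put(). The sketch that follows is a minimal illustration of the resulting calling pattern, using only helpers that appear in this diff; the walk_cached_range() wrapper, the header choices and the surrounding locking context are assumptions, and find_lock_entries() is declared in mm/internal.h, so real users of this pattern live under mm/ (for example mm/truncate.c and mm/shmem.c).

#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include "internal.h"   /* mm-internal: declares find_lock_entries() */

/* Sketch only: walk a range of an address_space with the folio_batch API. */
static void walk_cached_range(struct address_space *mapping,
                              pgoff_t start, pgoff_t end)
{
        struct folio_batch fbatch;
        pgoff_t indices[PAGEVEC_SIZE];
        unsigned int i;

        folio_batch_init(&fbatch);
        while (find_lock_entries(mapping, start, end, &fbatch, indices)) {
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        /* Shadow, swap and DAX entries come back as values. */
                        if (xa_is_value(folio))
                                continue;

                        /* The folio is locked and referenced at this point. */
                        folio_unlock(folio);
                }

                /* Resume after the last entry returned in this batch. */
                start = indices[folio_batch_count(&fbatch) - 1] + 1;

                /* Drop the references taken by find_lock_entries(). */
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        if (!xa_is_value(fbatch.folios[i]))
                                folio_put(fbatch.folios[i]);
                folio_batch_init(&fbatch);
        }
}

In-tree callers normally drop the batch references with folio_batch_release() rather than the open-coded loop above; the loop is kept here only to stay within the helpers visible in this hunk.
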
diff --git a/mm/Makefile b/mm/Makefile index 5c5a3a480fa6..588d3113f3b0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -15,6 +15,8 @@ KCSAN_SANITIZE_slab_common.o := n KCSAN_SANITIZE_slab.o := n KCSAN_SANITIZE_slub.o := n KCSAN_SANITIZE_page_alloc.o := n +# But enable explicit instrumentation for memory barriers. +KCSAN_INSTRUMENT_BARRIERS := y # These files are disabled because they produce non-interesting and/or # flaky coverage that is not a function of syscall inputs. E.g. slab is out of diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index f03f42f426f6..f18a631e7479 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -15,7 +15,7 @@ void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) { - page->freelist = (void *)type; + page->index = type; SetPagePrivate(page); set_page_private(page, info); page_ref_inc(page); @@ -23,14 +23,13 @@ void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) void put_page_bootmem(struct page *page) { - unsigned long type; + unsigned long type = page->index; - type = (unsigned long) page->freelist; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (page_ref_dec_return(page) == 1) { - page->freelist = NULL; + page->index = 0; ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); diff --git a/mm/filemap.c b/mm/filemap.c index 39c4c46c6133..2fd9b2f24025 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -121,99 +121,97 @@ */ static void page_cache_delete(struct address_space *mapping, - struct page *page, void *shadow) + struct folio *folio, void *shadow) { - XA_STATE(xas, &mapping->i_pages, page->index); - unsigned int nr = 1; + XA_STATE(xas, &mapping->i_pages, folio->index); + long nr = 1; mapping_set_update(&xas, mapping); /* hugetlb pages are represented by a single entry in the xarray */ - if (!PageHuge(page)) { - xas_set_order(&xas, page->index, compound_order(page)); - nr = compound_nr(page); + if (!folio_test_hugetlb(folio)) { + xas_set_order(&xas, folio->index, folio_order(folio)); + nr = folio_nr_pages(folio); } - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(nr != 1 && shadow, page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); xas_store(&xas, shadow); xas_init_marks(&xas); - page->mapping = NULL; + folio->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } -static void unaccount_page_cache_page(struct address_space *mapping, - struct page *page) +static void filemap_unaccount_folio(struct address_space *mapping, + struct folio *folio) { - int nr; + long nr; /* * if we're uptodate, flush out into the cleancache, otherwise * invalidate any existing cleancache entries. 
We can't leave * stale data around in the cleancache once our page is gone */ - if (PageUptodate(page) && PageMappedToDisk(page)) - cleancache_put_page(page); + if (folio_test_uptodate(folio) && folio_test_mappedtodisk(folio)) + cleancache_put_page(&folio->page); else - cleancache_invalidate_page(mapping, page); + cleancache_invalidate_page(mapping, &folio->page); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(page_mapped(page), page); - if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { + VM_BUG_ON_FOLIO(folio_mapped(folio), folio); + if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { int mapcount; pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", - current->comm, page_to_pfn(page)); - dump_page(page, "still mapped when deleted"); + current->comm, folio_pfn(folio)); + dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - mapcount = page_mapcount(page); + mapcount = page_mapcount(&folio->page); if (mapping_exiting(mapping) && - page_count(page) >= mapcount + 2) { + folio_ref_count(folio) >= mapcount + 2) { /* * All vmas have already been torn down, so it's - * a good bet that actually the page is unmapped, + * a good bet that actually the folio is unmapped, * and we'd prefer not to leak it: if we're wrong, * some other bad page check should catch it later. */ - page_mapcount_reset(page); - page_ref_sub(page, mapcount); + page_mapcount_reset(&folio->page); + folio_ref_sub(folio, mapcount); } } - /* hugetlb pages do not participate in page cache accounting. */ - if (PageHuge(page)) + /* hugetlb folios do not participate in page cache accounting. */ + if (folio_test_hugetlb(folio)) return; - nr = thp_nr_pages(page); + nr = folio_nr_pages(folio); - __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); - if (PageSwapBacked(page)) { - __mod_lruvec_page_state(page, NR_SHMEM, -nr); - if (PageTransHuge(page)) - __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr); - } else if (PageTransHuge(page)) { - __mod_lruvec_page_state(page, NR_FILE_THPS, -nr); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + if (folio_test_pmd_mappable(folio)) + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); + } else if (folio_test_pmd_mappable(folio)) { + __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } /* - * At this point page must be either written or cleaned by - * truncate. Dirty page here signals a bug and loss of + * At this point folio must be either written or cleaned by + * truncate. Dirty folio here signals a bug and loss of * unwritten data. * - * This fixes dirty accounting after removing the page entirely - * but leaves PageDirty set: it has no effect for truncated - * page and anyway will be cleared before returning page into + * This fixes dirty accounting after removing the folio entirely + * but leaves the dirty flag set: it has no effect for truncated + * folio and anyway will be cleared before returning folio to * buddy allocator. */ - if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); + if (WARN_ON_ONCE(folio_test_dirty(folio))) + folio_account_cleaned(folio, mapping, + inode_to_wb(mapping->host)); } /* @@ -221,87 +219,83 @@ static void unaccount_page_cache_page(struct address_space *mapping, * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the i_pages lock. 
*/ -void __delete_from_page_cache(struct page *page, void *shadow) +void __filemap_remove_folio(struct folio *folio, void *shadow) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; - trace_mm_filemap_delete_from_page_cache(page); - - unaccount_page_cache_page(mapping, page); - page_cache_delete(mapping, page, shadow); + trace_mm_filemap_delete_from_page_cache(folio); + filemap_unaccount_folio(mapping, folio); + page_cache_delete(mapping, folio, shadow); } -static void page_cache_free_page(struct address_space *mapping, - struct page *page) +void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*freepage)(struct page *); freepage = mapping->a_ops->freepage; if (freepage) - freepage(page); + freepage(&folio->page); - if (PageTransHuge(page) && !PageHuge(page)) { - page_ref_sub(page, thp_nr_pages(page)); - VM_BUG_ON_PAGE(page_count(page) <= 0, page); + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) { + folio_ref_sub(folio, folio_nr_pages(folio)); + VM_BUG_ON_FOLIO(folio_ref_count(folio) <= 0, folio); } else { - put_page(page); + folio_put(folio); } } /** - * delete_from_page_cache - delete page from page cache - * @page: the page which the kernel is trying to remove from page cache + * filemap_remove_folio - Remove folio from page cache. + * @folio: The folio. * - * This must be called only on pages that have been verified to be in the page - * cache and locked. It will never put the page into the free list, the caller - * has a reference on the page. + * This must be called only on folios that are locked and have been + * verified to be in the page cache. It will never put the folio into + * the free list because the caller has a reference on the page. */ -void delete_from_page_cache(struct page *page) +void filemap_remove_folio(struct folio *folio) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping = folio->mapping; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - __delete_from_page_cache(page, NULL); + __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - page_cache_free_page(mapping, page); + filemap_free_folio(mapping, folio); } -EXPORT_SYMBOL(delete_from_page_cache); /* - * page_cache_delete_batch - delete several pages from page cache - * @mapping: the mapping to which pages belong - * @pvec: pagevec with pages to delete + * page_cache_delete_batch - delete several folios from page cache + * @mapping: the mapping to which folios belong + * @fbatch: batch of folios to delete * - * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index - * and is optimised for it to be dense. - * It tolerates holes in @pvec (mapping entries at those indices are not - * modified). The function expects only THP head pages to be present in the - * @pvec. + * The function walks over mapping->i_pages and removes folios passed in + * @fbatch from the mapping. The function expects @fbatch to be sorted + * by page index and is optimised for it to be dense. + * It tolerates holes in @fbatch (mapping entries at those indices are not + * modified). * * The function expects the i_pages lock to be held. 
*/ static void page_cache_delete_batch(struct address_space *mapping, - struct pagevec *pvec) + struct folio_batch *fbatch) { - XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); - int total_pages = 0; + XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); + long total_pages = 0; int i = 0; - struct page *page; + struct folio *folio; mapping_set_update(&xas, mapping); - xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec)) + xas_for_each(&xas, folio, ULONG_MAX) { + if (i >= folio_batch_count(fbatch)) break; /* A swap/dax/shadow entry got inserted? Skip it. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; /* * A page got inserted in our range? Skip it. We have our @@ -310,54 +304,48 @@ static void page_cache_delete_batch(struct address_space *mapping, * means our page has been removed, which shouldn't be * possible because we're holding the PageLock. */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, - page); + if (folio != fbatch->folios[i]) { + VM_BUG_ON_FOLIO(folio->index > + fbatch->folios[i]->index, folio); continue; } - WARN_ON_ONCE(!PageLocked(page)); + WARN_ON_ONCE(!folio_test_locked(folio)); - if (page->index == xas.xa_index) - page->mapping = NULL; - /* Leave page->index set: truncation lookup relies on it */ + folio->mapping = NULL; + /* Leave folio->index set: truncation lookup relies on it */ - /* - * Move to the next page in the vector if this is a regular - * page or the index is of the last sub-page of this compound - * page. - */ - if (page->index + compound_nr(page) - 1 == xas.xa_index) - i++; + i++; xas_store(&xas, NULL); - total_pages++; + total_pages += folio_nr_pages(folio); } mapping->nrpages -= total_pages; } void delete_from_page_cache_batch(struct address_space *mapping, - struct pagevec *pvec) + struct folio_batch *fbatch) { int i; - if (!pagevec_count(pvec)) + if (!folio_batch_count(fbatch)) return; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - for (i = 0; i < pagevec_count(pvec); i++) { - trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; - unaccount_page_cache_page(mapping, pvec->pages[i]); + trace_mm_filemap_delete_from_page_cache(folio); + filemap_unaccount_folio(mapping, folio); } - page_cache_delete_batch(mapping, pvec); + page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - for (i = 0; i < pagevec_count(pvec); i++) - page_cache_free_page(mapping, pvec->pages[i]); + for (i = 0; i < folio_batch_count(fbatch); i++) + filemap_free_folio(mapping, fbatch->folios[i]); } int filemap_check_errors(struct address_space *mapping) @@ -646,8 +634,8 @@ static bool mapping_needs_writeback(struct address_space *mapping) return mapping->nrpages; } -static bool filemap_range_has_writeback(struct address_space *mapping, - loff_t start_byte, loff_t end_byte) +bool filemap_range_has_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; @@ -667,34 +655,8 @@ static bool filemap_range_has_writeback(struct address_space *mapping, } rcu_read_unlock(); return page != NULL; - -} - -/** - * filemap_range_needs_writeback - check if range potentially needs writeback - * @mapping: address space within which to check - * @start_byte: offset 
in bytes where the range starts - * @end_byte: offset in bytes where the range ends (inclusive) - * - * Find at least one page in the range supplied, usually used to check if - * direct writing in this range will trigger a writeback. Used by O_DIRECT - * read/write with IOCB_NOWAIT, to see if the caller needs to do - * filemap_write_and_wait_range() before proceeding. - * - * Return: %true if the caller should do filemap_write_and_wait_range() before - * doing O_DIRECT to a page in this range, %false otherwise. - */ -bool filemap_range_needs_writeback(struct address_space *mapping, - loff_t start_byte, loff_t end_byte) -{ - if (!mapping_needs_writeback(mapping)) - return false; - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) - return false; - return filemap_range_has_writeback(mapping, start_byte, end_byte); } -EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); +EXPORT_SYMBOL_GPL(filemap_range_has_writeback); /** * filemap_write_and_wait_range - write out & wait on a file range @@ -959,7 +921,7 @@ unlock: goto error; } - trace_mm_filemap_add_to_page_cache(&folio->page); + trace_mm_filemap_add_to_page_cache(folio); return 0; error: folio->mapping = NULL; @@ -1259,10 +1221,10 @@ enum behavior { * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like - * wait_on_page_writeback() waiting on PG_writeback. + * folio_wait_writeback() waiting on PG_writeback. */ DROP, /* Drop ref to page before wait, no check when woken, - * like put_and_wait_on_page_locked() on PG_locked. + * like folio_put_wait_locked() on PG_locked. */ }; @@ -1439,22 +1401,21 @@ int folio_wait_bit_killable(struct folio *folio, int bit_nr) EXPORT_SYMBOL(folio_wait_bit_killable); /** - * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked - * @page: The page to wait for. + * folio_put_wait_locked - Drop a reference and wait for it to be unlocked + * @folio: The folio to wait for. * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * - * The caller should hold a reference on @page. They expect the page to + * The caller should hold a reference on @folio. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration - * (for example) by holding the reference while waiting for the page to + * (for example) by holding the reference while waiting for the folio to * come unlocked. After this function returns, the caller should not - * dereference @page. + * dereference @folio. * - * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal. + * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. 
*/ -int put_and_wait_on_page_locked(struct page *page, int state) +int folio_put_wait_locked(struct folio *folio, int state) { - return folio_wait_bit_common(page_folio(page), PG_locked, state, - DROP); + return folio_wait_bit_common(folio, PG_locked, state, DROP); } /** @@ -1979,37 +1940,36 @@ no_page: } EXPORT_SYMBOL(__filemap_get_folio); -static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, +static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) { - struct page *page; + struct folio *folio; retry: if (mark == XA_PRESENT) - page = xas_find(xas, max); + folio = xas_find(xas, max); else - page = xas_find_marked(xas, max, mark); + folio = xas_find_marked(xas, max, mark); - if (xas_retry(xas, page)) + if (xas_retry(xas, folio)) goto retry; /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it * without attempting to raise page count. */ - if (!page || xa_is_value(page)) - return page; + if (!folio || xa_is_value(folio)) + return folio; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) goto reset; - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(xas))) { - put_page(page); + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); goto reset; } - return page; + return folio; reset: xas_reset(xas); goto retry; @@ -2020,56 +1980,36 @@ reset: * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). - * @pvec: Where the resulting entries are placed. + * @fbatch: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * * find_get_entries() will search for and return a batch of entries in - * the mapping. The entries are placed in @pvec. find_get_entries() - * takes a reference on any actual pages it returns. + * the mapping. The entries are placed in @fbatch. find_get_entries() + * takes a reference on any actual folios it returns. * - * The search returns a group of mapping-contiguous page cache entries - * with ascending indexes. There may be holes in the indices due to - * not-present pages. + * The entries have ascending indexes. The indices may not be consecutive + * due to not-present entries or large folios. * - * Any shadow entries of evicted pages, or swap entries from + * Any shadow entries of evicted folios, or swap entries from * shmem/tmpfs, are included in the returned array. * - * If it finds a Transparent Huge Page, head or tail, find_get_entries() - * stops at that page: the caller is likely to have a better way to handle - * the compound page as a whole, and then skip its extent, than repeatedly - * calling find_get_entries() to return all its tails. - * - * Return: the number of pages and shadow entries which were found. + * Return: The number of entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices) + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; - unsigned int ret = 0; - unsigned nr_entries = PAGEVEC_SIZE; + struct folio *folio; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { - /* - * Terminate early on finding a THP, to allow the caller to - * handle it all at once; but continue if this is hugetlbfs. 
- */ - if (!xa_is_value(page) && PageTransHuge(page) && - !PageHuge(page)) { - page = find_subpage(page, xas.xa_index); - nr_entries = ret + 1; - } - - indices[ret] = xas.xa_index; - pvec->pages[ret] = page; - if (++ret == nr_entries) + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { + indices[fbatch->nr] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) break; } rcu_read_unlock(); - pvec->nr = ret; - return ret; + return folio_batch_count(fbatch); } /** @@ -2077,63 +2017,64 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, * @mapping: The address_space to search. * @start: The starting page cache index. * @end: The final page index (inclusive). - * @pvec: Where the resulting entries are placed. - * @indices: The cache indices of the entries in @pvec. + * @fbatch: Where the resulting entries are placed. + * @indices: The cache indices of the entries in @fbatch. * * find_lock_entries() will return a batch of entries from @mapping. - * Swap, shadow and DAX entries are included. Pages are returned - * locked and with an incremented refcount. Pages which are locked by - * somebody else or under writeback are skipped. Only the head page of - * a THP is returned. Pages which are partially outside the range are - * not returned. + * Swap, shadow and DAX entries are included. Folios are returned + * locked and with an incremented refcount. Folios which are locked + * by somebody else or under writeback are skipped. Folios which are + * partially outside the range are not returned. * * The entries have ascending indexes. The indices may not be consecutive - * due to not-present entries, THP pages, pages which could not be locked - * or pages under writeback. + * due to not-present entries, large folios, folios which could not be + * locked or folios under writeback. * * Return: The number of entries which were found. 
*/ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices) + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; + struct folio *folio; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { - if (!xa_is_value(page)) { - if (page->index < start) + while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { + if (!xa_is_value(folio)) { + if (folio->index < start) goto put; - if (page->index + thp_nr_pages(page) - 1 > end) + if (folio->index + folio_nr_pages(folio) - 1 > end) goto put; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto put; - if (page->mapping != mapping || PageWriteback(page)) + if (folio->mapping != mapping || + folio_test_writeback(folio)) goto unlock; - VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index), - page); + VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), + folio); } - indices[pvec->nr] = xas.xa_index; - if (!pagevec_add(pvec, page)) + indices[fbatch->nr] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) break; - goto next; + continue; unlock: - unlock_page(page); + folio_unlock(folio); put: - put_page(page); -next: - if (!xa_is_value(page) && PageTransHuge(page)) { - unsigned int nr_pages = thp_nr_pages(page); - - /* Final THP may cross MAX_LFS_FILESIZE on 32-bit */ - xas_set(&xas, page->index + nr_pages); - if (xas.xa_index < nr_pages) - break; - } + folio_put(folio); } rcu_read_unlock(); - return pagevec_count(pvec); + return folio_batch_count(fbatch); +} + +static inline +bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) +{ + if (!folio_test_large(folio) || folio_test_hugetlb(folio)) + return false; + if (index >= max) + return false; + return index < folio->index + folio_nr_pages(folio) - 1; } /** @@ -2162,23 +2103,29 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, struct page **pages) { XA_STATE(xas, &mapping->i_pages, *start); - struct page *page; + struct folio *folio; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, XA_PRESENT))) { + while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { /* Skip over shadow, swap and DAX entries */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - pages[ret] = find_subpage(page, xas.xa_index); +again: + pages[ret] = folio_file_page(folio, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } + if (folio_more_pages(folio, xas.xa_index, end)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } } /* @@ -2213,36 +2160,41 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { XA_STATE(xas, &mapping->i_pages, index); - struct page *page; + struct folio *folio; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - for (page = xas_load(&xas); page; page = xas_next(&xas)) { - if (xas_retry(&xas, page)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; /* * If the entry has been swapped out, we can stop looking. * No current caller is looking for DAX entries. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) break; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) goto retry; - /* Has the page moved or been split? 
*/ - if (unlikely(page != xas_reload(&xas))) + if (unlikely(folio != xas_reload(&xas))) goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); +again: + pages[ret] = folio_file_page(folio, xas.xa_index); if (++ret == nr_pages) break; + if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } continue; put_page: - put_page(page); + folio_put(folio); retry: xas_reset(&xas); } @@ -2271,25 +2223,25 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, struct page **pages) { XA_STATE(xas, &mapping->i_pages, *index); - struct page *page; + struct folio *folio; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - while ((page = find_get_entry(&xas, end, tag))) { + while ((folio = find_get_entry(&xas, end, tag))) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict * a page we saw tagged. Skip over it. */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - pages[ret] = page; + pages[ret] = &folio->page; if (++ret == nr_pages) { - *index = page->index + thp_nr_pages(page); + *index = folio->index + folio_nr_pages(folio); goto out; } } @@ -2332,52 +2284,50 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) } /* - * filemap_get_read_batch - Get a batch of pages for read + * filemap_get_read_batch - Get a batch of folios for read * - * Get a batch of pages which represent a contiguous range of bytes - * in the file. No tail pages will be returned. If @index is in the - * middle of a THP, the entire THP will be returned. The last page in - * the batch may have Readahead set or be not Uptodate so that the - * caller can take the appropriate action. + * Get a batch of folios which represent a contiguous range of bytes in + * the file. No exceptional entries will be returned. If @index is in + * the middle of a folio, the entire folio will be returned. The last + * folio in the batch may have the readahead flag set or the uptodate flag + * clear so that the caller can take the appropriate action. */ static void filemap_get_read_batch(struct address_space *mapping, - pgoff_t index, pgoff_t max, struct pagevec *pvec) + pgoff_t index, pgoff_t max, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, index); - struct page *head; + struct folio *folio; rcu_read_lock(); - for (head = xas_load(&xas); head; head = xas_next(&xas)) { - if (xas_retry(&xas, head)) + for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) continue; - if (xas.xa_index > max || xa_is_value(head)) + if (xas.xa_index > max || xa_is_value(folio)) break; - if (!page_cache_get_speculative(head)) + if (!folio_try_get_rcu(folio)) goto retry; - /* Has the page moved or been split? 
*/ - if (unlikely(head != xas_reload(&xas))) - goto put_page; + if (unlikely(folio != xas_reload(&xas))) + goto put_folio; - if (!pagevec_add(pvec, head)) + if (!folio_batch_add(fbatch, folio)) break; - if (!PageUptodate(head)) + if (!folio_test_uptodate(folio)) break; - if (PageReadahead(head)) + if (folio_test_readahead(folio)) break; - xas.xa_index = head->index + thp_nr_pages(head) - 1; - xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK; + xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1); continue; -put_page: - put_page(head); +put_folio: + folio_put(folio); retry: xas_reset(&xas); } rcu_read_unlock(); } -static int filemap_read_page(struct file *file, struct address_space *mapping, - struct page *page) +static int filemap_read_folio(struct file *file, struct address_space *mapping, + struct folio *folio) { int error; @@ -2386,52 +2336,51 @@ static int filemap_read_page(struct file *file, struct address_space *mapping, * eg. multipath errors. PG_error will be set again if readpage * fails. */ - ClearPageError(page); + folio_clear_error(folio); /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(file, page); + error = mapping->a_ops->readpage(file, &folio->page); if (error) return error; - error = wait_on_page_locked_killable(page); + error = folio_wait_locked_killable(folio); if (error) return error; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; shrink_readahead_size_eio(&file->f_ra); return -EIO; } static bool filemap_range_uptodate(struct address_space *mapping, - loff_t pos, struct iov_iter *iter, struct page *page) + loff_t pos, struct iov_iter *iter, struct folio *folio) { int count; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ if (iov_iter_is_pipe(iter)) return false; if (!mapping->a_ops->is_partially_uptodate) return false; - if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page))) + if (mapping->host->i_blkbits >= folio_shift(folio)) return false; count = iter->count; - if (page_offset(page) > pos) { - count -= page_offset(page) - pos; + if (folio_pos(folio) > pos) { + count -= folio_pos(folio) - pos; pos = 0; } else { - pos -= page_offset(page); + pos -= folio_pos(folio); } - return mapping->a_ops->is_partially_uptodate(page, pos, count); + return mapping->a_ops->is_partially_uptodate(&folio->page, pos, count); } static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, struct iov_iter *iter, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); int error; if (iocb->ki_flags & IOCB_NOWAIT) { @@ -2447,7 +2396,11 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); - put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE); + /* + * This is where we usually end up waiting for a + * previously submitted readahead to finish. 
+ */ + folio_put_wait_locked(folio, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __folio_lock_async(folio, iocb->ki_waitq); @@ -2460,14 +2413,14 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock; error = 0; - if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page)) + if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; - error = filemap_read_page(iocb->ki_filp, mapping, &folio->page); + error = filemap_read_folio(iocb->ki_filp, mapping, folio); goto unlock_mapping; unlock: folio_unlock(folio); @@ -2478,70 +2431,72 @@ unlock_mapping: return error; } -static int filemap_create_page(struct file *file, +static int filemap_create_folio(struct file *file, struct address_space *mapping, pgoff_t index, - struct pagevec *pvec) + struct folio_batch *fbatch) { - struct page *page; + struct folio *folio; int error; - page = page_cache_alloc(mapping); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); + if (!folio) return -ENOMEM; /* - * Protect against truncate / hole punch. Grabbing invalidate_lock here - * assures we cannot instantiate and bring uptodate new pagecache pages - * after evicting page cache during truncate and before actually - * freeing blocks. Note that we could release invalidate_lock after - * inserting the page into page cache as the locked page would then be - * enough to synchronize with hole punching. But there are code paths - * such as filemap_update_page() filling in partially uptodate pages or - * ->readpages() that need to hold invalidate_lock while mapping blocks - * for IO so let's hold the lock here as well to keep locking rules - * simple. + * Protect against truncate / hole punch. Grabbing invalidate_lock + * here assures we cannot instantiate and bring uptodate new + * pagecache folios after evicting page cache during truncate + * and before actually freeing blocks. Note that we could + * release invalidate_lock after inserting the folio into + * the page cache as the locked folio would then be enough to + * synchronize with hole punching. But there are code paths + * such as filemap_update_page() filling in partially uptodate + * pages or ->readpages() that need to hold invalidate_lock + * while mapping blocks for IO so let's hold the lock here as + * well to keep locking rules simple. 
*/ filemap_invalidate_lock_shared(mapping); - error = add_to_page_cache_lru(page, mapping, index, + error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) error = AOP_TRUNCATED_PAGE; if (error) goto error; - error = filemap_read_page(file, mapping, page); + error = filemap_read_folio(file, mapping, folio); if (error) goto error; filemap_invalidate_unlock_shared(mapping); - pagevec_add(pvec, page); + folio_batch_add(fbatch, folio); return 0; error: filemap_invalidate_unlock_shared(mapping); - put_page(page); + folio_put(folio); return error; } static int filemap_readahead(struct kiocb *iocb, struct file *file, - struct address_space *mapping, struct page *page, + struct address_space *mapping, struct folio *folio, pgoff_t last_index) { + DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; - page_cache_async_readahead(mapping, &file->f_ra, file, page, - page->index, last_index - page->index); + page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, - struct pagevec *pvec) + struct folio_batch *fbatch) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; - struct page *page; + struct folio *folio; int err = 0; last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); @@ -2549,34 +2504,35 @@ retry: if (fatal_signal_pending(current)) return -EINTR; - filemap_get_read_batch(mapping, index, last_index, pvec); - if (!pagevec_count(pvec)) { + filemap_get_read_batch(mapping, index, last_index, fbatch); + if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); - filemap_get_read_batch(mapping, index, last_index, pvec); + filemap_get_read_batch(mapping, index, last_index, fbatch); } - if (!pagevec_count(pvec)) { + if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; - err = filemap_create_page(filp, mapping, - iocb->ki_pos >> PAGE_SHIFT, pvec); + err = filemap_create_folio(filp, mapping, + iocb->ki_pos >> PAGE_SHIFT, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } - page = pvec->pages[pagevec_count(pvec) - 1]; - if (PageReadahead(page)) { - err = filemap_readahead(iocb, filp, mapping, page, last_index); + folio = fbatch->folios[folio_batch_count(fbatch) - 1]; + if (folio_test_readahead(folio)) { + err = filemap_readahead(iocb, filp, mapping, folio, last_index); if (err) goto err; } - if (!PageUptodate(page)) { - if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1) + if (!folio_test_uptodate(folio)) { + if ((iocb->ki_flags & IOCB_WAITQ) && + folio_batch_count(fbatch) > 1) iocb->ki_flags |= IOCB_NOWAIT; - err = filemap_update_page(iocb, mapping, iter, page); + err = filemap_update_page(iocb, mapping, iter, folio); if (err) goto err; } @@ -2584,8 +2540,8 @@ retry: return 0; err: if (err < 0) - put_page(page); - if (likely(--pvec->nr)) + folio_put(folio); + if (likely(--fbatch->nr)) return 0; if (err == AOP_TRUNCATED_PAGE) goto retry; @@ -2612,7 +2568,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct pagevec pvec; + struct 
folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; @@ -2623,7 +2579,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - pagevec_init(&pvec); + folio_batch_init(&fbatch); do { cond_resched(); @@ -2639,7 +2595,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter, &pvec); + error = filemap_get_pages(iocb, iter, &fbatch); if (error < 0) break; @@ -2653,7 +2609,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, */ isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) - goto put_pages; + goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* @@ -2668,33 +2624,29 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, */ if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT) - mark_page_accessed(pvec.pages[0]); + folio_mark_accessed(fbatch.folios[0]); - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - size_t page_size = thp_size(page); - size_t offset = iocb->ki_pos & (page_size - 1); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + size_t fsize = folio_size(folio); + size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, - page_size - offset); + fsize - offset); size_t copied; - if (end_offset < page_offset(page)) + if (end_offset < folio_pos(folio)) break; if (i > 0) - mark_page_accessed(page); + folio_mark_accessed(folio); /* - * If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. + * If users can be writing to this folio using arbitrary + * virtual addresses, take care of potential aliasing + * before reading the folio on the kernel side. */ - if (writably_mapped) { - int j; + if (writably_mapped) + flush_dcache_folio(folio); - for (j = 0; j < thp_nr_pages(page); j++) - flush_dcache_page(page + j); - } - - copied = copy_page_to_iter(page, offset, bytes, iter); + copied = copy_folio_to_iter(folio, offset, bytes, iter); already_read += copied; iocb->ki_pos += copied; @@ -2705,10 +2657,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, break; } } -put_pages: - for (i = 0; i < pagevec_count(&pvec); i++) - put_page(pvec.pages[i]); - pagevec_reinit(&pvec); +put_folios: + for (i = 0; i < folio_batch_count(&fbatch); i++) + folio_put(fbatch.folios[i]); + folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); @@ -2793,44 +2745,44 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); -static inline loff_t page_seek_hole_data(struct xa_state *xas, - struct address_space *mapping, struct page *page, +static inline loff_t folio_seek_hole_data(struct xa_state *xas, + struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) { const struct address_space_operations *ops = mapping->a_ops; size_t offset, bsz = i_blocksize(mapping->host); - if (xa_is_value(page) || PageUptodate(page)) + if (xa_is_value(folio) || folio_test_uptodate(folio)) return seek_data ? start : end; if (!ops->is_partially_uptodate) return seek_data ? 
end : start; xas_pause(xas); rcu_read_unlock(); - lock_page(page); - if (unlikely(page->mapping != mapping)) + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) goto unlock; - offset = offset_in_thp(page, start) & ~(bsz - 1); + offset = offset_in_folio(folio, start) & ~(bsz - 1); do { - if (ops->is_partially_uptodate(page, offset, bsz) == seek_data) + if (ops->is_partially_uptodate(&folio->page, offset, bsz) == + seek_data) break; start = (start + bsz) & ~(bsz - 1); offset += bsz; - } while (offset < thp_size(page)); + } while (offset < folio_size(folio)); unlock: - unlock_page(page); + folio_unlock(folio); rcu_read_lock(); return start; } -static inline -unsigned int seek_page_size(struct xa_state *xas, struct page *page) +static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { - if (xa_is_value(page)) + if (xa_is_value(folio)) return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); - return thp_size(page); + return folio_size(folio); } /** @@ -2857,15 +2809,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); - struct page *page; + struct folio *folio; if (end <= start) return -ENXIO; rcu_read_lock(); - while ((page = find_get_entry(&xas, max, XA_PRESENT))) { + while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; - unsigned int seek_size; + size_t seek_size; if (start < pos) { if (!seek_data) @@ -2873,9 +2825,9 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, start = pos; } - seek_size = seek_page_size(&xas, page); - pos = round_up(pos + 1, seek_size); - start = page_seek_hole_data(&xas, mapping, page, start, pos, + seek_size = seek_folio_size(&xas, folio); + pos = round_up((u64)pos + 1, seek_size); + start = folio_seek_hole_data(&xas, mapping, folio, start, pos, seek_data); if (start < pos) goto unlock; @@ -2883,15 +2835,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, break; if (seek_size > PAGE_SIZE) xas_set(&xas, pos >> PAGE_SHIFT); - if (!xa_is_value(page)) - put_page(page); + if (!xa_is_value(folio)) + folio_put(folio); } if (seek_data) start = -ENXIO; unlock: rcu_read_unlock(); - if (page && !xa_is_value(page)) - put_page(page); + if (folio && !xa_is_value(folio)) + folio_put(folio); if (start > end) return end; return start; @@ -2900,21 +2852,20 @@ unlock: #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* - * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock + * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock * @vmf - the vm_fault for this fault. - * @page - the page to lock. + * @folio - the folio to lock. * @fpin - the pointer to the file we may pin (or is already pinned). * - * This works similar to lock_page_or_retry in that it can drop the mmap_lock. - * It differs in that it actually returns the page locked if it returns 1 and 0 - * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin - * will point to the pinned file and needs to be fput()'ed at a later point. + * This works similar to lock_folio_or_retry in that it can drop the + * mmap_lock. It differs in that it actually returns the folio locked + * if it returns 1 and 0 if it couldn't lock the folio. If we did have + * to drop the mmap_lock then fpin will point to the pinned file and + * needs to be fput()'ed at a later point. 
*/ -static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, +static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, struct file **fpin) { - struct folio *folio = page_folio(page); - if (folio_trylock(folio)) return 1; @@ -3003,25 +2954,25 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) * was pinned if we have to drop the mmap_lock in order to do IO. */ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, - struct page *page) + struct folio *folio) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; - struct address_space *mapping = file->f_mapping; + DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; unsigned int mmap_miss; - pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; + mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); - if (PageReadahead(page)) { + + if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_async_readahead(mapping, ra, file, - page, offset, ra->ra_pages); + page_cache_async_ra(&ractl, folio, ra->ra_pages); } return fpin; } @@ -3040,7 +2991,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, * vma->vm_mm->mmap_lock must be held on entry. * * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock - * may be dropped before doing I/O or by lock_page_maybe_drop_mmap(). + * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_lock * has not been released. @@ -3056,28 +3007,27 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - pgoff_t offset = vmf->pgoff; - pgoff_t max_off; - struct page *page; + pgoff_t max_idx, index = vmf->pgoff; + struct folio *folio; vm_fault_t ret = 0; bool mapping_locked = false; - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) + max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; /* * Do we have something in the page cache already? */ - page = find_get_page(mapping, offset); - if (likely(page)) { + folio = filemap_get_folio(mapping, index); + if (likely(folio)) { /* * We found the page, so try async readahead before waiting for * the lock. 
*/ if (!(vmf->flags & FAULT_FLAG_TRIED)) - fpin = do_async_mmap_readahead(vmf, page); - if (unlikely(!PageUptodate(page))) { + fpin = do_async_mmap_readahead(vmf, folio); + if (unlikely(!folio_test_uptodate(folio))) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } @@ -3089,17 +3039,17 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) fpin = do_sync_mmap_readahead(vmf); retry_find: /* - * See comment in filemap_create_page() why we need + * See comment in filemap_create_folio() why we need * invalidate_lock */ if (!mapping_locked) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } - page = pagecache_get_page(mapping, offset, + folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); - if (!page) { + if (!folio) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); @@ -3107,22 +3057,22 @@ retry_find: } } - if (!lock_page_maybe_drop_mmap(vmf, page, &fpin)) + if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) goto out_retry; /* Did it get truncated? */ - if (unlikely(compound_head(page)->mapping != mapping)) { - unlock_page(page); - put_page(page); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + folio_put(folio); goto retry_find; } - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. */ - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { /* * The page was in cache and uptodate and now it is not. * Strange but possible since we didn't hold the page lock all @@ -3130,8 +3080,8 @@ retry_find: * try again. */ if (!mapping_locked) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto retry_find; } goto page_not_uptodate; @@ -3143,7 +3093,7 @@ retry_find: * redo the fault. */ if (fpin) { - unlock_page(page); + folio_unlock(folio); goto out_retry; } if (mapping_locked) @@ -3153,14 +3103,14 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) { - unlock_page(page); - put_page(page); + max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(index >= max_idx)) { + folio_unlock(folio); + folio_put(folio); return VM_FAULT_SIGBUS; } - vmf->page = page; + vmf->page = folio_file_page(folio, index); return ret | VM_FAULT_LOCKED; page_not_uptodate: @@ -3171,10 +3121,10 @@ page_not_uptodate: * and we need to check for errors. */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); - error = filemap_read_page(file, mapping, page); + error = filemap_read_folio(file, mapping, folio); if (fpin) goto out_retry; - put_page(page); + folio_put(folio); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; @@ -3188,8 +3138,8 @@ out_retry: * re-find the vma and come back and find our hopefully still populated * page. 
*/ - if (page) - put_page(page); + if (folio) + folio_put(folio); if (mapping_locked) filemap_invalidate_unlock_shared(mapping); if (fpin) @@ -3231,48 +3181,48 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) return false; } -static struct page *next_uptodate_page(struct page *page, +static struct folio *next_uptodate_page(struct folio *folio, struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { unsigned long max_idx; do { - if (!page) + if (!folio) return NULL; - if (xas_retry(xas, page)) + if (xas_retry(xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - if (PageLocked(page)) + if (folio_test_locked(folio)) continue; - if (!page_cache_get_speculative(page)) + if (!folio_try_get_rcu(folio)) continue; /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(xas))) + if (unlikely(folio != xas_reload(xas))) goto skip; - if (!PageUptodate(page) || PageReadahead(page)) + if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) goto skip; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto skip; - if (page->mapping != mapping) + if (folio->mapping != mapping) goto unlock; - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (xas->xa_index >= max_idx) goto unlock; - return page; + return folio; unlock: - unlock_page(page); + folio_unlock(folio); skip: - put_page(page); - } while ((page = xas_next_entry(xas, end_pgoff)) != NULL); + folio_put(folio); + } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); return NULL; } -static inline struct page *first_map_page(struct address_space *mapping, +static inline struct folio *first_map_page(struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { @@ -3280,7 +3230,7 @@ static inline struct page *first_map_page(struct address_space *mapping, mapping, xas, end_pgoff); } -static inline struct page *next_map_page(struct address_space *mapping, +static inline struct folio *next_map_page(struct address_space *mapping, struct xa_state *xas, pgoff_t end_pgoff) { @@ -3297,16 +3247,17 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *head, *page; + struct folio *folio; + struct page *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); vm_fault_t ret = 0; rcu_read_lock(); - head = first_map_page(mapping, &xas, end_pgoff); - if (!head) + folio = first_map_page(mapping, &xas, end_pgoff); + if (!folio) goto out; - if (filemap_map_pmd(vmf, head)) { + if (filemap_map_pmd(vmf, &folio->page)) { ret = VM_FAULT_NOPAGE; goto out; } @@ -3314,7 +3265,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); do { - page = find_subpage(head, xas.xa_index); +again: + page = folio_file_page(folio, xas.xa_index); if (PageHWPoison(page)) goto unlock; @@ -3335,12 +3287,21 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, do_set_pte(vmf, page, addr); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, addr, vmf->pte); - unlock_page(head); + if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } + folio_unlock(folio); continue; unlock: - unlock_page(head); - put_page(head); - } while ((head = 
next_map_page(mapping, &xas, end_pgoff)) != NULL); + if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { + xas.xa_index++; + goto again; + } + folio_unlock(folio); + folio_put(folio); + } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL); pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); @@ -3352,24 +3313,24 @@ EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } /* - * We mark the page dirty already here so that when freeze is in + * We mark the folio dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will - * see the dirty page and writeprotect it again. + * see the dirty folio and writeprotect it again. */ - set_page_dirty(page); - wait_for_stable_page(page); + folio_mark_dirty(folio); + folio_wait_stable(folio); out: sb_end_pagefault(mapping->host->i_sb); return ret; @@ -3422,35 +3383,20 @@ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); -static struct page *wait_on_page_read(struct page *page) +static struct folio *do_read_cache_folio(struct address_space *mapping, + pgoff_t index, filler_t filler, void *data, gfp_t gfp) { - if (!IS_ERR(page)) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - put_page(page); - page = ERR_PTR(-EIO); - } - } - return page; -} - -static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data, - gfp_t gfp) -{ - struct page *page; + struct folio *folio; int err; repeat: - page = find_get_page(mapping, index); - if (!page) { - page = __page_cache_alloc(gfp); - if (!page) + folio = filemap_get_folio(mapping, index); + if (!folio) { + folio = filemap_alloc_folio(gfp, 0); + if (!folio) return ERR_PTR(-ENOMEM); - err = add_to_page_cache_lru(page, mapping, index, gfp); + err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { - put_page(page); + folio_put(folio); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for xarray node */ @@ -3459,71 +3405,41 @@ repeat: filler: if (filler) - err = filler(data, page); + err = filler(data, &folio->page); else - err = mapping->a_ops->readpage(data, page); + err = mapping->a_ops->readpage(data, &folio->page); if (err < 0) { - put_page(page); + folio_put(folio); return ERR_PTR(err); } - page = wait_on_page_read(page); - if (IS_ERR(page)) - return page; + folio_wait_locked(folio); + if (!folio_test_uptodate(folio)) { + folio_put(folio); + return ERR_PTR(-EIO); + } + goto out; } - if (PageUptodate(page)) - goto out; - - /* - * Page is not up to date and may be locked due to one of the following - * case a: Page is being filled and the page lock is held - * case b: Read/write error clearing the page uptodate status - * case c: Truncation in progress (page locked) - * case d: Reclaim in progress - * - * Case a, the page will be up to date when the page is unlocked. - * There is no need to serialise on the page lock here as the page - * is pinned so the lock gives no additional protection. 
Even if the - * page is truncated, the data is still valid if PageUptodate as - * it's a race vs truncate race. - * Case b, the page will not be up to date - * Case c, the page may be truncated but in itself, the data may still - * be valid after IO completes as it's a read vs truncate race. The - * operation must restart if the page is not uptodate on unlock but - * otherwise serialising on page lock to stabilise the mapping gives - * no additional guarantees to the caller as the page lock is - * released before return. - * Case d, similar to truncation. If reclaim holds the page lock, it - * will be a race with remove_mapping that determines if the mapping - * is valid on unlock but otherwise the data is valid and there is - * no need to serialise with page lock. - * - * As the page lock gives no additional guarantee, we optimistically - * wait on the page to be unlocked and check if it's up to date and - * use the page if it is. Otherwise, the page lock is required to - * distinguish between the different cases. The motivation is that we - * avoid spurious serialisations and wakeups when multiple processes - * wait on the same page for IO to complete. - */ - wait_on_page_locked(page); - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - /* Distinguish between all the cases under the safety of the lock */ - lock_page(page); + if (!folio_trylock(folio)) { + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); + goto repeat; + } - /* Case c or d, restart the operation */ - if (!page->mapping) { - unlock_page(page); - put_page(page); + /* Folio was truncated from mapping */ + if (!folio->mapping) { + folio_unlock(folio); + folio_put(folio); goto repeat; } /* Someone else locked and filled the page in a very small window */ - if (PageUptodate(page)) { - unlock_page(page); + if (folio_test_uptodate(folio)) { + folio_unlock(folio); goto out; } @@ -3533,16 +3449,16 @@ filler: * Clear page error before actual read, PG_error will be * set again if read page fails. */ - ClearPageError(page); + folio_clear_error(folio); goto filler; out: - mark_page_accessed(page); - return page; + folio_mark_accessed(folio); + return folio; } /** - * read_cache_page - read into page cache, fill it if needed + * read_cache_folio - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read @@ -3557,10 +3473,27 @@ out: * * Return: up to date page on success, ERR_PTR() on failure. 
*/ +struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, + filler_t filler, void *data) +{ + return do_read_cache_folio(mapping, index, filler, data, + mapping_gfp_mask(mapping)); +} +EXPORT_SYMBOL(read_cache_folio); + +static struct page *do_read_cache_page(struct address_space *mapping, + pgoff_t index, filler_t *filler, void *data, gfp_t gfp) +{ + struct folio *folio; + + folio = do_read_cache_folio(mapping, index, filler, data, gfp); + if (IS_ERR(folio)) + return &folio->page; + return folio_file_page(folio, index); +} + struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data) + pgoff_t index, filler_t *filler, void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); @@ -3920,33 +3853,32 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) EXPORT_SYMBOL(generic_file_write_iter); /** - * try_to_release_page() - release old fs-specific metadata on a page - * - * @page: the page which the kernel is trying to free - * @gfp_mask: memory allocation flags (and I/O mode) + * filemap_release_folio() - Release fs-specific metadata on a folio. + * @folio: The folio which the kernel is trying to free. + * @gfp: Memory allocation flags (and I/O mode). * - * The address_space is to try to release any data against the page - * (presumably at page->private). + * The address_space is trying to release any data attached to a folio + * (presumably at folio->private). * - * This may also be called if PG_fscache is set on a page, indicating that the - * page is known to the local caching routines. + * This will also be called if the private_2 flag is set on a page, + * indicating that the folio has other metadata associated with it. * - * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). + * The @gfp argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block + * (__GFP_RECLAIM & __GFP_FS). * - * Return: %1 if the release was successful, otherwise return zero. + * Return: %true if the release was successful, otherwise %false. 
*/ -int try_to_release_page(struct page *page, gfp_t gfp_mask) +bool filemap_release_folio(struct folio *folio, gfp_t gfp) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = folio->mapping; - BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) - return 0; + BUG_ON(!folio_test_locked(folio)); + if (folio_test_writeback(folio)) + return false; if (mapping && mapping->a_ops->releasepage) - return mapping->a_ops->releasepage(page, gfp_mask); - return try_to_free_buffers(page); + return mapping->a_ops->releasepage(&folio->page, gfp); + return try_to_free_buffers(&folio->page); } - -EXPORT_SYMBOL(try_to_release_page); +EXPORT_SYMBOL(filemap_release_folio); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 5b6ae1da314e..749555a232a8 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -140,3 +140,14 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(grab_cache_page_write_begin); + +void delete_from_page_cache(struct page *page) +{ + return filemap_remove_folio(page_folio(page)); +} + +int try_to_release_page(struct page *page, gfp_t gfp) +{ + return filemap_release_folio(page_folio(page), gfp); +} +EXPORT_SYMBOL(try_to_release_page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6ed86a8f6a5b..406a3c28c026 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2604,6 +2604,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); struct deferred_split *ds_queue = get_deferred_split_queue(head); + XA_STATE(xas, &head->mapping->i_pages, head->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; @@ -2642,6 +2643,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } + xas_split_alloc(&xas, head, compound_order(head), + mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); + if (xas_error(&xas)) { + ret = xas_error(&xas); + goto out; + } + anon_vma = NULL; i_mmap_lock_read(mapping); @@ -2671,13 +2679,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { - XA_STATE(xas, &mapping->i_pages, page_index(head)); - /* * Check if the head page is present in page cache. * We assume all tail are present too, if head is there. */ - xa_lock(&mapping->i_pages); + xas_lock(&xas); + xas_reset(&xas); if (xas_load(&xas) != head) goto fail; } @@ -2693,6 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) { int nr = thp_nr_pages(head); + xas_split(&xas, head, thp_order(head)); if (PageSwapBacked(head)) { __mod_lruvec_page_state(head, NR_SHMEM_THPS, -nr); @@ -2709,7 +2717,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); local_irq_enable(); remap_page(head, thp_nr_pages(head)); ret = -EBUSY; @@ -2723,6 +2731,8 @@ out_unlock: if (mapping) i_mmap_unlock_read(mapping); out: + /* Free any memory we didn't use */ + xas_nomem(&xas, 0); count_vm_event(!ret ? 
THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; } diff --git a/mm/internal.h b/mm/internal.h index c5834cc28a44..d80300392a19 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -12,6 +12,8 @@ #include <linux/pagemap.h> #include <linux/tracepoint-defs.h> +struct folio_batch; + /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints @@ -74,6 +76,7 @@ static inline bool can_madv_lru_vma(struct vm_area_struct *vma) return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); } +struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, @@ -90,7 +93,13 @@ static inline void force_page_cache_readahead(struct address_space *mapping, } unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, - pgoff_t end, struct pagevec *pvec, pgoff_t *indices); + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +void filemap_free_folio(struct address_space *mapping, struct folio *folio); +int truncate_inode_folio(struct address_space *mapping, struct folio *folio); +bool truncate_inode_partial_folio(struct folio *folio, loff_t start, + loff_t end); /** * folio_evictable - Test whether a folio is evictable. @@ -383,6 +392,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); #ifdef CONFIG_MMU +void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_vma_page_range(struct vm_area_struct *vma, @@ -486,8 +496,8 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } return fpin; } - #else /* !CONFIG_MMU */ +static inline void unmap_mapping_folio(struct folio *folio) { } static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 8428da2aaf17..7c06db78a76c 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -247,8 +247,9 @@ struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, } #endif -void __kasan_poison_slab(struct page *page) +void __kasan_poison_slab(struct slab *slab) { + struct page *page = slab_page(slab); unsigned long i; for (i = 0; i < compound_nr(page); i++) @@ -298,7 +299,7 @@ static inline u8 assign_tag(struct kmem_cache *cache, /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ #ifdef CONFIG_SLAB /* For SLAB assign tags based on the object index in the freelist. 
*/ - return (u8)obj_to_index(cache, virt_to_head_page(object), (void *)object); + return (u8)obj_to_index(cache, virt_to_slab(object), (void *)object); #else /* * For SLUB assign a random tag during slab creation, otherwise reuse @@ -341,7 +342,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, if (is_kfence_address(object)) return false; - if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != + if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != object)) { kasan_report_invalid_free(tagged_object, ip); return true; @@ -401,9 +402,9 @@ void __kasan_kfree_large(void *ptr, unsigned long ip) void __kasan_slab_free_mempool(void *ptr, unsigned long ip) { - struct page *page; + struct folio *folio; - page = virt_to_head_page(ptr); + folio = virt_to_folio(ptr); /* * Even though this function is only called for kmem_cache_alloc and @@ -411,12 +412,14 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) * !PageSlab() when the size provided to kmalloc is larger than * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc. */ - if (unlikely(!PageSlab(page))) { + if (unlikely(!folio_test_slab(folio))) { if (____kasan_kfree_large(ptr, ip)) return; - kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false); + kasan_poison(ptr, folio_size(folio), KASAN_FREE_PAGE, false); } else { - ____kasan_slab_free(page->slab_cache, ptr, ip, false, false); + struct slab *slab = folio_slab(folio); + + ____kasan_slab_free(slab->slab_cache, ptr, ip, false, false); } } @@ -560,7 +563,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags) { - struct page *page; + struct slab *slab; if (unlikely(object == ZERO_SIZE_PTR)) return (void *)object; @@ -572,13 +575,13 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag */ kasan_unpoison(object, size, false); - page = virt_to_head_page(object); + slab = virt_to_slab(object); /* Piggy-back on kmalloc() instrumentation to poison the redzone. 
*/ - if (unlikely(!PageSlab(page))) + if (unlikely(!slab)) return __kasan_kmalloc_large(object, size, flags); else - return ____kasan_kmalloc(page->slab_cache, object, size, flags); + return ____kasan_kmalloc(slab->slab_cache, object, size, flags); } bool __kasan_check_byte(const void *address, unsigned long ip) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 84a038b07c6f..a25ad4090615 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -330,16 +330,16 @@ DEFINE_ASAN_SET_SHADOW(f8); static void __kasan_record_aux_stack(void *addr, bool can_alloc) { - struct page *page = kasan_addr_to_page(addr); + struct slab *slab = kasan_addr_to_slab(addr); struct kmem_cache *cache; struct kasan_alloc_meta *alloc_meta; void *object; - if (is_kfence_address(addr) || !(page && PageSlab(page))) + if (is_kfence_address(addr) || !slab) return; - cache = page->slab_cache; - object = nearest_obj(cache, page, addr); + cache = slab->slab_cache; + object = nearest_obj(cache, slab, addr); alloc_meta = kasan_get_alloc_meta(cache, object); if (!alloc_meta) return; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index aebd8df86a1f..c17fa8d26ffe 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -265,6 +265,7 @@ bool kasan_report(unsigned long addr, size_t size, void kasan_report_invalid_free(void *object, unsigned long ip); struct page *kasan_addr_to_page(const void *addr); +struct slab *kasan_addr_to_slab(const void *addr); depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 47ed4fc33a29..08291ed33e93 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -117,7 +117,7 @@ static unsigned long quarantine_batch_size; static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) { - return virt_to_head_page(qlink)->slab_cache; + return virt_to_slab(qlink)->slab_cache; } static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 0bc10f452f7e..3ad9624dcc56 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -150,6 +150,14 @@ struct page *kasan_addr_to_page(const void *addr) return NULL; } +struct slab *kasan_addr_to_slab(const void *addr) +{ + if ((addr >= (void *)PAGE_OFFSET) && + (addr < high_memory)) + return virt_to_slab(addr); + return NULL; +} + static void describe_object_addr(struct kmem_cache *cache, void *object, const void *addr) { @@ -248,8 +256,9 @@ static void print_address_description(void *addr, u8 tag) pr_err("\n"); if (page && PageSlab(page)) { - struct kmem_cache *cache = page->slab_cache; - void *object = nearest_obj(cache, page, addr); + struct slab *slab = page_slab(page); + struct kmem_cache *cache = slab->slab_cache; + void *object = nearest_obj(cache, slab, addr); describe_object(cache, object, addr, tag); } diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 8a319fc16dab..1b41de88c53e 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -12,7 +12,7 @@ const char *kasan_get_bug_type(struct kasan_access_info *info) #ifdef CONFIG_KASAN_TAGS_IDENTIFY struct kasan_alloc_meta *alloc_meta; struct kmem_cache *cache; - struct page *page; + struct slab *slab; const void *addr; void *object; u8 tag; @@ -20,10 +20,10 @@ const char *kasan_get_bug_type(struct kasan_access_info *info) tag = get_tag(info->access_addr); addr = kasan_reset_tag(info->access_addr); - page = kasan_addr_to_page(addr); - if (page && 
PageSlab(page)) { - cache = page->slab_cache; - object = nearest_obj(cache, page, (void *)addr); + slab = kasan_addr_to_slab(addr); + if (slab) { + cache = slab->slab_cache; + object = nearest_obj(cache, slab, (void *)addr); alloc_meta = kasan_get_alloc_meta(cache, object); if (alloc_meta) { diff --git a/mm/kfence/core.c b/mm/kfence/core.c index a19154a8d196..5ad40e3add45 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -360,7 +360,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g { struct kfence_metadata *meta = NULL; unsigned long flags; - struct page *page; + struct slab *slab; void *addr; /* Try to obtain a free object. */ @@ -424,13 +424,14 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g alloc_covered_add(alloc_stack_hash, 1); - /* Set required struct page fields. */ - page = virt_to_page(meta->addr); - page->slab_cache = cache; - if (IS_ENABLED(CONFIG_SLUB)) - page->objects = 1; - if (IS_ENABLED(CONFIG_SLAB)) - page->s_mem = addr; + /* Set required slab fields. */ + slab = virt_to_slab((void *)meta->addr); + slab->slab_cache = cache; +#if defined(CONFIG_SLUB) + slab->objects = 1; +#elif defined(CONFIG_SLAB) + slab->s_mem = addr; +#endif /* Memory initialization. */ for_each_canary(meta, set_canary_byte); diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 695030c1fff8..a22b1af85577 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -282,7 +282,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat alloc = kmalloc(size, gfp); if (is_kfence_address(alloc)) { - struct page *page = virt_to_head_page(alloc); + struct slab *slab = virt_to_slab(alloc); struct kmem_cache *s = test_cache ?: kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]; @@ -291,8 +291,8 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat * even for KFENCE objects; these are required so that * memcg accounting works correctly. */ - KUNIT_EXPECT_EQ(test, obj_to_index(s, page, alloc), 0U); - KUNIT_EXPECT_EQ(test, objs_per_slab_page(s, page), 1); + KUNIT_EXPECT_EQ(test, obj_to_index(s, slab, alloc), 0U); + KUNIT_EXPECT_EQ(test, objs_per_slab(s, slab), 1); if (policy == ALLOCATE_ANY) return alloc; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7af84bac6fc2..35f14d0a00a6 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1664,7 +1664,10 @@ static void collapse_file(struct mm_struct *mm, } count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); - /* This will be less messy when we use multi-index entries */ + /* + * Ensure we have slots for all the pages in the range. 
This is + * almost certainly a no-op because most of the pages must be present + */ do { xas_lock_irq(&xas); xas_create_range(&xas); @@ -1889,6 +1892,9 @@ out_unlock: __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); } + /* Join all the small entries into a single multi-index entry */ + xas_set_order(&xas, start, HPAGE_PMD_ORDER); + xas_store(&xas, new_page); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -2011,6 +2017,10 @@ static void khugepaged_scan_file(struct mm_struct *mm, continue; } + /* + * XXX: khugepaged should compact smaller compound pages + * into a PMD sized page + */ if (PageTransCompound(page)) { result = SCAN_PAGE_COMPOUND; break; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c9ddd02dc5de..09d342c7cbd0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2823,31 +2823,31 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, rcu_read_unlock(); } -int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp, bool new_page) +int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) { - unsigned int objects = objs_per_slab_page(s, page); + unsigned int objects = objs_per_slab(s, slab); unsigned long memcg_data; void *vec; gfp &= ~OBJCGS_CLEAR_MASK; vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, - page_to_nid(page)); + slab_nid(slab)); if (!vec) return -ENOMEM; memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; - if (new_page) { + if (new_slab) { /* - * If the slab page is brand new and nobody can yet access - * it's memcg_data, no synchronization is required and - * memcg_data can be simply assigned. + * If the slab is brand new and nobody can yet access its + * memcg_data, no synchronization is required and memcg_data can + * be simply assigned. */ - page->memcg_data = memcg_data; - } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) { + slab->memcg_data = memcg_data; + } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { /* - * If the slab page is already in use, somebody can allocate - * and assign obj_cgroups in parallel. In this case the existing + * If the slab is already in use, somebody can allocate and + * assign obj_cgroups in parallel. In this case the existing * objcg vector should be reused. */ kfree(vec); @@ -2872,38 +2872,43 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, */ struct mem_cgroup *mem_cgroup_from_obj(void *p) { - struct page *page; + struct folio *folio; if (mem_cgroup_disabled()) return NULL; - page = virt_to_head_page(p); + folio = virt_to_folio(p); /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in - * the page->obj_cgroups. + * slab->memcg_data. */ - if (page_objcgs_check(page)) { - struct obj_cgroup *objcg; + if (folio_test_slab(folio)) { + struct obj_cgroup **objcgs; + struct slab *slab; unsigned int off; - off = obj_to_index(page->slab_cache, page, p); - objcg = page_objcgs(page)[off]; - if (objcg) - return obj_cgroup_memcg(objcg); + slab = folio_slab(folio); + objcgs = slab_objcgs(slab); + if (!objcgs) + return NULL; + + off = obj_to_index(slab->slab_cache, slab, p); + if (objcgs[off]) + return obj_cgroup_memcg(objcgs[off]); return NULL; } /* - * page_memcg_check() is used here, because page_has_obj_cgroups() - * check above could fail because the object cgroups vector wasn't set - * at that moment, but it can be set concurrently. 
+ * page_memcg_check() is used here, because in theory we can encounter + * a folio where the slab flag has been cleared already, but + * slab->memcg_data has not been freed yet * page_memcg_check(page) will guarantee that a proper memory * cgroup pointer or NULL will be returned. */ - return page_memcg_check(page); + return page_memcg_check(folio_page(folio, 0)); } __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 373837bb94cb..14ae5c18e776 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1690,21 +1690,28 @@ int memory_failure(unsigned long pfn, int flags) if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); + mutex_lock(&mf_mutex); + p = pfn_to_online_page(pfn); if (!p) { + res = arch_memory_failure(pfn, flags); + if (res == 0) + goto unlock_mutex; + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn, NULL); - if (pgmap) - return memory_failure_dev_pagemap(pfn, flags, - pgmap); + if (pgmap) { + res = memory_failure_dev_pagemap(pfn, flags, + pgmap); + goto unlock_mutex; + } } pr_err("Memory failure: %#lx: memory outside kernel control\n", pfn); - return -ENXIO; + res = -ENXIO; + goto unlock_mutex; } - mutex_lock(&mf_mutex); - try_again: if (PageHuge(p)) { res = memory_failure_hugetlb(pfn, flags); diff --git a/mm/memory.c b/mm/memory.c index 571d02f419ba..f306e698a1e3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1305,6 +1305,28 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return ret; } +/* + * Parameter block passed down to zap_pte_range in exceptional cases. + */ +struct zap_details { + struct address_space *zap_mapping; /* Check page->mapping if set */ + struct folio *single_folio; /* Locked folio to be unmapped */ +}; + +/* + * We set details->zap_mapping when we want to unmap shared but keep private + * pages. Return true if skip zapping this page, false otherwise. + */ +static inline bool +zap_skip_check_mapping(struct zap_details *details, struct page *page) +{ + if (!details || !page) + return false; + + return details->zap_mapping && + (details->zap_mapping != page_rmapping(page)); +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1444,8 +1466,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ - } else if (details && details->single_page && - PageTransCompound(details->single_page) && + } else if (details && details->single_folio && + folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { spinlock_t *ptl = pmd_lock(tlb->mm, pmd); /* @@ -3333,31 +3355,30 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, } /** - * unmap_mapping_page() - Unmap single page from processes. - * @page: The locked page to be unmapped. + * unmap_mapping_folio() - Unmap single folio from processes. + * @folio: The locked folio to be unmapped. * - * Unmap this page from any userspace process which still has it mmaped. + * Unmap this folio from any userspace process which still has it mmaped. * Typically, for efficiency, the range of nearby pages has already been * unmapped by unmap_mapping_pages() or unmap_mapping_range(). 
But once - * truncation or invalidation holds the lock on a page, it may find that - * the page has been remapped again: and then uses unmap_mapping_page() + * truncation or invalidation holds the lock on a folio, it may find that + * the page has been remapped again: and then uses unmap_mapping_folio() * to unmap it finally. */ -void unmap_mapping_page(struct page *page) +void unmap_mapping_folio(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct zap_details details = { }; pgoff_t first_index; pgoff_t last_index; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageTail(page)); + VM_BUG_ON(!folio_test_locked(folio)); - first_index = page->index; - last_index = page->index + thp_nr_pages(page) - 1; + first_index = folio->index; + last_index = folio->index + folio_nr_pages(folio) - 1; details.zap_mapping = mapping; - details.single_page = page; + details.single_folio = folio; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) diff --git a/mm/memremap.c b/mm/memremap.c index a2869d8519a2..6aa5f0c2d11f 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -119,30 +119,6 @@ static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ pfn = pfn_next(map, pfn)) -static void dev_pagemap_kill(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->kill) - pgmap->ops->kill(pgmap); - else - percpu_ref_kill(pgmap->ref); -} - -static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->cleanup) { - pgmap->ops->cleanup(pgmap); - } else { - wait_for_completion(&pgmap->done); - percpu_ref_exit(pgmap->ref); - } - /* - * Undo the pgmap ref assignment for the internal case as the - * caller may re-enable the same pgmap. 
- */ - if (pgmap->ref == &pgmap->internal_ref) - pgmap->ref = NULL; -} - static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { struct range *range = &pgmap->ranges[range_id]; @@ -174,11 +150,12 @@ void memunmap_pages(struct dev_pagemap *pgmap) unsigned long pfn; int i; - dev_pagemap_kill(pgmap); + percpu_ref_kill(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) for_each_device_pfn(pfn, pgmap, i) put_page(pfn_to_page(pfn)); - dev_pagemap_cleanup(pgmap); + wait_for_completion(&pgmap->done); + percpu_ref_exit(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) pageunmap_range(pgmap, i); @@ -195,8 +172,7 @@ static void devm_memremap_pages_release(void *data) static void dev_pagemap_percpu_release(struct percpu_ref *ref) { - struct dev_pagemap *pgmap = - container_of(ref, struct dev_pagemap, internal_ref); + struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref); complete(&pgmap->done); } @@ -302,7 +278,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(pgmap->ref, pfn_len(pgmap, range_id)); + percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); return 0; err_add_memory: @@ -368,22 +344,11 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) break; } - if (!pgmap->ref) { - if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) - return ERR_PTR(-EINVAL); - - init_completion(&pgmap->done); - error = percpu_ref_init(&pgmap->internal_ref, - dev_pagemap_percpu_release, 0, GFP_KERNEL); - if (error) - return ERR_PTR(error); - pgmap->ref = &pgmap->internal_ref; - } else { - if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); - } - } + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0, + GFP_KERNEL); + if (error) + return ERR_PTR(error); devmap_managed_enable_get(pgmap); @@ -492,7 +457,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, /* fall back to slow path lookup */ rcu_read_lock(); pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); - if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + if (pgmap && !percpu_ref_tryget_live(&pgmap->ref)) pgmap = NULL; rcu_read_unlock(); diff --git a/mm/migrate.c b/mm/migrate.c index 05af2b2336b9..18ce840914f0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -291,7 +291,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, { pte_t pte; swp_entry_t entry; - struct page *page; + struct folio *folio; spin_lock(ptl); pte = *ptep; @@ -302,18 +302,17 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!is_migration_entry(entry)) goto out; - page = pfn_swap_entry_to_page(entry); - page = compound_head(page); + folio = page_folio(pfn_swap_entry_to_page(entry)); /* * Once page cache replacement of page migration started, page_count - * is zero; but we must not call put_and_wait_on_page_locked() without - * a ref. Use get_page_unless_zero(), and just fault again if it fails. + * is zero; but we must not call folio_put_wait_locked() without + * a ref. Use folio_try_get(), and just fault again if it fails. 
*/ - if (!get_page_unless_zero(page)) + if (!folio_try_get(folio)) goto out; pte_unmap_unlock(ptep, ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); return; out: pte_unmap_unlock(ptep, ptl); @@ -338,16 +337,16 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - struct page *page; + struct folio *folio; ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd)); - if (!get_page_unless_zero(page)) + folio = page_folio(pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd))); + if (!folio_try_get(folio)) goto unlock; spin_unlock(ptl); - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); return; unlock: spin_unlock(ptl); @@ -434,14 +433,6 @@ int folio_migrate_mapping(struct address_space *mapping, } xas_store(&xas, newfolio); - if (nr > 1) { - int i; - - for (i = 1; i < nr; i++) { - xas_next(&xas); - xas_store(&xas, newfolio); - } - } /* * Drop cache reference from old page by unfreezing diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a613f8ef6a02..91d163f8d36b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2496,7 +2496,11 @@ void folio_account_cleaned(struct folio *folio, struct address_space *mapping, * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * - * The caller must hold lock_page_memcg(). + * The caller must hold lock_page_memcg(). Most callers have the folio + * locked. A few have the folio blocked from truncation through other + * means (eg zap_page_range() has it mapped and is holding the page table + * lock). This can also be called from mark_buffer_dirty(), which I + * cannot prove is always protected against truncate. */ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) diff --git a/mm/readahead.c b/mm/readahead.c index 6ae5693de28c..cf0dcf89eb69 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -196,9 +196,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * Preallocate as many pages as we will need. */ for (i = 0; i < nr_to_read; i++) { - struct page *page = xa_load(&mapping->i_pages, index + i); + struct folio *folio = xa_load(&mapping->i_pages, index + i); - if (page && !xa_is_value(page)) { + if (folio && !xa_is_value(folio)) { /* * Page already present? 
Kick off the current batch * of contiguous pages before continuing with the @@ -212,21 +212,21 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, continue; } - page = __page_cache_alloc(gfp_mask); - if (!page) + folio = filemap_alloc_folio(gfp_mask, 0); + if (!folio) break; if (mapping->a_ops->readpages) { - page->index = index + i; - list_add(&page->lru, &page_pool); - } else if (add_to_page_cache_lru(page, mapping, index + i, + folio->index = index + i; + list_add(&folio->lru, &page_pool); + } else if (filemap_add_folio(mapping, folio, index + i, gfp_mask) < 0) { - put_page(page); + folio_put(folio); read_pages(ractl, &page_pool, true); i = ractl->_index + ractl->_nr_pages - index - 1; continue; } if (i == nr_to_read - lookahead_size) - SetPageReadahead(page); + folio_set_readahead(folio); ractl->_nr_pages++; } @@ -581,7 +581,7 @@ void page_cache_sync_ra(struct readahead_control *ractl, EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, - struct page *page, unsigned long req_count) + struct folio *folio, unsigned long req_count) { /* no read-ahead */ if (!ractl->ra->ra_pages) @@ -590,10 +590,10 @@ void page_cache_async_ra(struct readahead_control *ractl, /* * Same bit is used for PG_readahead and PG_reclaim. */ - if (PageWriteback(page)) + if (folio_test_writeback(folio)) return; - ClearPageReadahead(page); + folio_clear_readahead(folio); /* * Defer asynchronous read-ahead on IO congestion. diff --git a/mm/shmem.c b/mm/shmem.c index 0700e9acf53b..66909efd0a1b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -699,7 +699,6 @@ static int shmem_add_to_page_cache(struct page *page, struct mm_struct *charge_mm) { XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); - unsigned long i = 0; unsigned long nr = compound_nr(page); int error; @@ -726,20 +725,18 @@ static int shmem_add_to_page_cache(struct page *page, cgroup_throttle_swaprate(page, gfp); do { - void *entry; xas_lock_irq(&xas); - entry = xas_find_conflict(&xas); - if (entry != expected) + if (expected != xas_find_conflict(&xas)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + if (expected && xas_find_conflict(&xas)) { xas_set_err(&xas, -EEXIST); - xas_create_range(&xas); - if (xas_error(&xas)) goto unlock; -next: - xas_store(&xas, page); - if (++i < nr) { - xas_next(&xas); - goto next; } + xas_store(&xas, page); + if (xas_error(&xas)) + goto unlock; if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr); @@ -885,30 +882,26 @@ void shmem_unlock_mapping(struct address_space *mapping) } } -/* - * Check whether a hole-punch or truncation needs to split a huge page, - * returning true if no split was required, or the split has been successful. - * - * Eviction (or truncation to 0 size) should never need to split a huge page; - * but in rare cases might do so, if shmem_undo_range() failed to trylock on - * head, and then succeeded to trylock on tail. - * - * A split can only succeed when there are no additional references on the - * huge page: so the split below relies upon find_get_entries() having stopped - * when it found a subpage of the huge page, without getting further references. 
- */ -static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end) +static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { - if (!PageTransCompound(page)) - return true; - - /* Just proceed to delete a huge page wholly within the range punched */ - if (PageHead(page) && - page->index >= start && page->index + HPAGE_PMD_NR <= end) - return true; + struct folio *folio; + struct page *page; - /* Try to split huge page, so we can truly punch the hole or truncate */ - return split_huge_page(page) >= 0; + /* + * At first avoid shmem_getpage(,,,SGP_READ): that fails + * beyond i_size, and reports fallocated pages as holes. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_ENTRY | FGP_LOCK, 0); + if (!xa_is_value(folio)) + return folio; + /* + * But read a page back from swap if any of it is within i_size + * (although in some cases this is just a waste of time). + */ + page = NULL; + shmem_getpage(inode, index, &page, SGP_READ); + return page ? page_folio(page) : NULL; } /* @@ -922,10 +915,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; - unsigned int partial_start = lstart & (PAGE_SIZE - 1); - unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1); - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; + struct folio *folio; + bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; @@ -936,67 +929,64 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (info->fallocend > start && info->fallocend <= end && !unfalloc) info->fallocend = start; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, - &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + index, folio); continue; } - index += thp_nr_pages(page) - 1; + index += folio_nr_pages(folio) - 1; - if (!unfalloc || !PageUptodate(page)) - truncate_inode_page(mapping, page); - unlock_page(page); + if (!unfalloc || !folio_test_uptodate(folio)) + truncate_inode_folio(mapping, folio); + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } - if (partial_start) { - struct page *page = NULL; - shmem_getpage(inode, start - 1, &page, SGP_READ); - if (page) { - unsigned int top = PAGE_SIZE; - if (start > end) { - top = partial_end; - partial_end = 0; - } - zero_user_segment(page, partial_start, top); - set_page_dirty(page); - unlock_page(page); - put_page(page); + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page 
*page = NULL; - shmem_getpage(inode, end, &page, SGP_READ); - if (page) { - zero_user_segment(page, 0, partial_end); - set_page_dirty(page); - unlock_page(page); - put_page(page); - } + + if (!same_folio) + folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); + if (folio) { + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - if (start >= end) - return; index = start; while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &pvec, + if (!find_get_entries(mapping, index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@ -1005,14 +995,14 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index = start; continue; } - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; - if (shmem_free_swap(mapping, index, page)) { + if (shmem_free_swap(mapping, index, folio)) { /* Swap was replaced by page: retry */ index--; break; @@ -1021,32 +1011,24 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, continue; } - lock_page(page); + folio_lock(folio); - if (!unfalloc || !PageUptodate(page)) { - if (page_mapping(page) != mapping) { + if (!unfalloc || !folio_test_uptodate(folio)) { + if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ - unlock_page(page); + folio_unlock(folio); index--; break; } - VM_BUG_ON_PAGE(PageWriteback(page), page); - if (shmem_punch_compound(page, start, end)) - truncate_inode_page(mapping, page); - else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - /* Wipe the page and don't get stuck */ - clear_highpage(page); - flush_dcache_page(page); - set_page_dirty(page); - if (index < - round_up(start, HPAGE_PMD_NR)) - start = index + 1; - } + VM_BUG_ON_FOLIO(folio_test_writeback(folio), + folio); + truncate_inode_folio(mapping, folio); } - unlock_page(page); + index = folio->index + folio_nr_pages(folio) - 1; + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); index++; } diff --git a/mm/slab.c b/mm/slab.c index ca4822f6b2b6..ddf5737c63d9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -218,7 +218,7 @@ static void cache_reap(struct work_struct *unused); static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, void **list); static inline void fixup_slab_list(struct kmem_cache *cachep, - struct kmem_cache_node *n, struct page *page, + struct kmem_cache_node *n, struct slab *slab, void **list); static int slab_early_init = 1; @@ -372,10 +372,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) static int slab_max_order = SLAB_MAX_ORDER_LO; static bool slab_max_order_set __initdata; -static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, - unsigned int idx) +static inline void *index_to_obj(struct kmem_cache *cache, + const struct slab *slab, unsigned int idx) { - return page->s_mem + cache->size * idx; + return slab->s_mem + cache->size * idx; } #define BOOT_CPUCACHE_ENTRIES 1 @@ -550,17 +550,17 @@ static struct array_cache *alloc_arraycache(int node, int entries, } static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, - 
struct page *page, void *objp) + struct slab *slab, void *objp) { struct kmem_cache_node *n; - int page_node; + int slab_node; LIST_HEAD(list); - page_node = page_to_nid(page); - n = get_node(cachep, page_node); + slab_node = slab_nid(slab); + n = get_node(cachep, slab_node); spin_lock(&n->list_lock); - free_block(cachep, &objp, 1, page_node, &list); + free_block(cachep, &objp, 1, slab_node, &list); spin_unlock(&n->list_lock); slabs_destroy(cachep, &list); @@ -761,7 +761,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, } static int __cache_free_alien(struct kmem_cache *cachep, void *objp, - int node, int page_node) + int node, int slab_node) { struct kmem_cache_node *n; struct alien_cache *alien = NULL; @@ -770,21 +770,21 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, n = get_node(cachep, node); STATS_INC_NODEFREES(cachep); - if (n->alien && n->alien[page_node]) { - alien = n->alien[page_node]; + if (n->alien && n->alien[slab_node]) { + alien = n->alien[slab_node]; ac = &alien->ac; spin_lock(&alien->lock); if (unlikely(ac->avail == ac->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, ac, page_node, &list); + __drain_alien_cache(cachep, ac, slab_node, &list); } __free_one(ac, objp); spin_unlock(&alien->lock); slabs_destroy(cachep, &list); } else { - n = get_node(cachep, page_node); + n = get_node(cachep, slab_node); spin_lock(&n->list_lock); - free_block(cachep, &objp, 1, page_node, &list); + free_block(cachep, &objp, 1, slab_node, &list); spin_unlock(&n->list_lock); slabs_destroy(cachep, &list); } @@ -793,16 +793,16 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { - int page_node = page_to_nid(virt_to_page(objp)); + int slab_node = slab_nid(virt_to_slab(objp)); int node = numa_mem_id(); /* * Make sure we are not freeing a object from another node to the array * cache on this cpu. */ - if (likely(node == page_node)) + if (likely(node == slab_node)) return 0; - return __cache_free_alien(cachep, objp, node, page_node); + return __cache_free_alien(cachep, objp, node, slab_node); } /* @@ -1367,57 +1367,60 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) * did not request dmaable memory, we might get it, but that * would be relatively rare and ignorable. */ -static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, +static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - struct page *page; + struct folio *folio; + struct slab *slab; flags |= cachep->allocflags; - page = __alloc_pages_node(nodeid, flags, cachep->gfporder); - if (!page) { + folio = (struct folio *) __alloc_pages_node(nodeid, flags, cachep->gfporder); + if (!folio) { slab_out_of_memory(cachep, flags, nodeid); return NULL; } - account_slab_page(page, cachep->gfporder, cachep, flags); - __SetPageSlab(page); + slab = folio_slab(folio); + + account_slab(slab, cachep->gfporder, cachep, flags); + __folio_set_slab(folio); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (sk_memalloc_socks() && page_is_pfmemalloc(page)) - SetPageSlabPfmemalloc(page); + if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0))) + slab_set_pfmemalloc(slab); - return page; + return slab; } /* * Interface to system's page release. 
*/ -static void kmem_freepages(struct kmem_cache *cachep, struct page *page) +static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab) { int order = cachep->gfporder; + struct folio *folio = slab_folio(slab); - BUG_ON(!PageSlab(page)); - __ClearPageSlabPfmemalloc(page); - __ClearPageSlab(page); - page_mapcount_reset(page); - /* In union with page->mapping where page allocator expects NULL */ - page->slab_cache = NULL; + BUG_ON(!folio_test_slab(folio)); + __slab_clear_pfmemalloc(slab); + __folio_clear_slab(folio); + page_mapcount_reset(folio_page(folio, 0)); + folio->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += 1 << order; - unaccount_slab_page(page, order, cachep); - __free_pages(page, order); + unaccount_slab(slab, order, cachep); + __free_pages(folio_page(folio, 0), order); } static void kmem_rcu_free(struct rcu_head *head) { struct kmem_cache *cachep; - struct page *page; + struct slab *slab; - page = container_of(head, struct page, rcu_head); - cachep = page->slab_cache; + slab = container_of(head, struct slab, rcu_head); + cachep = slab->slab_cache; - kmem_freepages(cachep, page); + kmem_freepages(cachep, slab); } #if DEBUG @@ -1553,18 +1556,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print some data about the neighboring objects, if they * exist: */ - struct page *page = virt_to_head_page(objp); + struct slab *slab = virt_to_slab(objp); unsigned int objnr; - objnr = obj_to_index(cachep, page, objp); + objnr = obj_to_index(cachep, slab, objp); if (objnr) { - objp = index_to_obj(cachep, page, objnr - 1); + objp = index_to_obj(cachep, slab, objnr - 1); realobj = (char *)objp + obj_offset(cachep); pr_err("Prev obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - objp = index_to_obj(cachep, page, objnr + 1); + objp = index_to_obj(cachep, slab, objnr + 1); realobj = (char *)objp + obj_offset(cachep); pr_err("Next obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); @@ -1575,17 +1578,17 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #if DEBUG static void slab_destroy_debugcheck(struct kmem_cache *cachep, - struct page *page) + struct slab *slab) { int i; if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { - poison_obj(cachep, page->freelist - obj_offset(cachep), + poison_obj(cachep, slab->freelist - obj_offset(cachep), POISON_FREE); } for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, page, i); + void *objp = index_to_obj(cachep, slab, i); if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); @@ -1601,7 +1604,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, } #else static void slab_destroy_debugcheck(struct kmem_cache *cachep, - struct page *page) + struct slab *slab) { } #endif @@ -1609,22 +1612,22 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, /** * slab_destroy - destroy and release all objects in a slab * @cachep: cache pointer being destroyed - * @page: page pointer being destroyed + * @slab: slab being destroyed * - * Destroy all the objs in a slab page, and release the mem back to the system. - * Before calling the slab page must have been unlinked from the cache. The + * Destroy all the objs in a slab, and release the mem back to the system. + * Before calling the slab must have been unlinked from the cache. The * kmem_cache_node ->list_lock is not held/needed. 
*/ -static void slab_destroy(struct kmem_cache *cachep, struct page *page) +static void slab_destroy(struct kmem_cache *cachep, struct slab *slab) { void *freelist; - freelist = page->freelist; - slab_destroy_debugcheck(cachep, page); + freelist = slab->freelist; + slab_destroy_debugcheck(cachep, slab); if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) - call_rcu(&page->rcu_head, kmem_rcu_free); + call_rcu(&slab->rcu_head, kmem_rcu_free); else - kmem_freepages(cachep, page); + kmem_freepages(cachep, slab); /* * From now on, we don't use freelist @@ -1640,11 +1643,11 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) */ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) { - struct page *page, *n; + struct slab *slab, *n; - list_for_each_entry_safe(page, n, list, slab_list) { - list_del(&page->slab_list); - slab_destroy(cachep, page); + list_for_each_entry_safe(slab, n, list, slab_list) { + list_del(&slab->slab_list); + slab_destroy(cachep, slab); } } @@ -2194,7 +2197,7 @@ static int drain_freelist(struct kmem_cache *cache, { struct list_head *p; int nr_freed; - struct page *page; + struct slab *slab; nr_freed = 0; while (nr_freed < tofree && !list_empty(&n->slabs_free)) { @@ -2206,8 +2209,8 @@ static int drain_freelist(struct kmem_cache *cache, goto out; } - page = list_entry(p, struct page, slab_list); - list_del(&page->slab_list); + slab = list_entry(p, struct slab, slab_list); + list_del(&slab->slab_list); n->free_slabs--; n->total_slabs--; /* @@ -2216,7 +2219,7 @@ static int drain_freelist(struct kmem_cache *cache, */ n->free_objects -= cache->num; spin_unlock_irq(&n->list_lock); - slab_destroy(cache, page); + slab_destroy(cache, slab); nr_freed++; } out: @@ -2291,14 +2294,14 @@ void __kmem_cache_release(struct kmem_cache *cachep) * which are all initialized during kmem_cache_init(). 
*/ static void *alloc_slabmgmt(struct kmem_cache *cachep, - struct page *page, int colour_off, + struct slab *slab, int colour_off, gfp_t local_flags, int nodeid) { void *freelist; - void *addr = page_address(page); + void *addr = slab_address(slab); - page->s_mem = addr + colour_off; - page->active = 0; + slab->s_mem = addr + colour_off; + slab->active = 0; if (OBJFREELIST_SLAB(cachep)) freelist = NULL; @@ -2315,24 +2318,24 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, return freelist; } -static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) +static inline freelist_idx_t get_free_obj(struct slab *slab, unsigned int idx) { - return ((freelist_idx_t *)page->freelist)[idx]; + return ((freelist_idx_t *) slab->freelist)[idx]; } -static inline void set_free_obj(struct page *page, +static inline void set_free_obj(struct slab *slab, unsigned int idx, freelist_idx_t val) { - ((freelist_idx_t *)(page->freelist))[idx] = val; + ((freelist_idx_t *)(slab->freelist))[idx] = val; } -static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) +static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab) { #if DEBUG int i; for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, page, i); + void *objp = index_to_obj(cachep, slab, i); if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = NULL; @@ -2416,17 +2419,17 @@ static freelist_idx_t next_random_slot(union freelist_init_state *state) } /* Swap two freelist entries */ -static void swap_free_obj(struct page *page, unsigned int a, unsigned int b) +static void swap_free_obj(struct slab *slab, unsigned int a, unsigned int b) { - swap(((freelist_idx_t *)page->freelist)[a], - ((freelist_idx_t *)page->freelist)[b]); + swap(((freelist_idx_t *) slab->freelist)[a], + ((freelist_idx_t *) slab->freelist)[b]); } /* * Shuffle the freelist initialization state based on pre-computed lists. * return true if the list was successfully shuffled, false otherwise. 
*/ -static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) +static bool shuffle_freelist(struct kmem_cache *cachep, struct slab *slab) { unsigned int objfreelist = 0, i, rand, count = cachep->num; union freelist_init_state state; @@ -2443,7 +2446,7 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) objfreelist = count - 1; else objfreelist = next_random_slot(&state); - page->freelist = index_to_obj(cachep, page, objfreelist) + + slab->freelist = index_to_obj(cachep, slab, objfreelist) + obj_offset(cachep); count--; } @@ -2454,51 +2457,51 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) */ if (!precomputed) { for (i = 0; i < count; i++) - set_free_obj(page, i, i); + set_free_obj(slab, i, i); /* Fisher-Yates shuffle */ for (i = count - 1; i > 0; i--) { rand = prandom_u32_state(&state.rnd_state); rand %= (i + 1); - swap_free_obj(page, i, rand); + swap_free_obj(slab, i, rand); } } else { for (i = 0; i < count; i++) - set_free_obj(page, i, next_random_slot(&state)); + set_free_obj(slab, i, next_random_slot(&state)); } if (OBJFREELIST_SLAB(cachep)) - set_free_obj(page, cachep->num - 1, objfreelist); + set_free_obj(slab, cachep->num - 1, objfreelist); return true; } #else static inline bool shuffle_freelist(struct kmem_cache *cachep, - struct page *page) + struct slab *slab) { return false; } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ static void cache_init_objs(struct kmem_cache *cachep, - struct page *page) + struct slab *slab) { int i; void *objp; bool shuffled; - cache_init_objs_debug(cachep, page); + cache_init_objs_debug(cachep, slab); /* Try to randomize the freelist if enabled */ - shuffled = shuffle_freelist(cachep, page); + shuffled = shuffle_freelist(cachep, slab); if (!shuffled && OBJFREELIST_SLAB(cachep)) { - page->freelist = index_to_obj(cachep, page, cachep->num - 1) + + slab->freelist = index_to_obj(cachep, slab, cachep->num - 1) + obj_offset(cachep); } for (i = 0; i < cachep->num; i++) { - objp = index_to_obj(cachep, page, i); + objp = index_to_obj(cachep, slab, i); objp = kasan_init_slab_obj(cachep, objp); /* constructor could break poison info */ @@ -2509,68 +2512,56 @@ static void cache_init_objs(struct kmem_cache *cachep, } if (!shuffled) - set_free_obj(page, i, i); + set_free_obj(slab, i, i); } } -static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slab) { void *objp; - objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); - page->active++; + objp = index_to_obj(cachep, slab, get_free_obj(slab, slab->active)); + slab->active++; return objp; } static void slab_put_obj(struct kmem_cache *cachep, - struct page *page, void *objp) + struct slab *slab, void *objp) { - unsigned int objnr = obj_to_index(cachep, page, objp); + unsigned int objnr = obj_to_index(cachep, slab, objp); #if DEBUG unsigned int i; /* Verify double free bug */ - for (i = page->active; i < cachep->num; i++) { - if (get_free_obj(page, i) == objnr) { + for (i = slab->active; i < cachep->num; i++) { + if (get_free_obj(slab, i) == objnr) { pr_err("slab: double free detected in cache '%s', objp %px\n", cachep->name, objp); BUG(); } } #endif - page->active--; - if (!page->freelist) - page->freelist = objp + obj_offset(cachep); - - set_free_obj(page, page->active, objnr); -} + slab->active--; + if (!slab->freelist) + slab->freelist = objp + obj_offset(cachep); -/* - * Map pages beginning at addr to the given cache and slab. 
This is required - * for the slab allocator to be able to lookup the cache and slab of a - * virtual address for kfree, ksize, and slab debugging. - */ -static void slab_map_pages(struct kmem_cache *cache, struct page *page, - void *freelist) -{ - page->slab_cache = cache; - page->freelist = freelist; + set_free_obj(slab, slab->active, objnr); } /* * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. */ -static struct page *cache_grow_begin(struct kmem_cache *cachep, +static struct slab *cache_grow_begin(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *freelist; size_t offset; gfp_t local_flags; - int page_node; + int slab_node; struct kmem_cache_node *n; - struct page *page; + struct slab *slab; /* * Be lazy and only check for valid flags here, keeping it out of the @@ -2590,12 +2581,12 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - page = kmem_getpages(cachep, local_flags, nodeid); - if (!page) + slab = kmem_getpages(cachep, local_flags, nodeid); + if (!slab) goto failed; - page_node = page_to_nid(page); - n = get_node(cachep, page_node); + slab_node = slab_nid(slab); + n = get_node(cachep, slab_node); /* Get colour for the slab, and cal the next value. */ n->colour_next++; @@ -2613,54 +2604,55 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, * page_address() in the latter returns a non-tagged pointer, * as it should be for slab pages. */ - kasan_poison_slab(page); + kasan_poison_slab(slab); /* Get slab management. */ - freelist = alloc_slabmgmt(cachep, page, offset, - local_flags & ~GFP_CONSTRAINT_MASK, page_node); + freelist = alloc_slabmgmt(cachep, slab, offset, + local_flags & ~GFP_CONSTRAINT_MASK, slab_node); if (OFF_SLAB(cachep) && !freelist) goto opps1; - slab_map_pages(cachep, page, freelist); + slab->slab_cache = cachep; + slab->freelist = freelist; - cache_init_objs(cachep, page); + cache_init_objs(cachep, slab); if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); - return page; + return slab; opps1: - kmem_freepages(cachep, page); + kmem_freepages(cachep, slab); failed: if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); return NULL; } -static void cache_grow_end(struct kmem_cache *cachep, struct page *page) +static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) { struct kmem_cache_node *n; void *list = NULL; check_irq_off(); - if (!page) + if (!slab) return; - INIT_LIST_HEAD(&page->slab_list); - n = get_node(cachep, page_to_nid(page)); + INIT_LIST_HEAD(&slab->slab_list); + n = get_node(cachep, slab_nid(slab)); spin_lock(&n->list_lock); n->total_slabs++; - if (!page->active) { - list_add_tail(&page->slab_list, &n->slabs_free); + if (!slab->active) { + list_add_tail(&slab->slab_list, &n->slabs_free); n->free_slabs++; } else - fixup_slab_list(cachep, n, page, &list); + fixup_slab_list(cachep, n, slab, &list); STATS_INC_GROWN(cachep); - n->free_objects += cachep->num - page->active; + n->free_objects += cachep->num - slab->active; spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); @@ -2708,13 +2700,13 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, unsigned long caller) { unsigned int objnr; - struct page *page; + struct slab *slab; BUG_ON(virt_to_cache(objp) != cachep); objp -= obj_offset(cachep); kfree_debugcheck(objp); - page = virt_to_head_page(objp); + slab = virt_to_slab(objp); 
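The slab_get_obj()/slab_put_obj() pair converted above treats that index array as a stack: active counts the objects handed out, and every entry at or beyond freelist[active] names a free object. A toy model of the same discipline follows; the struct name and the fixed capacity are made up for the example and are not the kernel's.

#include <assert.h>
#include <stdio.h>

#define NR_OBJECTS 4

struct toy_slab {
	unsigned char freelist[NR_OBJECTS];	/* object indexes */
	unsigned int active;			/* objects handed out */
};

static unsigned int toy_get(struct toy_slab *s)
{
	assert(s->active < NR_OBJECTS);
	return s->freelist[s->active++];	/* like slab_get_obj() */
}

static void toy_put(struct toy_slab *s, unsigned int objnr)
{
	assert(s->active > 0);
	s->freelist[--s->active] = objnr;	/* like slab_put_obj() */
}

int main(void)
{
	struct toy_slab s = { .freelist = { 0, 1, 2, 3 }, .active = 0 };
	unsigned int a = toy_get(&s), b = toy_get(&s);

	toy_put(&s, a);
	printf("active=%u next=%u\n", s.active, s.freelist[s.active]);
	/* prints active=1 next=0: the freed object is the next one reused */
	(void)b;
	return 0;
}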
if (cachep->flags & SLAB_RED_ZONE) { verify_redzone_free(cachep, objp); @@ -2724,10 +2716,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = (void *)caller; - objnr = obj_to_index(cachep, page, objp); + objnr = obj_to_index(cachep, slab, objp); BUG_ON(objnr >= cachep->num); - BUG_ON(objp != index_to_obj(cachep, page, objnr)); + BUG_ON(objp != index_to_obj(cachep, slab, objnr)); if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); @@ -2757,97 +2749,97 @@ static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, } static inline void fixup_slab_list(struct kmem_cache *cachep, - struct kmem_cache_node *n, struct page *page, + struct kmem_cache_node *n, struct slab *slab, void **list) { /* move slabp to correct slabp list: */ - list_del(&page->slab_list); - if (page->active == cachep->num) { - list_add(&page->slab_list, &n->slabs_full); + list_del(&slab->slab_list); + if (slab->active == cachep->num) { + list_add(&slab->slab_list, &n->slabs_full); if (OBJFREELIST_SLAB(cachep)) { #if DEBUG /* Poisoning will be done without holding the lock */ if (cachep->flags & SLAB_POISON) { - void **objp = page->freelist; + void **objp = slab->freelist; *objp = *list; *list = objp; } #endif - page->freelist = NULL; + slab->freelist = NULL; } } else - list_add(&page->slab_list, &n->slabs_partial); + list_add(&slab->slab_list, &n->slabs_partial); } /* Try to find non-pfmemalloc slab if needed */ -static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, - struct page *page, bool pfmemalloc) +static noinline struct slab *get_valid_first_slab(struct kmem_cache_node *n, + struct slab *slab, bool pfmemalloc) { - if (!page) + if (!slab) return NULL; if (pfmemalloc) - return page; + return slab; - if (!PageSlabPfmemalloc(page)) - return page; + if (!slab_test_pfmemalloc(slab)) + return slab; /* No need to keep pfmemalloc slab if we have enough free objects */ if (n->free_objects > n->free_limit) { - ClearPageSlabPfmemalloc(page); - return page; + slab_clear_pfmemalloc(slab); + return slab; } /* Move pfmemalloc slab to the end of list to speed up next search */ - list_del(&page->slab_list); - if (!page->active) { - list_add_tail(&page->slab_list, &n->slabs_free); + list_del(&slab->slab_list); + if (!slab->active) { + list_add_tail(&slab->slab_list, &n->slabs_free); n->free_slabs++; } else - list_add_tail(&page->slab_list, &n->slabs_partial); + list_add_tail(&slab->slab_list, &n->slabs_partial); - list_for_each_entry(page, &n->slabs_partial, slab_list) { - if (!PageSlabPfmemalloc(page)) - return page; + list_for_each_entry(slab, &n->slabs_partial, slab_list) { + if (!slab_test_pfmemalloc(slab)) + return slab; } n->free_touched = 1; - list_for_each_entry(page, &n->slabs_free, slab_list) { - if (!PageSlabPfmemalloc(page)) { + list_for_each_entry(slab, &n->slabs_free, slab_list) { + if (!slab_test_pfmemalloc(slab)) { n->free_slabs--; - return page; + return slab; } } return NULL; } -static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) +static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { - struct page *page; + struct slab *slab; assert_spin_locked(&n->list_lock); - page = list_first_entry_or_null(&n->slabs_partial, struct page, + slab = list_first_entry_or_null(&n->slabs_partial, struct slab, slab_list); - if (!page) { + if (!slab) { n->free_touched = 1; - page = list_first_entry_or_null(&n->slabs_free, struct page, + slab 
= list_first_entry_or_null(&n->slabs_free, struct slab, slab_list); - if (page) + if (slab) n->free_slabs--; } if (sk_memalloc_socks()) - page = get_valid_first_slab(n, page, pfmemalloc); + slab = get_valid_first_slab(n, slab, pfmemalloc); - return page; + return slab; } static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, struct kmem_cache_node *n, gfp_t flags) { - struct page *page; + struct slab *slab; void *obj; void *list = NULL; @@ -2855,16 +2847,16 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, return NULL; spin_lock(&n->list_lock); - page = get_first_slab(n, true); - if (!page) { + slab = get_first_slab(n, true); + if (!slab) { spin_unlock(&n->list_lock); return NULL; } - obj = slab_get_obj(cachep, page); + obj = slab_get_obj(cachep, slab); n->free_objects--; - fixup_slab_list(cachep, n, page, &list); + fixup_slab_list(cachep, n, slab, &list); spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); @@ -2877,20 +2869,20 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, * or cache_grow_end() for new slab */ static __always_inline int alloc_block(struct kmem_cache *cachep, - struct array_cache *ac, struct page *page, int batchcount) + struct array_cache *ac, struct slab *slab, int batchcount) { /* * There must be at least one object available for * allocation. */ - BUG_ON(page->active >= cachep->num); + BUG_ON(slab->active >= cachep->num); - while (page->active < cachep->num && batchcount--) { + while (slab->active < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac->entry[ac->avail++] = slab_get_obj(cachep, page); + ac->entry[ac->avail++] = slab_get_obj(cachep, slab); } return batchcount; @@ -2903,7 +2895,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) struct array_cache *ac, *shared; int node; void *list = NULL; - struct page *page; + struct slab *slab; check_irq_off(); node = numa_mem_id(); @@ -2936,14 +2928,14 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) while (batchcount > 0) { /* Get slab alloc is to come from. */ - page = get_first_slab(n, false); - if (!page) + slab = get_first_slab(n, false); + if (!slab) goto must_grow; check_spinlock_acquired(cachep); - batchcount = alloc_block(cachep, ac, page, batchcount); - fixup_slab_list(cachep, n, page, &list); + batchcount = alloc_block(cachep, ac, slab, batchcount); + fixup_slab_list(cachep, n, slab, &list); } must_grow: @@ -2962,16 +2954,16 @@ direct_grow: return obj; } - page = cache_grow_begin(cachep, gfp_exact_node(flags), node); + slab = cache_grow_begin(cachep, gfp_exact_node(flags), node); /* * cache_grow_begin() can reenable interrupts, * then ac could change. */ ac = cpu_cache_get(cachep); - if (!ac->avail && page) - alloc_block(cachep, ac, page, batchcount); - cache_grow_end(cachep, page); + if (!ac->avail && slab) + alloc_block(cachep, ac, slab, batchcount); + cache_grow_end(cachep, slab); if (!ac->avail) return NULL; @@ -3101,7 +3093,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(flags); void *obj = NULL; - struct page *page; + struct slab *slab; int nid; unsigned int cpuset_mems_cookie; @@ -3137,10 +3129,10 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. 
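get_valid_first_slab() above avoids handing pfmemalloc (memory-reserve) slabs to ordinary allocations, and drops the flag once the node has enough free objects again. A much-simplified model of that decision, using toy stand-ins for the per-slab flag and per-node counters (none of these types are the kernel's):

#include <stdbool.h>
#include <stdio.h>

struct toy_slab { bool pfmemalloc; };
struct toy_node { unsigned long free_objects, free_limit; };

/* Return slab if this allocation may use it, NULL to keep searching. */
static struct toy_slab *toy_valid_first_slab(struct toy_node *n,
					     struct toy_slab *slab,
					     bool pfmemalloc_alloc)
{
	if (!slab || pfmemalloc_alloc)
		return slab;			/* reserve users may take anything */

	if (!slab->pfmemalloc)
		return slab;			/* ordinary slab, fine */

	if (n->free_objects > n->free_limit) {
		slab->pfmemalloc = false;	/* enough slack: unreserve it */
		return slab;
	}

	return NULL;				/* keep it for reserve users */
}

int main(void)
{
	struct toy_node n = { .free_objects = 10, .free_limit = 100 };
	struct toy_slab s = { .pfmemalloc = true };

	printf("%s\n", toy_valid_first_slab(&n, &s, false) ? "usable" : "skip");
	return 0;
}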
*/ - page = cache_grow_begin(cache, flags, numa_mem_id()); - cache_grow_end(cache, page); - if (page) { - nid = page_to_nid(page); + slab = cache_grow_begin(cache, flags, numa_mem_id()); + cache_grow_end(cache, slab); + if (slab) { + nid = slab_nid(slab); obj = ____cache_alloc_node(cache, gfp_exact_node(flags), nid); @@ -3164,7 +3156,7 @@ retry: static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - struct page *page; + struct slab *slab; struct kmem_cache_node *n; void *obj = NULL; void *list = NULL; @@ -3175,8 +3167,8 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, check_irq_off(); spin_lock(&n->list_lock); - page = get_first_slab(n, false); - if (!page) + slab = get_first_slab(n, false); + if (!slab) goto must_grow; check_spinlock_acquired_node(cachep, nodeid); @@ -3185,12 +3177,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - BUG_ON(page->active == cachep->num); + BUG_ON(slab->active == cachep->num); - obj = slab_get_obj(cachep, page); + obj = slab_get_obj(cachep, slab); n->free_objects--; - fixup_slab_list(cachep, n, page, &list); + fixup_slab_list(cachep, n, slab, &list); spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); @@ -3198,12 +3190,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, must_grow: spin_unlock(&n->list_lock); - page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); - if (page) { + slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); + if (slab) { /* This slab isn't counted yet so don't update free_objects */ - obj = slab_get_obj(cachep, page); + obj = slab_get_obj(cachep, slab); } - cache_grow_end(cachep, page); + cache_grow_end(cachep, slab); return obj ? obj : fallback_alloc(cachep, flags); } @@ -3333,40 +3325,40 @@ static void free_block(struct kmem_cache *cachep, void **objpp, { int i; struct kmem_cache_node *n = get_node(cachep, node); - struct page *page; + struct slab *slab; n->free_objects += nr_objects; for (i = 0; i < nr_objects; i++) { void *objp; - struct page *page; + struct slab *slab; objp = objpp[i]; - page = virt_to_head_page(objp); - list_del(&page->slab_list); + slab = virt_to_slab(objp); + list_del(&slab->slab_list); check_spinlock_acquired_node(cachep, node); - slab_put_obj(cachep, page, objp); + slab_put_obj(cachep, slab, objp); STATS_DEC_ACTIVE(cachep); /* fixup slab chains */ - if (page->active == 0) { - list_add(&page->slab_list, &n->slabs_free); + if (slab->active == 0) { + list_add(&slab->slab_list, &n->slabs_free); n->free_slabs++; } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. 
*/ - list_add_tail(&page->slab_list, &n->slabs_partial); + list_add_tail(&slab->slab_list, &n->slabs_partial); } } while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { n->free_objects -= cachep->num; - page = list_last_entry(&n->slabs_free, struct page, slab_list); - list_move(&page->slab_list, list); + slab = list_last_entry(&n->slabs_free, struct slab, slab_list); + list_move(&slab->slab_list, list); n->free_slabs--; n->total_slabs--; } @@ -3402,10 +3394,10 @@ free_done: #if STATS { int i = 0; - struct page *page; + struct slab *slab; - list_for_each_entry(page, &n->slabs_free, slab_list) { - BUG_ON(page->active); + list_for_each_entry(slab, &n->slabs_free, slab_list) { + BUG_ON(slab->active); i++; } @@ -3481,10 +3473,10 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, } if (sk_memalloc_socks()) { - struct page *page = virt_to_head_page(objp); + struct slab *slab = virt_to_slab(objp); - if (unlikely(PageSlabPfmemalloc(page))) { - cache_free_pfmemalloc(cachep, page, objp); + if (unlikely(slab_test_pfmemalloc(slab))) { + cache_free_pfmemalloc(cachep, slab, objp); return; } } @@ -3657,21 +3649,21 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif /* CONFIG_NUMA */ #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { struct kmem_cache *cachep; unsigned int objnr; void *objp; kpp->kp_ptr = object; - kpp->kp_page = page; - cachep = page->slab_cache; + kpp->kp_slab = slab; + cachep = slab->slab_cache; kpp->kp_slab_cache = cachep; objp = object - obj_offset(cachep); kpp->kp_data_offset = obj_offset(cachep); - page = virt_to_head_page(objp); - objnr = obj_to_index(cachep, page, objp); - objp = index_to_obj(cachep, page, objnr); + slab = virt_to_slab(objp); + objnr = obj_to_index(cachep, slab, objp); + objp = index_to_obj(cachep, slab, objnr); kpp->kp_objp = objp; if (DEBUG && cachep->flags & SLAB_STORE_USER) kpp->kp_ret = *dbg_userword(cachep, objp); @@ -4177,8 +4169,8 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, * Returns NULL if check passes, otherwise const char * to name of cache * to indicate an error. */ -void __check_heap_object(const void *ptr, unsigned long n, struct page *page, - bool to_user) +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user) { struct kmem_cache *cachep; unsigned int objnr; @@ -4187,15 +4179,15 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, ptr = kasan_reset_tag(ptr); /* Find and validate object. */ - cachep = page->slab_cache; - objnr = obj_to_index(cachep, page, (void *)ptr); + cachep = slab->slab_cache; + objnr = obj_to_index(cachep, slab, (void *)ptr); BUG_ON(objnr >= cachep->num); /* Find offset within object. */ if (is_kfence_address(ptr)) offset = ptr - kfence_object_start(ptr); else - offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); + offset = ptr - index_to_obj(cachep, slab, objnr) - obj_offset(cachep); /* Allow address range falling entirely within usercopy region. 
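__check_heap_object() above boils a usercopy down to "which object, and at what offset inside it"; the range test that resumes right after this sketch then only accepts copies that fall entirely inside the cache's declared usercopy window. The core arithmetic in isolation, with invented sizes and addresses, and ignoring debug padding and the kfence case:

#include <stdbool.h>
#include <stdio.h>

/* All values below are invented for the example. */
#define OBJ_SIZE	256UL	/* stride between objects (s->size) */
#define USEROFFSET	 64UL	/* start of the whitelisted region */
#define USERSIZE	128UL	/* length of the whitelisted region */

static bool copy_allowed(unsigned long offset_in_obj, unsigned long n)
{
	return offset_in_obj >= USEROFFSET &&
	       n <= USERSIZE - (offset_in_obj - USEROFFSET);
}

int main(void)
{
	unsigned long base = 0x1000, ptr = 0x10a0, n = 32;

	unsigned long objnr  = (ptr - base) / OBJ_SIZE;		/* which object */
	unsigned long offset = (ptr - base) - objnr * OBJ_SIZE;	/* offset inside it */

	printf("objnr=%lu offset=%lu -> %s\n", objnr, offset,
	       copy_allowed(offset, n) ? "allowed" : "rejected");
	return 0;
}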
*/ if (offset >= cachep->useroffset && diff --git a/mm/slab.h b/mm/slab.h index 053eefaf6cbd..7edb7d23f141 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -5,6 +5,197 @@ * Internal slab definitions */ +/* Reuses the bits in struct page */ +struct slab { + unsigned long __page_flags; + +#if defined(CONFIG_SLAB) + + union { + struct list_head slab_list; + struct rcu_head rcu_head; + }; + struct kmem_cache *slab_cache; + void *freelist; /* array of free object indexes */ + void *s_mem; /* first object */ + unsigned int active; + +#elif defined(CONFIG_SLUB) + + union { + struct list_head slab_list; + struct rcu_head rcu_head; +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct { + struct slab *next; + int slabs; /* Nr of slabs left */ + }; +#endif + }; + struct kmem_cache *slab_cache; + /* Double-word boundary */ + void *freelist; /* first free object */ + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + unsigned int __unused; + +#elif defined(CONFIG_SLOB) + + struct list_head slab_list; + void *__unused_1; + void *freelist; /* first free block */ + long units; + unsigned int __unused_2; + +#else +#error "Unexpected slab allocator configured" +#endif + + atomic_t __page_refcount; +#ifdef CONFIG_MEMCG + unsigned long memcg_data; +#endif +}; + +#define SLAB_MATCH(pg, sl) \ + static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) +SLAB_MATCH(flags, __page_flags); +SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ +SLAB_MATCH(slab_list, slab_list); +#ifndef CONFIG_SLOB +SLAB_MATCH(rcu_head, rcu_head); +SLAB_MATCH(slab_cache, slab_cache); +#endif +#ifdef CONFIG_SLAB +SLAB_MATCH(s_mem, s_mem); +SLAB_MATCH(active, active); +#endif +SLAB_MATCH(_refcount, __page_refcount); +#ifdef CONFIG_MEMCG +SLAB_MATCH(memcg_data, memcg_data); +#endif +#undef SLAB_MATCH +static_assert(sizeof(struct slab) <= sizeof(struct page)); + +/** + * folio_slab - Converts from folio to slab. + * @folio: The folio. + * + * Currently struct slab is a different representation of a folio where + * folio_test_slab() is true. + * + * Return: The slab which contains this folio. + */ +#define folio_slab(folio) (_Generic((folio), \ + const struct folio *: (const struct slab *)(folio), \ + struct folio *: (struct slab *)(folio))) + +/** + * slab_folio - The folio allocated for a slab + * @slab: The slab. + * + * Slabs are allocated as folios that contain the individual objects and are + * using some fields in the first struct page of the folio - those fields are + * now accessed by struct slab. It is occasionally necessary to convert back to + * a folio in order to communicate with the rest of the mm. Please use this + * helper function instead of casting yourself, as the implementation may change + * in the future. + */ +#define slab_folio(s) (_Generic((s), \ + const struct slab *: (const struct folio *)s, \ + struct slab *: (struct folio *)s)) + +/** + * page_slab - Converts from first struct page to slab. + * @p: The first (either head of compound or single) page of slab. + * + * A temporary wrapper to convert struct page to struct slab in situations where + * we know the page is the compound head, or single order-0 page. + * + * Long-term ideally everything would work with struct slab directly or go + * through folio to struct slab. 
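The struct slab introduced above is only safe because every field it reuses sits at exactly the same offset as its struct page counterpart, which the SLAB_MATCH() assertions enforce at compile time. The same pattern works for any pair of overlaid structures; the two types below are invented purely to illustrate it.

#include <assert.h>	/* static_assert (C11) */
#include <stddef.h>	/* offsetof */

struct backing {		/* plays the role of struct page */
	unsigned long flags;
	void *first;
	void *second;
	int refcount;
};

struct view {			/* plays the role of struct slab */
	unsigned long __flags;
	void *freelist;
	void *mem;
	int __refcount;
};

#define MATCH(b, v) \
	static_assert(offsetof(struct backing, b) == offsetof(struct view, v), \
		      "layout mismatch: " #b " vs " #v)

MATCH(flags, __flags);
MATCH(first, freelist);
MATCH(second, mem);
MATCH(refcount, __refcount);
static_assert(sizeof(struct view) <= sizeof(struct backing),
	      "view must not outgrow its backing type");

int main(void) { return 0; }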
+ * + * Return: The slab which contains this page + */ +#define page_slab(p) (_Generic((p), \ + const struct page *: (const struct slab *)(p), \ + struct page *: (struct slab *)(p))) + +/** + * slab_page - The first struct page allocated for a slab + * @slab: The slab. + * + * A convenience wrapper for converting slab to the first struct page of the + * underlying folio, to communicate with code not yet converted to folio or + * struct slab. + */ +#define slab_page(s) folio_page(slab_folio(s), 0) + +/* + * If network-based swap is enabled, sl*b must keep track of whether pages + * were allocated from pfmemalloc reserves. + */ +static inline bool slab_test_pfmemalloc(const struct slab *slab) +{ + return folio_test_active((struct folio *)slab_folio(slab)); +} + +static inline void slab_set_pfmemalloc(struct slab *slab) +{ + folio_set_active(slab_folio(slab)); +} + +static inline void slab_clear_pfmemalloc(struct slab *slab) +{ + folio_clear_active(slab_folio(slab)); +} + +static inline void __slab_clear_pfmemalloc(struct slab *slab) +{ + __folio_clear_active(slab_folio(slab)); +} + +static inline void *slab_address(const struct slab *slab) +{ + return folio_address(slab_folio(slab)); +} + +static inline int slab_nid(const struct slab *slab) +{ + return folio_nid(slab_folio(slab)); +} + +static inline pg_data_t *slab_pgdat(const struct slab *slab) +{ + return folio_pgdat(slab_folio(slab)); +} + +static inline struct slab *virt_to_slab(const void *addr) +{ + struct folio *folio = virt_to_folio(addr); + + if (!folio_test_slab(folio)) + return NULL; + + return folio_slab(folio); +} + +static inline int slab_order(const struct slab *slab) +{ + return folio_order((struct folio *)slab_folio(slab)); +} + +static inline size_t slab_size(const struct slab *slab) +{ + return PAGE_SIZE << slab_order(slab); +} + #ifdef CONFIG_SLOB /* * Common fields provided in kmem_cache by all slab allocators @@ -245,15 +436,33 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla } #ifdef CONFIG_MEMCG_KMEM -int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp, bool new_page); +/* + * slab_objcgs - get the object cgroups vector associated with a slab + * @slab: a pointer to the slab struct + * + * Returns a pointer to the object cgroups vector associated with the slab, + * or NULL if no such vector has been associated yet. 
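folio_slab(), slab_folio() and page_slab() above use _Generic so that converting a const pointer yields a const pointer without needing a second helper. A stand-alone sketch of that const-preserving cast with made-up types:

struct outer { int x; };	/* plays the role of struct folio */
struct inner { int x; };	/* plays the role of struct slab  */

/* Const-preserving cast: const in, const out; non-const in, non-const out. */
#define outer_to_inner(o) (_Generic((o),			\
	const struct outer *:	(const struct inner *)(o),	\
	struct outer *:		(struct inner *)(o)))

int main(void)
{
	struct outer o = { 42 };
	const struct outer *co = &o;

	struct inner *i = outer_to_inner(&o);		/* non-const in, non-const out */
	const struct inner *ci = outer_to_inner(co);	/* const in, stays const */

	/* struct inner *bad = outer_to_inner(co);  -- diagnosed: discards const */
	return (const void *)i == (const void *)ci ? 0 : 1;
}

The commented-out line is the point of the trick: handing the const result to a non-const pointer is diagnosed by the compiler instead of being silently accepted, which a plain cast helper could not guarantee.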
+ */ +static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +{ + unsigned long memcg_data = READ_ONCE(slab->memcg_data); + + VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), + slab_page(slab)); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab); void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr); -static inline void memcg_free_page_obj_cgroups(struct page *page) +static inline void memcg_free_slab_cgroups(struct slab *slab) { - kfree(page_objcgs(page)); - page->memcg_data = 0; + kfree(slab_objcgs(slab)); + slab->memcg_data = 0; } static inline size_t obj_full_size(struct kmem_cache *s) @@ -298,7 +507,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { - struct page *page; + struct slab *slab; unsigned long off; size_t i; @@ -307,19 +516,19 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, for (i = 0; i < size; i++) { if (likely(p[i])) { - page = virt_to_head_page(p[i]); + slab = virt_to_slab(p[i]); - if (!page_objcgs(page) && - memcg_alloc_page_obj_cgroups(page, s, flags, + if (!slab_objcgs(slab) && + memcg_alloc_slab_cgroups(slab, s, flags, false)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } - off = obj_to_index(s, page, p[i]); + off = obj_to_index(s, slab, p[i]); obj_cgroup_get(objcg); - page_objcgs(page)[off] = objcg; - mod_objcg_state(objcg, page_pgdat(page), + slab_objcgs(slab)[off] = objcg; + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), obj_full_size(s)); } else { obj_cgroup_uncharge(objcg, obj_full_size(s)); @@ -334,7 +543,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, struct kmem_cache *s; struct obj_cgroup **objcgs; struct obj_cgroup *objcg; - struct page *page; + struct slab *slab; unsigned int off; int i; @@ -345,43 +554,52 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, if (unlikely(!p[i])) continue; - page = virt_to_head_page(p[i]); - objcgs = page_objcgs_check(page); + slab = virt_to_slab(p[i]); + /* we could be given a kmalloc_large() object, skip those */ + if (!slab) + continue; + + objcgs = slab_objcgs(slab); if (!objcgs) continue; if (!s_orig) - s = page->slab_cache; + s = slab->slab_cache; else s = s_orig; - off = obj_to_index(s, page, p[i]); + off = obj_to_index(s, slab, p[i]); objcg = objcgs[off]; if (!objcg) continue; objcgs[off] = NULL; obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), -obj_full_size(s)); obj_cgroup_put(objcg); } } #else /* CONFIG_MEMCG_KMEM */ +static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +{ + return NULL; +} + static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { return NULL; } -static inline int memcg_alloc_page_obj_cgroups(struct page *page, +static inline int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, gfp_t gfp, - bool new_page) + bool new_slab) { return 0; } -static inline void memcg_free_page_obj_cgroups(struct page *page) +static inline void memcg_free_slab_cgroups(struct slab *slab) { } @@ -405,35 +623,35 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, } #endif /* CONFIG_MEMCG_KMEM */ +#ifndef CONFIG_SLOB static inline 
struct kmem_cache *virt_to_cache(const void *obj) { - struct page *page; + struct slab *slab; - page = virt_to_head_page(obj); - if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", + slab = virt_to_slab(obj); + if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) return NULL; - return page->slab_cache; + return slab->slab_cache; } -static __always_inline void account_slab_page(struct page *page, int order, - struct kmem_cache *s, - gfp_t gfp) +static __always_inline void account_slab(struct slab *slab, int order, + struct kmem_cache *s, gfp_t gfp) { if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT)) - memcg_alloc_page_obj_cgroups(page, s, gfp, true); + memcg_alloc_slab_cgroups(slab, s, gfp, true); - mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), PAGE_SIZE << order); } -static __always_inline void unaccount_slab_page(struct page *page, int order, - struct kmem_cache *s) +static __always_inline void unaccount_slab(struct slab *slab, int order, + struct kmem_cache *s) { if (memcg_kmem_enabled()) - memcg_free_page_obj_cgroups(page); + memcg_free_slab_cgroups(slab); - mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); } @@ -452,6 +670,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) print_tracking(cachep, x); return cachep; } +#endif /* CONFIG_SLOB */ static inline size_t slab_ksize(const struct kmem_cache *s) { @@ -630,7 +849,7 @@ static inline void debugfs_slab_release(struct kmem_cache *s) { } #define KS_ADDRS_COUNT 16 struct kmem_obj_info { void *kp_ptr; - struct page *kp_page; + struct slab *kp_slab; void *kp_objp; unsigned long kp_data_offset; struct kmem_cache *kp_slab_cache; @@ -638,7 +857,18 @@ struct kmem_obj_info { void *kp_stack[KS_ADDRS_COUNT]; void *kp_free_stack[KS_ADDRS_COUNT]; }; -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); +#endif + +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user); +#else +static inline +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user) +{ +} #endif #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 9513244457e6..23f2ab0713b7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -545,13 +545,13 @@ bool slab_is_available(void) */ bool kmem_valid_obj(void *object) { - struct page *page; + struct folio *folio; /* Some arches consider ZERO_SIZE_PTR to be a valid address. */ if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) return false; - page = virt_to_head_page(object); - return PageSlab(page); + folio = virt_to_folio(object); + return folio_test_slab(folio); } EXPORT_SYMBOL_GPL(kmem_valid_obj); @@ -574,18 +574,18 @@ void kmem_dump_obj(void *object) { char *cp = IS_ENABLED(CONFIG_MMU) ? 
"" : "/vmalloc"; int i; - struct page *page; + struct slab *slab; unsigned long ptroffset; struct kmem_obj_info kp = { }; if (WARN_ON_ONCE(!virt_addr_valid(object))) return; - page = virt_to_head_page(object); - if (WARN_ON_ONCE(!PageSlab(page))) { + slab = virt_to_slab(object); + if (WARN_ON_ONCE(!slab)) { pr_cont(" non-slab memory.\n"); return; } - kmem_obj_info(&kp, object, page); + kmem_obj_info(&kp, object, slab); if (kp.kp_slab_cache) pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); else diff --git a/mm/slob.c b/mm/slob.c index 03deee1e6a94..60c5842215f1 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -30,7 +30,7 @@ * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls * alloc_pages() directly, allocating compound pages so the page order * does not have to be separately tracked. - * These objects are detected in kfree() because PageSlab() + * These objects are detected in kfree() because folio_test_slab() * is false for them. * * SLAB is emulated on top of SLOB by simply calling constructors and @@ -105,21 +105,21 @@ static LIST_HEAD(free_slob_large); /* * slob_page_free: true for pages on free_slob_pages list. */ -static inline int slob_page_free(struct page *sp) +static inline int slob_page_free(struct slab *slab) { - return PageSlobFree(sp); + return PageSlobFree(slab_page(slab)); } -static void set_slob_page_free(struct page *sp, struct list_head *list) +static void set_slob_page_free(struct slab *slab, struct list_head *list) { - list_add(&sp->slab_list, list); - __SetPageSlobFree(sp); + list_add(&slab->slab_list, list); + __SetPageSlobFree(slab_page(slab)); } -static inline void clear_slob_page_free(struct page *sp) +static inline void clear_slob_page_free(struct slab *slab) { - list_del(&sp->slab_list); - __ClearPageSlobFree(sp); + list_del(&slab->slab_list); + __ClearPageSlobFree(slab_page(slab)); } #define SLOB_UNIT sizeof(slob_t) @@ -234,7 +234,7 @@ static void slob_free_pages(void *b, int order) * freelist, in this case @page_removed_from_list will be set to * true (set to false otherwise). */ -static void *slob_page_alloc(struct page *sp, size_t size, int align, +static void *slob_page_alloc(struct slab *sp, size_t size, int align, int align_offset, bool *page_removed_from_list) { slob_t *prev, *cur, *aligned = NULL; @@ -301,7 +301,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align, static void *slob_alloc(size_t size, gfp_t gfp, int align, int node, int align_offset) { - struct page *sp; + struct folio *folio; + struct slab *sp; struct list_head *slob_list; slob_t *b = NULL; unsigned long flags; @@ -323,7 +324,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node, * If there's a node specification, search for a partial * page with a matching node id in the freelist. */ - if (node != NUMA_NO_NODE && page_to_nid(sp) != node) + if (node != NUMA_NO_NODE && slab_nid(sp) != node) continue; #endif /* Enough room on this page? 
*/ @@ -358,8 +359,9 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node, b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); if (!b) return NULL; - sp = virt_to_page(b); - __SetPageSlab(sp); + folio = virt_to_folio(b); + __folio_set_slab(folio); + sp = folio_slab(folio); spin_lock_irqsave(&slob_lock, flags); sp->units = SLOB_UNITS(PAGE_SIZE); @@ -381,7 +383,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node, */ static void slob_free(void *block, int size) { - struct page *sp; + struct slab *sp; slob_t *prev, *next, *b = (slob_t *)block; slobidx_t units; unsigned long flags; @@ -391,7 +393,7 @@ static void slob_free(void *block, int size) return; BUG_ON(!size); - sp = virt_to_page(block); + sp = virt_to_slab(block); units = SLOB_UNITS(size); spin_lock_irqsave(&slob_lock, flags); @@ -401,8 +403,7 @@ static void slob_free(void *block, int size) if (slob_page_free(sp)) clear_slob_page_free(sp); spin_unlock_irqrestore(&slob_lock, flags); - __ClearPageSlab(sp); - page_mapcount_reset(sp); + __folio_clear_slab(slab_folio(sp)); slob_free_pages(b, 0); return; } @@ -462,10 +463,10 @@ out: } #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { kpp->kp_ptr = object; - kpp->kp_page = page; + kpp->kp_slab = slab; } #endif @@ -544,7 +545,7 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller); void kfree(const void *block) { - struct page *sp; + struct folio *sp; trace_kfree(_RET_IP_, block); @@ -552,16 +553,17 @@ void kfree(const void *block) return; kmemleak_free(block); - sp = virt_to_page(block); - if (PageSlab(sp)) { + sp = virt_to_folio(block); + if (folio_test_slab(sp)) { int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); } else { - unsigned int order = compound_order(sp); - mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B, + unsigned int order = folio_order(sp); + + mod_node_page_state(folio_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); - __free_pages(sp, order); + __free_pages(folio_page(sp, 0), order); } } @@ -570,7 +572,7 @@ EXPORT_SYMBOL(kfree); /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ size_t __ksize(const void *block) { - struct page *sp; + struct folio *folio; int align; unsigned int *m; @@ -578,9 +580,9 @@ size_t __ksize(const void *block) if (unlikely(block == ZERO_SIZE_PTR)) return 0; - sp = virt_to_page(block); - if (unlikely(!PageSlab(sp))) - return page_size(sp); + folio = virt_to_folio(block); + if (unlikely(!folio_test_slab(folio))) + return folio_size(folio); align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); m = (unsigned int *)(block - align); diff --git a/mm/slub.c b/mm/slub.c index abe7db581d68..261474092e43 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -48,7 +48,7 @@ * 1. slab_mutex (Global Mutex) * 2. node->list_lock (Spinlock) * 3. kmem_cache->cpu_slab->lock (Local lock) - * 4. slab_lock(page) (Only on some arches or for debugging) + * 4. slab_lock(slab) (Only on some arches or for debugging) * 5. object_map_lock (Only for debugging) * * slab_mutex @@ -64,19 +64,19 @@ * * The slab_lock is only used for debugging and on arches that do not * have the ability to do a cmpxchg_double. It only protects: - * A. page->freelist -> List of object free in a page - * B. page->inuse -> Number of objects in use - * C. page->objects -> Number of objects in page - * D. 
page->frozen -> frozen state + * A. slab->freelist -> List of free objects in a slab + * B. slab->inuse -> Number of objects in use + * C. slab->objects -> Number of objects in slab + * D. slab->frozen -> frozen state * * Frozen slabs * * If a slab is frozen then it is exempt from list management. It is not * on any list except per cpu partial list. The processor that froze the - * slab is the one who can perform list operations on the page. Other + * slab is the one who can perform list operations on the slab. Other * processors may put objects onto the freelist but the processor that * froze the slab is the only one that can retrieve the objects from the - * page's freelist. + * slab's freelist. * * list_lock * @@ -135,7 +135,7 @@ * minimal so we rely on the page allocators per cpu caches for * fast frees and allocs. * - * page->frozen The slab is frozen and exempt from list processing. + * slab->frozen The slab is frozen and exempt from list processing. * This means that the slab is dedicated to a purpose * such as satisfying allocations for a specific * processor. Objects may be freed in the slab while @@ -250,7 +250,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) -#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ +#define MAX_OBJS_PER_PAGE 32767 /* since slab.objects is u15 */ /* Internal SLUB flags */ /* Poison object */ @@ -417,18 +417,18 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x) #ifdef CONFIG_SLUB_CPU_PARTIAL static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { - unsigned int nr_pages; + unsigned int nr_slabs; s->cpu_partial = nr_objects; /* * We take the number of objects but actually limit the number of - * pages on the per cpu partial list, in order to limit excessive - * growth of the list. For simplicity we assume that the pages will + * slabs on the per cpu partial list, in order to limit excessive + * growth of the list. For simplicity we assume that the slabs will * be half-full. 
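The rename from cpu_partial_pages to cpu_partial_slabs keeps the existing sizing rule: the object budget is turned into a slab budget on the assumption that partial slabs are half full, as the DIV_ROUND_UP() assignment just below does. A worked instance with invented numbers:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	/* Invented cache geometry: 30 objects per slab, budget of 120 objects. */
	unsigned int objects_per_slab = 30;
	unsigned int nr_objects = 120;

	/* Assume half-full slabs, so budget room for twice the objects. */
	unsigned int nr_slabs = DIV_ROUND_UP(nr_objects * 2, objects_per_slab);

	printf("cpu_partial_slabs = %u\n", nr_slabs);	/* 240 / 30 = 8 */
	return 0;
}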
*/ - nr_pages = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); - s->cpu_partial_pages = nr_pages; + nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); + s->cpu_partial_slabs = nr_slabs; } #else static inline void @@ -440,28 +440,32 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) /* * Per slab locking using the pagelock */ -static __always_inline void __slab_lock(struct page *page) +static __always_inline void __slab_lock(struct slab *slab) { + struct page *page = slab_page(slab); + VM_BUG_ON_PAGE(PageTail(page), page); bit_spin_lock(PG_locked, &page->flags); } -static __always_inline void __slab_unlock(struct page *page) +static __always_inline void __slab_unlock(struct slab *slab) { + struct page *page = slab_page(slab); + VM_BUG_ON_PAGE(PageTail(page), page); __bit_spin_unlock(PG_locked, &page->flags); } -static __always_inline void slab_lock(struct page *page, unsigned long *flags) +static __always_inline void slab_lock(struct slab *slab, unsigned long *flags) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_save(*flags); - __slab_lock(page); + __slab_lock(slab); } -static __always_inline void slab_unlock(struct page *page, unsigned long *flags) +static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags) { - __slab_unlock(page); + __slab_unlock(slab); if (IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_restore(*flags); } @@ -471,7 +475,7 @@ static __always_inline void slab_unlock(struct page *page, unsigned long *flags) * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different * so we disable interrupts as part of slab_[un]lock(). */ -static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, +static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) @@ -481,7 +485,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&page->freelist, &page->counters, + if (cmpxchg_double(&slab->freelist, &slab->counters, freelist_old, counters_old, freelist_new, counters_new)) return true; @@ -491,15 +495,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page /* init to 0 to prevent spurious warnings */ unsigned long flags = 0; - slab_lock(page, &flags); - if (page->freelist == freelist_old && - page->counters == counters_old) { - page->freelist = freelist_new; - page->counters = counters_new; - slab_unlock(page, &flags); + slab_lock(slab, &flags); + if (slab->freelist == freelist_old && + slab->counters == counters_old) { + slab->freelist = freelist_new; + slab->counters = counters_new; + slab_unlock(slab, &flags); return true; } - slab_unlock(page, &flags); + slab_unlock(slab, &flags); } cpu_relax(); @@ -512,7 +516,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page return false; } -static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, +static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) @@ -520,7 +524,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if 
(s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&page->freelist, &page->counters, + if (cmpxchg_double(&slab->freelist, &slab->counters, freelist_old, counters_old, freelist_new, counters_new)) return true; @@ -530,16 +534,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, unsigned long flags; local_irq_save(flags); - __slab_lock(page); - if (page->freelist == freelist_old && - page->counters == counters_old) { - page->freelist = freelist_new; - page->counters = counters_new; - __slab_unlock(page); + __slab_lock(slab); + if (slab->freelist == freelist_old && + slab->counters == counters_old) { + slab->freelist = freelist_new; + slab->counters = counters_new; + __slab_unlock(slab); local_irq_restore(flags); return true; } - __slab_unlock(page); + __slab_unlock(slab); local_irq_restore(flags); } @@ -558,14 +562,14 @@ static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_RAW_SPINLOCK(object_map_lock); static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, - struct page *page) + struct slab *slab) { - void *addr = page_address(page); + void *addr = slab_address(slab); void *p; - bitmap_zero(obj_map, page->objects); + bitmap_zero(obj_map, slab->objects); - for (p = page->freelist; p; p = get_freepointer(s, p)) + for (p = slab->freelist; p; p = get_freepointer(s, p)) set_bit(__obj_to_index(s, addr, p), obj_map); } @@ -590,19 +594,19 @@ static inline bool slab_add_kunit_errors(void) { return false; } #endif /* - * Determine a map of object in use on a page. + * Determine a map of objects in use in a slab. * - * Node listlock must be held to guarantee that the page does + * Node listlock must be held to guarantee that the slab does * not vanish from under us. */ -static unsigned long *get_map(struct kmem_cache *s, struct page *page) +static unsigned long *get_map(struct kmem_cache *s, struct slab *slab) __acquires(&object_map_lock) { VM_BUG_ON(!irqs_disabled()); raw_spin_lock(&object_map_lock); - __fill_map(object_map, s, page); + __fill_map(object_map, s, slab); return object_map; } @@ -663,17 +667,17 @@ static inline void metadata_access_disable(void) /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, void *object) + struct slab *slab, void *object) { void *base; if (!object) return 1; - base = page_address(page); + base = slab_address(slab); object = kasan_reset_tag(object); object = restore_red_left(s, object); - if (object < base || object >= base + page->objects * s->size || + if (object < base || object >= base + slab->objects * s->size || (object - base) % s->size) { return 0; } @@ -784,12 +788,13 @@ void print_tracking(struct kmem_cache *s, void *object) print_track("Freed", get_track(s, object, TRACK_FREE), pr_time); } -static void print_page_info(struct page *page) +static void print_slab_info(const struct slab *slab) { - pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", - page, page->objects, page->inuse, page->freelist, - &page->flags); + struct folio *folio = (struct folio *)slab_folio(slab); + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", + slab, slab->objects, slab->inuse, slab->freelist, + folio_flags(folio, 0)); } static void slab_bug(struct kmem_cache *s, char *fmt, ...) @@ -822,28 +827,14 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
va_end(args); } -static bool freelist_corrupted(struct kmem_cache *s, struct page *page, - void **freelist, void *nextfree) -{ - if ((s->flags & SLAB_CONSISTENCY_CHECKS) && - !check_valid_pointer(s, page, nextfree) && freelist) { - object_err(s, page, *freelist, "Freechain corrupt"); - *freelist = NULL; - slab_fix(s, "Isolate corrupted freechain"); - return true; - } - - return false; -} - -static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) +static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned int off; /* Offset of last byte */ - u8 *addr = page_address(page); + u8 *addr = slab_address(slab); print_tracking(s, p); - print_page_info(page); + print_slab_info(slab); pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); @@ -875,18 +866,32 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) dump_stack(); } -void object_err(struct kmem_cache *s, struct page *page, +static void object_err(struct kmem_cache *s, struct slab *slab, u8 *object, char *reason) { if (slab_add_kunit_errors()) return; slab_bug(s, "%s", reason); - print_trailer(s, page, object); + print_trailer(s, slab, object); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } -static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, +static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, + void **freelist, void *nextfree) +{ + if ((s->flags & SLAB_CONSISTENCY_CHECKS) && + !check_valid_pointer(s, slab, nextfree) && freelist) { + object_err(s, slab, *freelist, "Freechain corrupt"); + *freelist = NULL; + slab_fix(s, "Isolate corrupted freechain"); + return true; + } + + return false; +} + +static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, const char *fmt, ...) { va_list args; @@ -899,7 +904,7 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); slab_bug(s, "%s", buf); - print_page_info(page); + print_slab_info(slab); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } @@ -927,13 +932,13 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data, memset(from, data, to - from); } -static int check_bytes_and_report(struct kmem_cache *s, struct page *page, +static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, u8 *object, char *what, u8 *start, unsigned int value, unsigned int bytes) { u8 *fault; u8 *end; - u8 *addr = page_address(page); + u8 *addr = slab_address(slab); metadata_access_enable(); fault = memchr_inv(kasan_reset_tag(start), value, bytes); @@ -952,7 +957,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); - print_trailer(s, page, object); + print_trailer(s, slab, object); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); skip_bug_print: @@ -998,7 +1003,7 @@ skip_bug_print: * may be used with merged slabcaches. 
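check_bytes_and_report() above (and slab_pad_check() just below) both come down to finding the first byte in a range that does not hold the expected poison value, which the kernel does with memchr_inv(). A user-space equivalent is short to write; the helper name here is made up, and 0x5a is the kernel's POISON_INUSE pattern.

#include <stdio.h>
#include <string.h>

/* Return a pointer to the first byte != value, or NULL if all match. */
static const unsigned char *first_mismatch(const void *start, int value,
					   size_t bytes)
{
	const unsigned char *p = start;

	for (size_t i = 0; i < bytes; i++)
		if (p[i] != (unsigned char)value)
			return p + i;
	return NULL;
}

int main(void)
{
	unsigned char pad[16];

	memset(pad, 0x5a, sizeof(pad));	/* 0x5a: POISON_INUSE */
	pad[9] = 0x00;			/* simulate an overwrite */

	const unsigned char *fault = first_mismatch(pad, 0x5a, sizeof(pad));
	if (fault)
		printf("padding overwritten at offset %td: 0x%02x\n",
		       fault - pad, *fault);
	return 0;
}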
*/ -static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) +static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned long off = get_info_end(s); /* The end of info */ @@ -1011,12 +1016,12 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) if (size_from_object(s) == off) return 1; - return check_bytes_and_report(s, page, p, "Object padding", + return check_bytes_and_report(s, slab, p, "Object padding", p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ -static int slab_pad_check(struct kmem_cache *s, struct page *page) +static int slab_pad_check(struct kmem_cache *s, struct slab *slab) { u8 *start; u8 *fault; @@ -1028,8 +1033,8 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!(s->flags & SLAB_POISON)) return 1; - start = page_address(page); - length = page_size(page); + start = slab_address(slab); + length = slab_size(slab); end = start + length; remainder = length % s->size; if (!remainder) @@ -1044,7 +1049,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu", + slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu", fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); @@ -1052,23 +1057,23 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 0; } -static int check_object(struct kmem_cache *s, struct page *page, +static int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { u8 *p = object; u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { - if (!check_bytes_and_report(s, page, object, "Left Redzone", + if (!check_bytes_and_report(s, slab, object, "Left Redzone", object - s->red_left_pad, val, s->red_left_pad)) return 0; - if (!check_bytes_and_report(s, page, object, "Right Redzone", + if (!check_bytes_and_report(s, slab, object, "Right Redzone", endobject, val, s->inuse - s->object_size)) return 0; } else { if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { - check_bytes_and_report(s, page, p, "Alignment padding", + check_bytes_and_report(s, slab, p, "Alignment padding", endobject, POISON_INUSE, s->inuse - s->object_size); } @@ -1076,15 +1081,15 @@ static int check_object(struct kmem_cache *s, struct page *page, if (s->flags & SLAB_POISON) { if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && - (!check_bytes_and_report(s, page, p, "Poison", p, + (!check_bytes_and_report(s, slab, p, "Poison", p, POISON_FREE, s->object_size - 1) || - !check_bytes_and_report(s, page, p, "End Poison", + !check_bytes_and_report(s, slab, p, "End Poison", p + s->object_size - 1, POISON_END, 1))) return 0; /* * check_pad_bytes cleans up on its own. */ - check_pad_bytes(s, page, p); + check_pad_bytes(s, slab, p); } if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE) @@ -1095,8 +1100,8 @@ static int check_object(struct kmem_cache *s, struct page *page, return 1; /* Check free pointer validity */ - if (!check_valid_pointer(s, page, get_freepointer(s, p))) { - object_err(s, page, p, "Freepointer corrupt"); + if (!check_valid_pointer(s, slab, get_freepointer(s, p))) { + object_err(s, slab, p, "Freepointer corrupt"); /* * No choice but to zap it and thus lose the remainder * of the free objects in this slab. 
May cause @@ -1108,55 +1113,55 @@ static int check_object(struct kmem_cache *s, struct page *page, return 1; } -static int check_slab(struct kmem_cache *s, struct page *page) +static int check_slab(struct kmem_cache *s, struct slab *slab) { int maxobj; - if (!PageSlab(page)) { - slab_err(s, page, "Not a valid slab page"); + if (!folio_test_slab(slab_folio(slab))) { + slab_err(s, slab, "Not a valid slab page"); return 0; } - maxobj = order_objects(compound_order(page), s->size); - if (page->objects > maxobj) { - slab_err(s, page, "objects %u > max %u", - page->objects, maxobj); + maxobj = order_objects(slab_order(slab), s->size); + if (slab->objects > maxobj) { + slab_err(s, slab, "objects %u > max %u", + slab->objects, maxobj); return 0; } - if (page->inuse > page->objects) { - slab_err(s, page, "inuse %u > max %u", - page->inuse, page->objects); + if (slab->inuse > slab->objects) { + slab_err(s, slab, "inuse %u > max %u", + slab->inuse, slab->objects); return 0; } /* Slab_pad_check fixes things up after itself */ - slab_pad_check(s, page); + slab_pad_check(s, slab); return 1; } /* - * Determine if a certain object on a page is on the freelist. Must hold the + * Determine if a certain object in a slab is on the freelist. Must hold the * slab lock to guarantee that the chains are in a consistent state. */ -static int on_freelist(struct kmem_cache *s, struct page *page, void *search) +static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) { int nr = 0; void *fp; void *object = NULL; int max_objects; - fp = page->freelist; - while (fp && nr <= page->objects) { + fp = slab->freelist; + while (fp && nr <= slab->objects) { if (fp == search) return 1; - if (!check_valid_pointer(s, page, fp)) { + if (!check_valid_pointer(s, slab, fp)) { if (object) { - object_err(s, page, object, + object_err(s, slab, object, "Freechain corrupt"); set_freepointer(s, object, NULL); } else { - slab_err(s, page, "Freepointer corrupt"); - page->freelist = NULL; - page->inuse = page->objects; + slab_err(s, slab, "Freepointer corrupt"); + slab->freelist = NULL; + slab->inuse = slab->objects; slab_fix(s, "Freelist cleared"); return 0; } @@ -1167,34 +1172,34 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = order_objects(compound_order(page), s->size); + max_objects = order_objects(slab_order(slab), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; - if (page->objects != max_objects) { - slab_err(s, page, "Wrong number of objects. Found %d but should be %d", - page->objects, max_objects); - page->objects = max_objects; + if (slab->objects != max_objects) { + slab_err(s, slab, "Wrong number of objects. Found %d but should be %d", + slab->objects, max_objects); + slab->objects = max_objects; slab_fix(s, "Number of objects adjusted"); } - if (page->inuse != page->objects - nr) { - slab_err(s, page, "Wrong object count. Counter is %d but counted were %d", - page->inuse, page->objects - nr); - page->inuse = page->objects - nr; + if (slab->inuse != slab->objects - nr) { + slab_err(s, slab, "Wrong object count. 
Counter is %d but counted were %d", + slab->inuse, slab->objects - nr); + slab->inuse = slab->objects - nr; slab_fix(s, "Object count adjusted"); } return search == NULL; } -static void trace(struct kmem_cache *s, struct page *page, void *object, +static void trace(struct kmem_cache *s, struct slab *slab, void *object, int alloc) { if (s->flags & SLAB_TRACE) { pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", s->name, alloc ? "alloc" : "free", - object, page->inuse, - page->freelist); + object, slab->inuse, + slab->freelist); if (!alloc) print_section(KERN_INFO, "Object ", (void *)object, @@ -1208,22 +1213,22 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, * Tracking of fully allocated slabs for debugging purposes. */ static void add_full(struct kmem_cache *s, - struct kmem_cache_node *n, struct page *page) + struct kmem_cache_node *n, struct slab *slab) { if (!(s->flags & SLAB_STORE_USER)) return; lockdep_assert_held(&n->list_lock); - list_add(&page->slab_list, &n->full); + list_add(&slab->slab_list, &n->full); } -static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) +static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) { if (!(s->flags & SLAB_STORE_USER)) return; lockdep_assert_held(&n->list_lock); - list_del(&page->slab_list); + list_del(&slab->slab_list); } /* Tracking of the number of slabs for debugging purposes */ @@ -1263,7 +1268,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) } /* Object debug checks for alloc/free paths */ -static void setup_object_debug(struct kmem_cache *s, struct page *page, +static void setup_object_debug(struct kmem_cache *s, struct slab *slab, void *object) { if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)) @@ -1274,89 +1279,89 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, } static -void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) +void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) { if (!kmem_cache_debug_flags(s, SLAB_POISON)) return; metadata_access_enable(); - memset(kasan_reset_tag(addr), POISON_INUSE, page_size(page)); + memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab)); metadata_access_disable(); } static inline int alloc_consistency_checks(struct kmem_cache *s, - struct page *page, void *object) + struct slab *slab, void *object) { - if (!check_slab(s, page)) + if (!check_slab(s, slab)) return 0; - if (!check_valid_pointer(s, page, object)) { - object_err(s, page, object, "Freelist Pointer check fails"); + if (!check_valid_pointer(s, slab, object)) { + object_err(s, slab, object, "Freelist Pointer check fails"); return 0; } - if (!check_object(s, page, object, SLUB_RED_INACTIVE)) + if (!check_object(s, slab, object, SLUB_RED_INACTIVE)) return 0; return 1; } static noinline int alloc_debug_processing(struct kmem_cache *s, - struct page *page, + struct slab *slab, void *object, unsigned long addr) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!alloc_consistency_checks(s, page, object)) + if (!alloc_consistency_checks(s, slab, object)) goto bad; } /* Success perform special debug activities for allocs */ if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_ALLOC, addr); - trace(s, page, object, 1); + trace(s, slab, object, 1); init_object(s, object, SLUB_RED_ACTIVE); return 1; bad: - if (PageSlab(page)) { + if (folio_test_slab(slab_folio(slab))) { /* * If this is a slab page then lets do 
the best we can * to avoid issues in the future. Marking all objects * as used avoids touching the remaining objects. */ slab_fix(s, "Marking all objects used"); - page->inuse = page->objects; - page->freelist = NULL; + slab->inuse = slab->objects; + slab->freelist = NULL; } return 0; } static inline int free_consistency_checks(struct kmem_cache *s, - struct page *page, void *object, unsigned long addr) + struct slab *slab, void *object, unsigned long addr) { - if (!check_valid_pointer(s, page, object)) { - slab_err(s, page, "Invalid object pointer 0x%p", object); + if (!check_valid_pointer(s, slab, object)) { + slab_err(s, slab, "Invalid object pointer 0x%p", object); return 0; } - if (on_freelist(s, page, object)) { - object_err(s, page, object, "Object already free"); + if (on_freelist(s, slab, object)) { + object_err(s, slab, object, "Object already free"); return 0; } - if (!check_object(s, page, object, SLUB_RED_ACTIVE)) + if (!check_object(s, slab, object, SLUB_RED_ACTIVE)) return 0; - if (unlikely(s != page->slab_cache)) { - if (!PageSlab(page)) { - slab_err(s, page, "Attempt to free object(0x%p) outside of slab", + if (unlikely(s != slab->slab_cache)) { + if (!folio_test_slab(slab_folio(slab))) { + slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", object); - } else if (!page->slab_cache) { + } else if (!slab->slab_cache) { pr_err("SLUB <none>: no slab for object 0x%p.\n", object); dump_stack(); } else - object_err(s, page, object, + object_err(s, slab, object, "page slab pointer corrupt."); return 0; } @@ -1365,21 +1370,21 @@ static inline int free_consistency_checks(struct kmem_cache *s, /* Supports checking bulk free of a constructed freelist */ static noinline int free_debug_processing( - struct kmem_cache *s, struct page *page, + struct kmem_cache *s, struct slab *slab, void *head, void *tail, int bulk_cnt, unsigned long addr) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); void *object = head; int cnt = 0; unsigned long flags, flags2; int ret = 0; spin_lock_irqsave(&n->list_lock, flags); - slab_lock(page, &flags2); + slab_lock(slab, &flags2); if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!check_slab(s, page)) + if (!check_slab(s, slab)) goto out; } @@ -1387,13 +1392,13 @@ next_object: cnt++; if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!free_consistency_checks(s, page, object, addr)) + if (!free_consistency_checks(s, slab, object, addr)) goto out; } if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); - trace(s, page, object, 0); + trace(s, slab, object, 0); /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ init_object(s, object, SLUB_RED_INACTIVE); @@ -1406,10 +1411,10 @@ next_object: out: if (cnt != bulk_cnt) - slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", + slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", bulk_cnt, cnt); - slab_unlock(page, &flags2); + slab_unlock(slab, &flags2); spin_unlock_irqrestore(&n->list_lock, flags); if (!ret) slab_fix(s, "Object at 0x%p not freed", object); @@ -1624,26 +1629,26 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, } #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, - struct page *page, void *object) {} + struct slab *slab, void *object) {} static inline -void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {} +void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} static inline int 
alloc_debug_processing(struct kmem_cache *s, - struct page *page, void *object, unsigned long addr) { return 0; } + struct slab *slab, void *object, unsigned long addr) { return 0; } static inline int free_debug_processing( - struct kmem_cache *s, struct page *page, + struct kmem_cache *s, struct slab *slab, void *head, void *tail, int bulk_cnt, unsigned long addr) { return 0; } -static inline int slab_pad_check(struct kmem_cache *s, struct page *page) +static inline int slab_pad_check(struct kmem_cache *s, struct slab *slab) { return 1; } -static inline int check_object(struct kmem_cache *s, struct page *page, +static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, - struct page *page) {} + struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, - struct page *page) {} + struct slab *slab) {} slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name) { @@ -1662,7 +1667,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -static bool freelist_corrupted(struct kmem_cache *s, struct page *page, +static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) { return false; @@ -1767,10 +1772,10 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, return *head != NULL; } -static void *setup_object(struct kmem_cache *s, struct page *page, +static void *setup_object(struct kmem_cache *s, struct slab *slab, void *object) { - setup_object_debug(s, page, object); + setup_object_debug(s, slab, object); object = kasan_init_slab_obj(s, object); if (unlikely(s->ctor)) { kasan_unpoison_object_data(s, object); @@ -1783,18 +1788,27 @@ static void *setup_object(struct kmem_cache *s, struct page *page, /* * Slab allocation and freeing */ -static inline struct page *alloc_slab_page(struct kmem_cache *s, +static inline struct slab *alloc_slab_page(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_order_objects oo) { - struct page *page; + struct folio *folio; + struct slab *slab; unsigned int order = oo_order(oo); if (node == NUMA_NO_NODE) - page = alloc_pages(flags, order); + folio = (struct folio *)alloc_pages(flags, order); else - page = __alloc_pages_node(node, flags, order); + folio = (struct folio *)__alloc_pages_node(node, flags, order); + + if (!folio) + return NULL; + + slab = folio_slab(folio); + __folio_set_slab(folio); + if (page_is_pfmemalloc(folio_page(folio, 0))) + slab_set_pfmemalloc(slab); - return page; + return slab; } #ifdef CONFIG_SLAB_FREELIST_RANDOM @@ -1839,7 +1853,7 @@ static void __init init_freelist_randomization(void) } /* Get the next entry on the pre-computed freelist randomized */ -static void *next_freelist_entry(struct kmem_cache *s, struct page *page, +static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab, unsigned long *pos, void *start, unsigned long page_limit, unsigned long freelist_count) @@ -1861,32 +1875,32 @@ static void *next_freelist_entry(struct kmem_cache *s, struct page *page, } /* Shuffle the single linked freelist based on a random pre-computed sequence */ -static bool shuffle_freelist(struct kmem_cache *s, struct page *page) +static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) { void *start; void *cur; void *next; unsigned long idx, pos, page_limit, 
freelist_count; - if (page->objects < 2 || !s->random_seq) + if (slab->objects < 2 || !s->random_seq) return false; freelist_count = oo_objects(s->oo); pos = get_random_int() % freelist_count; - page_limit = page->objects * s->size; - start = fixup_red_left(s, page_address(page)); + page_limit = slab->objects * s->size; + start = fixup_red_left(s, slab_address(slab)); /* First entry is used as the base of the freelist */ - cur = next_freelist_entry(s, page, &pos, start, page_limit, + cur = next_freelist_entry(s, slab, &pos, start, page_limit, freelist_count); - cur = setup_object(s, page, cur); - page->freelist = cur; + cur = setup_object(s, slab, cur); + slab->freelist = cur; - for (idx = 1; idx < page->objects; idx++) { - next = next_freelist_entry(s, page, &pos, start, page_limit, + for (idx = 1; idx < slab->objects; idx++) { + next = next_freelist_entry(s, slab, &pos, start, page_limit, freelist_count); - next = setup_object(s, page, next); + next = setup_object(s, slab, next); set_freepointer(s, cur, next); cur = next; } @@ -1900,15 +1914,15 @@ static inline int init_cache_random_seq(struct kmem_cache *s) return 0; } static inline void init_freelist_randomization(void) { } -static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page) +static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) { return false; } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ -static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { - struct page *page; + struct slab *slab; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; void *start, *p, *next; @@ -1927,63 +1941,60 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); - page = alloc_slab_page(s, alloc_gfp, node, oo); - if (unlikely(!page)) { + slab = alloc_slab_page(s, alloc_gfp, node, oo); + if (unlikely(!slab)) { oo = s->min; alloc_gfp = flags; /* * Allocation may have failed due to fragmentation. 
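The shuffle_freelist()/next_freelist_entry() conversion a little earlier in this hunk builds the slab's initial freelist by walking a pre-computed random sequence of object slots instead of laying the objects out in address order. The user-space sketch below illustrates only that idea, with a plain Fisher-Yates permutation and a free pointer stored at offset 0 of each object; the sizes, helpers and layout are made up for the example and do not mirror the kernel's random_seq handling.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define OBJ_SIZE   64           /* object size, free pointer included */
#define NR_OBJECTS 8

/* In this sketch the free pointer lives at offset 0 of a free object. */
static void set_freepointer(void *obj, void *next)
{
        memcpy(obj, &next, sizeof(next));
}

/* Pre-compute a random permutation of object indices (Fisher-Yates). */
static void init_random_seq(unsigned int *seq, unsigned int n)
{
        for (unsigned int i = 0; i < n; i++)
                seq[i] = i;
        for (unsigned int i = n - 1; i > 0; i--) {
                unsigned int j = (unsigned int)rand() % (i + 1);
                unsigned int tmp = seq[i];

                seq[i] = seq[j];
                seq[j] = tmp;
        }
}

int main(void)
{
        static char slab[OBJ_SIZE * NR_OBJECTS]; /* stands in for the slab memory */
        unsigned int seq[NR_OBJECTS];
        void *freelist, *cur;

        init_random_seq(seq, NR_OBJECTS);

        /* The first entry of the sequence becomes the head of the freelist. */
        freelist = cur = slab + (size_t)seq[0] * OBJ_SIZE;
        for (unsigned int i = 1; i < NR_OBJECTS; i++) {
                void *next = slab + (size_t)seq[i] * OBJ_SIZE;

                set_freepointer(cur, next);
                cur = next;
        }
        set_freepointer(cur, NULL);             /* terminate the chain */

        for (void *p = freelist; p; memcpy(&p, p, sizeof(p)))
                printf("object at offset %td\n", (char *)p - slab);
        return 0;
}

Walking the resulting chain visits the slots in shuffled order, which is what makes heap layouts harder to predict.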
* Try a lower order alloc if possible */ - page = alloc_slab_page(s, alloc_gfp, node, oo); - if (unlikely(!page)) + slab = alloc_slab_page(s, alloc_gfp, node, oo); + if (unlikely(!slab)) goto out; stat(s, ORDER_FALLBACK); } - page->objects = oo_objects(oo); + slab->objects = oo_objects(oo); - account_slab_page(page, oo_order(oo), s, flags); + account_slab(slab, oo_order(oo), s, flags); - page->slab_cache = s; - __SetPageSlab(page); - if (page_is_pfmemalloc(page)) - SetPageSlabPfmemalloc(page); + slab->slab_cache = s; - kasan_poison_slab(page); + kasan_poison_slab(slab); - start = page_address(page); + start = slab_address(slab); - setup_page_debug(s, page, start); + setup_slab_debug(s, slab, start); - shuffle = shuffle_freelist(s, page); + shuffle = shuffle_freelist(s, slab); if (!shuffle) { start = fixup_red_left(s, start); - start = setup_object(s, page, start); - page->freelist = start; - for (idx = 0, p = start; idx < page->objects - 1; idx++) { + start = setup_object(s, slab, start); + slab->freelist = start; + for (idx = 0, p = start; idx < slab->objects - 1; idx++) { next = p + s->size; - next = setup_object(s, page, next); + next = setup_object(s, slab, next); set_freepointer(s, p, next); p = next; } set_freepointer(s, p, NULL); } - page->inuse = page->objects; - page->frozen = 1; + slab->inuse = slab->objects; + slab->frozen = 1; out: - if (!page) + if (!slab) return NULL; - inc_slabs_node(s, page_to_nid(page), page->objects); + inc_slabs_node(s, slab_nid(slab), slab->objects); - return page; + return slab; } -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) { if (unlikely(flags & GFP_SLAB_BUG_MASK)) flags = kmalloc_fix_flags(flags); @@ -1994,76 +2005,75 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); } -static void __free_slab(struct kmem_cache *s, struct page *page) +static void __free_slab(struct kmem_cache *s, struct slab *slab) { - int order = compound_order(page); + struct folio *folio = slab_folio(slab); + int order = folio_order(folio); int pages = 1 << order; if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { void *p; - slab_pad_check(s, page); - for_each_object(p, s, page_address(page), - page->objects) - check_object(s, page, p, SLUB_RED_INACTIVE); + slab_pad_check(s, slab); + for_each_object(p, s, slab_address(slab), slab->objects) + check_object(s, slab, p, SLUB_RED_INACTIVE); } - __ClearPageSlabPfmemalloc(page); - __ClearPageSlab(page); - /* In union with page->mapping where page allocator expects NULL */ - page->slab_cache = NULL; + __slab_clear_pfmemalloc(slab); + __folio_clear_slab(folio); + folio->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - unaccount_slab_page(page, order, s); - __free_pages(page, order); + unaccount_slab(slab, order, s); + __free_pages(folio_page(folio, 0), order); } static void rcu_free_slab(struct rcu_head *h) { - struct page *page = container_of(h, struct page, rcu_head); + struct slab *slab = container_of(h, struct slab, rcu_head); - __free_slab(page->slab_cache, page); + __free_slab(slab->slab_cache, slab); } -static void free_slab(struct kmem_cache *s, struct page *page) +static void free_slab(struct kmem_cache *s, struct slab *slab) { if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { - call_rcu(&page->rcu_head, rcu_free_slab); + call_rcu(&slab->rcu_head, rcu_free_slab); } else - __free_slab(s, page); 
+ __free_slab(s, slab); } -static void discard_slab(struct kmem_cache *s, struct page *page) +static void discard_slab(struct kmem_cache *s, struct slab *slab) { - dec_slabs_node(s, page_to_nid(page), page->objects); - free_slab(s, page); + dec_slabs_node(s, slab_nid(slab), slab->objects); + free_slab(s, slab); } /* * Management of partially allocated slabs. */ static inline void -__add_partial(struct kmem_cache_node *n, struct page *page, int tail) +__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) { n->nr_partial++; if (tail == DEACTIVATE_TO_TAIL) - list_add_tail(&page->slab_list, &n->partial); + list_add_tail(&slab->slab_list, &n->partial); else - list_add(&page->slab_list, &n->partial); + list_add(&slab->slab_list, &n->partial); } static inline void add_partial(struct kmem_cache_node *n, - struct page *page, int tail) + struct slab *slab, int tail) { lockdep_assert_held(&n->list_lock); - __add_partial(n, page, tail); + __add_partial(n, slab, tail); } static inline void remove_partial(struct kmem_cache_node *n, - struct page *page) + struct slab *slab) { lockdep_assert_held(&n->list_lock); - list_del(&page->slab_list); + list_del(&slab->slab_list); n->nr_partial--; } @@ -2074,12 +2084,12 @@ static inline void remove_partial(struct kmem_cache_node *n, * Returns a list of objects or NULL if it fails. */ static inline void *acquire_slab(struct kmem_cache *s, - struct kmem_cache_node *n, struct page *page, + struct kmem_cache_node *n, struct slab *slab, int mode) { void *freelist; unsigned long counters; - struct page new; + struct slab new; lockdep_assert_held(&n->list_lock); @@ -2088,11 +2098,11 @@ static inline void *acquire_slab(struct kmem_cache *s, * The old freelist is the list of objects for the * per cpu allocation list. */ - freelist = page->freelist; - counters = page->counters; + freelist = slab->freelist; + counters = slab->counters; new.counters = counters; if (mode) { - new.inuse = page->objects; + new.inuse = slab->objects; new.freelist = NULL; } else { new.freelist = freelist; @@ -2101,35 +2111,35 @@ static inline void *acquire_slab(struct kmem_cache *s, VM_BUG_ON(new.frozen); new.frozen = 1; - if (!__cmpxchg_double_slab(s, page, + if (!__cmpxchg_double_slab(s, slab, freelist, counters, new.freelist, new.counters, "acquire_slab")) return NULL; - remove_partial(n, page); + remove_partial(n, slab); WARN_ON(!freelist); return freelist; } #ifdef CONFIG_SLUB_CPU_PARTIAL -static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); #else -static inline void put_cpu_partial(struct kmem_cache *s, struct page *page, +static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) { } #endif -static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); +static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); /* * Try to allocate a partial slab from a specific node. */ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct page **ret_page, gfp_t gfpflags) + struct slab **ret_slab, gfp_t gfpflags) { - struct page *page, *page2; + struct slab *slab, *slab2; void *object = NULL; unsigned long flags; - unsigned int partial_pages = 0; + unsigned int partial_slabs = 0; /* * Racy check. 
If we mistakenly see no partial slabs then we @@ -2141,28 +2151,28 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, return NULL; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(page, page2, &n->partial, slab_list) { + list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { void *t; - if (!pfmemalloc_match(page, gfpflags)) + if (!pfmemalloc_match(slab, gfpflags)) continue; - t = acquire_slab(s, n, page, object == NULL); + t = acquire_slab(s, n, slab, object == NULL); if (!t) break; if (!object) { - *ret_page = page; + *ret_slab = slab; stat(s, ALLOC_FROM_PARTIAL); object = t; } else { - put_cpu_partial(s, page, 0); + put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); - partial_pages++; + partial_slabs++; } #ifdef CONFIG_SLUB_CPU_PARTIAL if (!kmem_cache_has_cpu_partial(s) - || partial_pages > s->cpu_partial_pages / 2) + || partial_slabs > s->cpu_partial_slabs / 2) break; #else break; @@ -2174,10 +2184,10 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, } /* - * Get a page from somewhere. Search in increasing NUMA distances. + * Get a slab from somewhere. Search in increasing NUMA distances. */ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, - struct page **ret_page) + struct slab **ret_slab) { #ifdef CONFIG_NUMA struct zonelist *zonelist; @@ -2219,7 +2229,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, if (n && cpuset_zone_allowed(zone, flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, ret_page, flags); + object = get_partial_node(s, n, ret_slab, flags); if (object) { /* * Don't check read_mems_allowed_retry() @@ -2238,10 +2248,10 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, } /* - * Get a partial page, lock it and return it. + * Get a partial slab, lock it and return it. */ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, - struct page **ret_page) + struct slab **ret_slab) { void *object; int searchnode = node; @@ -2249,11 +2259,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - object = get_partial_node(s, get_node(s, searchnode), ret_page, flags); + object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags); if (object || node != NUMA_NO_NODE) return object; - return get_any_partial(s, flags, ret_page); + return get_any_partial(s, flags, ret_slab); } #ifdef CONFIG_PREEMPTION @@ -2330,25 +2340,25 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) } /* - * Finishes removing the cpu slab. Merges cpu's freelist with page's freelist, + * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist, * unfreezes the slabs and puts it on the proper list. * Assumes the slab has been already safely taken away from kmem_cache_cpu * by the caller. 
*/ -static void deactivate_slab(struct kmem_cache *s, struct page *page, +static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); int lock = 0, free_delta = 0; enum slab_modes l = M_NONE, m = M_NONE; void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; unsigned long flags = 0; - struct page new; - struct page old; + struct slab new; + struct slab old; - if (page->freelist) { + if (slab->freelist) { stat(s, DEACTIVATE_REMOTE_FREES); tail = DEACTIVATE_TO_TAIL; } @@ -2367,7 +2377,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, * 'freelist_iter' is already corrupted. So isolate all objects * starting at 'freelist_iter' by skipping them. */ - if (freelist_corrupted(s, page, &freelist_iter, nextfree)) + if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) break; freelist_tail = freelist_iter; @@ -2377,25 +2387,25 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, } /* - * Stage two: Unfreeze the page while splicing the per-cpu - * freelist to the head of page's freelist. + * Stage two: Unfreeze the slab while splicing the per-cpu + * freelist to the head of slab's freelist. * - * Ensure that the page is unfrozen while the list presence + * Ensure that the slab is unfrozen while the list presence * reflects the actual number of objects during unfreeze. * * We setup the list membership and then perform a cmpxchg - * with the count. If there is a mismatch then the page - * is not unfrozen but the page is on the wrong list. + * with the count. If there is a mismatch then the slab + * is not unfrozen but the slab is on the wrong list. * * Then we restart the process which may have to remove - * the page from the list that we just put it on again + * the slab from the list that we just put it on again * because the number of objects in the slab may have * changed. 
*/ redo: - old.freelist = READ_ONCE(page->freelist); - old.counters = READ_ONCE(page->counters); + old.freelist = READ_ONCE(slab->freelist); + old.counters = READ_ONCE(slab->counters); VM_BUG_ON(!old.frozen); /* Determine target state of the slab */ @@ -2416,9 +2426,8 @@ redo: if (!lock) { lock = 1; /* - * Taking the spinlock removes the possibility - * that acquire_slab() will see a slab page that - * is frozen + * Taking the spinlock removes the possibility that + * acquire_slab() will see a slab that is frozen */ spin_lock_irqsave(&n->list_lock, flags); } @@ -2437,18 +2446,18 @@ redo: if (l != m) { if (l == M_PARTIAL) - remove_partial(n, page); + remove_partial(n, slab); else if (l == M_FULL) - remove_full(s, n, page); + remove_full(s, n, slab); if (m == M_PARTIAL) - add_partial(n, page, tail); + add_partial(n, slab, tail); else if (m == M_FULL) - add_full(s, n, page); + add_full(s, n, slab); } l = m; - if (!cmpxchg_double_slab(s, page, + if (!cmpxchg_double_slab(s, slab, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")) @@ -2463,26 +2472,26 @@ redo: stat(s, DEACTIVATE_FULL); else if (m == M_FREE) { stat(s, DEACTIVATE_EMPTY); - discard_slab(s, page); + discard_slab(s, slab); stat(s, FREE_SLAB); } } #ifdef CONFIG_SLUB_CPU_PARTIAL -static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) +static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) { struct kmem_cache_node *n = NULL, *n2 = NULL; - struct page *page, *discard_page = NULL; + struct slab *slab, *slab_to_discard = NULL; unsigned long flags = 0; - while (partial_page) { - struct page new; - struct page old; + while (partial_slab) { + struct slab new; + struct slab old; - page = partial_page; - partial_page = page->next; + slab = partial_slab; + partial_slab = slab->next; - n2 = get_node(s, page_to_nid(page)); + n2 = get_node(s, slab_nid(slab)); if (n != n2) { if (n) spin_unlock_irqrestore(&n->list_lock, flags); @@ -2493,8 +2502,8 @@ static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) do { - old.freelist = page->freelist; - old.counters = page->counters; + old.freelist = slab->freelist; + old.counters = slab->counters; VM_BUG_ON(!old.frozen); new.counters = old.counters; @@ -2502,16 +2511,16 @@ static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) new.frozen = 0; - } while (!__cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, slab, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")); if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { - page->next = discard_page; - discard_page = page; + slab->next = slab_to_discard; + slab_to_discard = slab; } else { - add_partial(n, page, DEACTIVATE_TO_TAIL); + add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } } @@ -2519,12 +2528,12 @@ static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) if (n) spin_unlock_irqrestore(&n->list_lock, flags); - while (discard_page) { - page = discard_page; - discard_page = discard_page->next; + while (slab_to_discard) { + slab = slab_to_discard; + slab_to_discard = slab_to_discard->next; stat(s, DEACTIVATE_EMPTY); - discard_slab(s, page); + discard_slab(s, slab); stat(s, FREE_SLAB); } } @@ -2534,73 +2543,73 @@ static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) */ static void unfreeze_partials(struct kmem_cache *s) { - struct page *partial_page; + struct slab *partial_slab; unsigned long flags; 
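unfreeze_partials() here detaches the whole per-CPU partial list while the local lock is held and only unfreezes the slabs after dropping it, keeping the critical section short. A stripped-down user-space sketch of that detach-then-process pattern follows; the pslab type and the pthread mutex are stand-ins chosen for the example, not the kernel structures.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct pslab {                          /* stand-in for a frozen partial slab */
        struct pslab *next;
        int id;
};

static pthread_mutex_t cpu_lock = PTHREAD_MUTEX_INITIALIZER;
static struct pslab *cpu_partial;       /* stand-in for c->partial */

static void unfreeze_one(struct pslab *s)
{
        printf("unfreezing slab %d outside the lock\n", s->id);
        free(s);
}

/* Detach the whole list under the lock, process it after unlocking. */
static void unfreeze_partials(void)
{
        struct pslab *list;

        pthread_mutex_lock(&cpu_lock);
        list = cpu_partial;
        cpu_partial = NULL;
        pthread_mutex_unlock(&cpu_lock);

        while (list) {
                struct pslab *s = list;

                list = s->next;
                unfreeze_one(s);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++) {   /* populate a fake partial list */
                struct pslab *s = malloc(sizeof(*s));

                s->id = i;
                pthread_mutex_lock(&cpu_lock);
                s->next = cpu_partial;
                cpu_partial = s;
                pthread_mutex_unlock(&cpu_lock);
        }
        unfreeze_partials();
        return 0;
}

The same shape shows up again in put_cpu_partial() below, which also postpones __unfreeze_partials() until after the local lock has been released.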
local_lock_irqsave(&s->cpu_slab->lock, flags); - partial_page = this_cpu_read(s->cpu_slab->partial); + partial_slab = this_cpu_read(s->cpu_slab->partial); this_cpu_write(s->cpu_slab->partial, NULL); local_unlock_irqrestore(&s->cpu_slab->lock, flags); - if (partial_page) - __unfreeze_partials(s, partial_page); + if (partial_slab) + __unfreeze_partials(s, partial_slab); } static void unfreeze_partials_cpu(struct kmem_cache *s, struct kmem_cache_cpu *c) { - struct page *partial_page; + struct slab *partial_slab; - partial_page = slub_percpu_partial(c); + partial_slab = slub_percpu_partial(c); c->partial = NULL; - if (partial_page) - __unfreeze_partials(s, partial_page); + if (partial_slab) + __unfreeze_partials(s, partial_slab); } /* - * Put a page that was just frozen (in __slab_free|get_partial_node) into a - * partial page slot if available. + * Put a slab that was just frozen (in __slab_free|get_partial_node) into a + * partial slab slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. */ -static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) { - struct page *oldpage; - struct page *page_to_unfreeze = NULL; + struct slab *oldslab; + struct slab *slab_to_unfreeze = NULL; unsigned long flags; - int pages = 0; + int slabs = 0; local_lock_irqsave(&s->cpu_slab->lock, flags); - oldpage = this_cpu_read(s->cpu_slab->partial); + oldslab = this_cpu_read(s->cpu_slab->partial); - if (oldpage) { - if (drain && oldpage->pages >= s->cpu_partial_pages) { + if (oldslab) { + if (drain && oldslab->slabs >= s->cpu_partial_slabs) { /* * Partial array is full. Move the existing set to the * per node partial list. Postpone the actual unfreezing * outside of the critical section. 
*/ - page_to_unfreeze = oldpage; - oldpage = NULL; + slab_to_unfreeze = oldslab; + oldslab = NULL; } else { - pages = oldpage->pages; + slabs = oldslab->slabs; } } - pages++; + slabs++; - page->pages = pages; - page->next = oldpage; + slab->slabs = slabs; + slab->next = oldslab; - this_cpu_write(s->cpu_slab->partial, page); + this_cpu_write(s->cpu_slab->partial, slab); local_unlock_irqrestore(&s->cpu_slab->lock, flags); - if (page_to_unfreeze) { - __unfreeze_partials(s, page_to_unfreeze); + if (slab_to_unfreeze) { + __unfreeze_partials(s, slab_to_unfreeze); stat(s, CPU_PARTIAL_DRAIN); } } @@ -2616,22 +2625,22 @@ static inline void unfreeze_partials_cpu(struct kmem_cache *s, static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { unsigned long flags; - struct page *page; + struct slab *slab; void *freelist; local_lock_irqsave(&s->cpu_slab->lock, flags); - page = c->page; + slab = c->slab; freelist = c->freelist; - c->page = NULL; + c->slab = NULL; c->freelist = NULL; c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); - if (page) { - deactivate_slab(s, page, freelist); + if (slab) { + deactivate_slab(s, slab, freelist); stat(s, CPUSLAB_FLUSH); } } @@ -2640,14 +2649,14 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); void *freelist = c->freelist; - struct page *page = c->page; + struct slab *slab = c->slab; - c->page = NULL; + c->slab = NULL; c->freelist = NULL; c->tid = next_tid(c->tid); - if (page) { - deactivate_slab(s, page, freelist); + if (slab) { + deactivate_slab(s, slab, freelist); stat(s, CPUSLAB_FLUSH); } @@ -2676,7 +2685,7 @@ static void flush_cpu_slab(struct work_struct *w) s = sfw->s; c = this_cpu_ptr(s->cpu_slab); - if (c->page) + if (c->slab) flush_slab(s, c); unfreeze_partials(s); @@ -2686,7 +2695,7 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - return c->page || slub_percpu_partial(c); + return c->slab || slub_percpu_partial(c); } static DEFINE_MUTEX(flush_lock); @@ -2748,19 +2757,19 @@ static int slub_cpu_dead(unsigned int cpu) * Check if the objects in a per cpu structure fit numa * locality expectations. 
*/ -static inline int node_match(struct page *page, int node) +static inline int node_match(struct slab *slab, int node) { #ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && page_to_nid(page) != node) + if (node != NUMA_NO_NODE && slab_nid(slab) != node) return 0; #endif return 1; } #ifdef CONFIG_SLUB_DEBUG -static int count_free(struct page *page) +static int count_free(struct slab *slab) { - return page->objects - page->inuse; + return slab->objects - slab->inuse; } static inline unsigned long node_nr_objs(struct kmem_cache_node *n) @@ -2771,15 +2780,15 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, - int (*get_count)(struct page *)) + int (*get_count)(struct slab *)) { unsigned long flags; unsigned long x = 0; - struct page *page; + struct slab *slab; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, slab_list) - x += get_count(page); + list_for_each_entry(slab, &n->partial, slab_list) + x += get_count(slab); spin_unlock_irqrestore(&n->list_lock, flags); return x; } @@ -2822,54 +2831,41 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) #endif } -static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) +static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) { - if (unlikely(PageSlabPfmemalloc(page))) + if (unlikely(slab_test_pfmemalloc(slab))) return gfp_pfmemalloc_allowed(gfpflags); return true; } /* - * A variant of pfmemalloc_match() that tests page flags without asserting - * PageSlab. Intended for opportunistic checks before taking a lock and - * rechecking that nobody else freed the page under us. - */ -static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags) -{ - if (unlikely(__PageSlabPfmemalloc(page))) - return gfp_pfmemalloc_allowed(gfpflags); - - return true; -} - -/* - * Check the page->freelist of a page and either transfer the freelist to the - * per cpu freelist or deactivate the page. + * Check the slab->freelist and either transfer the freelist to the + * per cpu freelist or deactivate the slab. * - * The page is still frozen if the return value is not NULL. + * The slab is still frozen if the return value is not NULL. * - * If this function returns NULL then the page has been unfrozen. + * If this function returns NULL then the slab has been unfrozen. 
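get_freelist() just below, like acquire_slab() and __slab_free() elsewhere in the patch, follows the same lock-free shape: snapshot freelist and counters, compute the new values, and retry if cmpxchg_double finds that another CPU changed the slab in the meantime. The kernel swaps the freelist pointer and the packed counters word as one unit; the sketch below reproduces only the snapshot-and-retry loop on a single C11 atomic head, which is a deliberate simplification.

#include <stdatomic.h>
#include <stdio.h>

struct object {
        struct object *next;
};

/* Stand-in for slab->freelist; the kernel pairs this with a counters word. */
static _Atomic(struct object *) freelist;

/* Detach the whole freelist, retrying if someone changed it meanwhile. */
static struct object *take_freelist(void)
{
        struct object *old = atomic_load(&freelist);

        while (old &&
               !atomic_compare_exchange_weak(&freelist, &old, NULL))
                ;       /* 'old' was reloaded by the failed CAS; retry */
        return old;
}

int main(void)
{
        static struct object objs[3];

        /* Build a small freelist: objs[2] -> objs[1] -> objs[0] -> NULL. */
        for (int i = 0; i < 3; i++) {
                objs[i].next = atomic_load(&freelist);
                atomic_store(&freelist, &objs[i]);
        }

        for (struct object *p = take_freelist(); p; p = p->next)
                printf("took object %td\n", p - objs);
        return 0;
}

A failed compare-exchange reloads the expected value, so each retry works against a fresh snapshot.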
*/ -static inline void *get_freelist(struct kmem_cache *s, struct page *page) +static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) { - struct page new; + struct slab new; unsigned long counters; void *freelist; lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); do { - freelist = page->freelist; - counters = page->counters; + freelist = slab->freelist; + counters = slab->counters; new.counters = counters; VM_BUG_ON(!new.frozen); - new.inuse = page->objects; + new.inuse = slab->objects; new.frozen = freelist != NULL; - } while (!__cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, slab, freelist, counters, NULL, new.counters, "get_freelist")); @@ -2900,15 +2896,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void *freelist; - struct page *page; + struct slab *slab; unsigned long flags; stat(s, ALLOC_SLOWPATH); -reread_page: +reread_slab: - page = READ_ONCE(c->page); - if (!page) { + slab = READ_ONCE(c->slab); + if (!slab) { /* * if the node is not online or has no normal memory, just * ignore the node constraint @@ -2920,7 +2916,7 @@ reread_page: } redo: - if (unlikely(!node_match(page, node))) { + if (unlikely(!node_match(slab, node))) { /* * same as above but node_match() being false already * implies node != NUMA_NO_NODE @@ -2939,23 +2935,23 @@ redo: * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags))) goto deactivate_slab; - /* must check again c->page in case we got preempted and it changed */ + /* must check again c->slab in case we got preempted and it changed */ local_lock_irqsave(&s->cpu_slab->lock, flags); - if (unlikely(page != c->page)) { + if (unlikely(slab != c->slab)) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); - goto reread_page; + goto reread_slab; } freelist = c->freelist; if (freelist) goto load_freelist; - freelist = get_freelist(s, page); + freelist = get_freelist(s, slab); if (!freelist) { - c->page = NULL; + c->slab = NULL; local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; @@ -2969,10 +2965,10 @@ load_freelist: /* * freelist is pointing to the list of objects to be used. - * page is pointing to the page from which the objects are obtained. - * That page must be frozen for per cpu allocations to work. + * slab is pointing to the slab from which the objects are obtained. + * That slab must be frozen for per cpu allocations to work. 
*/ - VM_BUG_ON(!c->page->frozen); + VM_BUG_ON(!c->slab->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); @@ -2981,23 +2977,23 @@ load_freelist: deactivate_slab: local_lock_irqsave(&s->cpu_slab->lock, flags); - if (page != c->page) { + if (slab != c->slab) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); - goto reread_page; + goto reread_slab; } freelist = c->freelist; - c->page = NULL; + c->slab = NULL; c->freelist = NULL; local_unlock_irqrestore(&s->cpu_slab->lock, flags); - deactivate_slab(s, page, freelist); + deactivate_slab(s, slab, freelist); new_slab: if (slub_percpu_partial(c)) { local_lock_irqsave(&s->cpu_slab->lock, flags); - if (unlikely(c->page)) { + if (unlikely(c->slab)) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); - goto reread_page; + goto reread_slab; } if (unlikely(!slub_percpu_partial(c))) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); @@ -3005,8 +3001,8 @@ new_slab: goto new_objects; } - page = c->page = slub_percpu_partial(c); - slub_set_percpu_partial(c, page); + slab = c->slab = slub_percpu_partial(c); + slub_set_percpu_partial(c, slab); local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, CPU_PARTIAL_ALLOC); goto redo; @@ -3014,32 +3010,32 @@ new_slab: new_objects: - freelist = get_partial(s, gfpflags, node, &page); + freelist = get_partial(s, gfpflags, node, &slab); if (freelist) - goto check_new_page; + goto check_new_slab; slub_put_cpu_ptr(s->cpu_slab); - page = new_slab(s, gfpflags, node); + slab = new_slab(s, gfpflags, node); c = slub_get_cpu_ptr(s->cpu_slab); - if (unlikely(!page)) { + if (unlikely(!slab)) { slab_out_of_memory(s, gfpflags, node); return NULL; } /* - * No other reference to the page yet so we can + * No other reference to the slab yet so we can * muck around with it freely without cmpxchg */ - freelist = page->freelist; - page->freelist = NULL; + freelist = slab->freelist; + slab->freelist = NULL; stat(s, ALLOC_SLAB); -check_new_page: +check_new_slab: if (kmem_cache_debug(s)) { - if (!alloc_debug_processing(s, page, freelist, addr)) { + if (!alloc_debug_processing(s, slab, freelist, addr)) { /* Slab failed checks. Next slab needed */ goto new_slab; } else { @@ -3051,39 +3047,39 @@ check_new_page: } } - if (unlikely(!pfmemalloc_match(page, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags))) /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. 
*/ goto return_single; -retry_load_page: +retry_load_slab: local_lock_irqsave(&s->cpu_slab->lock, flags); - if (unlikely(c->page)) { + if (unlikely(c->slab)) { void *flush_freelist = c->freelist; - struct page *flush_page = c->page; + struct slab *flush_slab = c->slab; - c->page = NULL; + c->slab = NULL; c->freelist = NULL; c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); - deactivate_slab(s, flush_page, flush_freelist); + deactivate_slab(s, flush_slab, flush_freelist); stat(s, CPUSLAB_FLUSH); - goto retry_load_page; + goto retry_load_slab; } - c->page = page; + c->slab = slab; goto load_freelist; return_single: - deactivate_slab(s, page, get_freepointer(s, freelist)); + deactivate_slab(s, slab, get_freepointer(s, freelist)); return freelist; } @@ -3140,7 +3136,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, { void *object; struct kmem_cache_cpu *c; - struct page *page; + struct slab *slab; unsigned long tid; struct obj_cgroup *objcg = NULL; bool init = false; @@ -3172,9 +3168,9 @@ redo: /* * Irqless object alloc/free algorithm used here depends on sequence * of fetching cpu_slab's data. tid should be fetched before anything - * on c to guarantee that object and page associated with previous tid + * on c to guarantee that object and slab associated with previous tid * won't be used with current tid. If we fetch tid first, object and - * page could be one associated with next tid and our alloc/free + * slab could be one associated with next tid and our alloc/free * request will be failed. In this case, we will retry. So, no problem. */ barrier(); @@ -3187,7 +3183,7 @@ redo: */ object = c->freelist; - page = c->page; + slab = c->slab; /* * We cannot use the lockless fastpath on PREEMPT_RT because if a * slowpath has taken the local_lock_irqsave(), it is not protected @@ -3196,7 +3192,7 @@ redo: * there is a suitable cpu freelist. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) || - unlikely(!object || !page || !node_match(page, node))) { + unlikely(!object || !slab || !node_match(slab, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { void *next_object = get_freepointer_safe(s, object); @@ -3298,17 +3294,17 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); * have a longer lifetime than the cpu slabs in most processing loads. * * So we still attempt to reduce cache line usage. Just take the slab - * lock and free the item. If there is no additional partial page + * lock and free the item. If there is no additional partial slab * handling required then we can return immediately. 
*/ -static void __slab_free(struct kmem_cache *s, struct page *page, +static void __slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { void *prior; int was_frozen; - struct page new; + struct slab new; unsigned long counters; struct kmem_cache_node *n = NULL; unsigned long flags; @@ -3319,7 +3315,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; if (kmem_cache_debug(s) && - !free_debug_processing(s, page, head, tail, cnt, addr)) + !free_debug_processing(s, slab, head, tail, cnt, addr)) return; do { @@ -3327,8 +3323,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page, spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } - prior = page->freelist; - counters = page->counters; + prior = slab->freelist; + counters = slab->counters; set_freepointer(s, tail, prior); new.counters = counters; was_frozen = new.frozen; @@ -3347,7 +3343,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } else { /* Needs to be taken off a list */ - n = get_node(s, page_to_nid(page)); + n = get_node(s, slab_nid(slab)); /* * Speculatively acquire the list_lock. * If the cmpxchg does not succeed then we may @@ -3361,7 +3357,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } } - } while (!cmpxchg_double_slab(s, page, + } while (!cmpxchg_double_slab(s, slab, prior, counters, head, new.counters, "__slab_free")); @@ -3376,10 +3372,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_FROZEN); } else if (new.frozen) { /* - * If we just froze the page then put it onto the + * If we just froze the slab then put it onto the * per cpu partial list. */ - put_cpu_partial(s, page, 1); + put_cpu_partial(s, slab, 1); stat(s, CPU_PARTIAL_FREE); } @@ -3394,8 +3390,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * then add it. */ if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { - remove_full(s, n, page); - add_partial(n, page, DEACTIVATE_TO_TAIL); + remove_full(s, n, slab); + add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); @@ -3406,16 +3402,16 @@ slab_empty: /* * Slab on the partial list. */ - remove_partial(n, page); + remove_partial(n, slab); stat(s, FREE_REMOVE_PARTIAL); } else { /* Slab must be on the full list */ - remove_full(s, n, page); + remove_full(s, n, slab); } spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); - discard_slab(s, page); + discard_slab(s, slab); } /* @@ -3430,11 +3426,11 @@ slab_empty: * with all sorts of special processing. * * Bulk free of a freelist with several objects (all pointing to the - * same page) possible by specifying head and tail ptr, plus objects + * same slab) possible by specifying head and tail ptr, plus objects * count (cnt). Bulk free indicated by tail pointer being set. */ static __always_inline void do_slab_free(struct kmem_cache *s, - struct page *page, void *head, void *tail, + struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { void *tail_obj = tail ? 
: head; @@ -3457,7 +3453,7 @@ redo: /* Same with comment on barrier() in slab_alloc_node() */ barrier(); - if (likely(page == c->page)) { + if (likely(slab == c->slab)) { #ifndef CONFIG_PREEMPT_RT void **freelist = READ_ONCE(c->freelist); @@ -3483,7 +3479,7 @@ redo: local_lock(&s->cpu_slab->lock); c = this_cpu_ptr(s->cpu_slab); - if (unlikely(page != c->page)) { + if (unlikely(slab != c->slab)) { local_unlock(&s->cpu_slab->lock); goto redo; } @@ -3498,11 +3494,11 @@ redo: #endif stat(s, FREE_FASTPATH); } else - __slab_free(s, page, head, tail_obj, cnt, addr); + __slab_free(s, slab, head, tail_obj, cnt, addr); } -static __always_inline void slab_free(struct kmem_cache *s, struct page *page, +static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { @@ -3511,13 +3507,13 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, * to remove objects, whose reuse must be delayed. */ if (slab_free_freelist_hook(s, &head, &tail, &cnt)) - do_slab_free(s, page, head, tail, cnt, addr); + do_slab_free(s, slab, head, tail, cnt, addr); } #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { - do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); + do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr); } #endif @@ -3527,35 +3523,36 @@ void kmem_cache_free(struct kmem_cache *s, void *x) if (!s) return; trace_kmem_cache_free(_RET_IP_, x, s->name); - slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_); + slab_free(s, virt_to_slab(x), x, NULL, 1, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); struct detached_freelist { - struct page *page; + struct slab *slab; void *tail; void *freelist; int cnt; struct kmem_cache *s; }; -static inline void free_nonslab_page(struct page *page, void *object) +static inline void free_large_kmalloc(struct folio *folio, void *object) { - unsigned int order = compound_order(page); + unsigned int order = folio_order(folio); - if (WARN_ON_ONCE(!PageCompound(page))) + if (WARN_ON_ONCE(order == 0)) pr_warn_once("object pointer: 0x%p\n", object); kfree_hook(object); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); - __free_pages(page, order); + mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __free_pages(folio_page(folio, 0), order); } /* * This function progressively scans the array with free objects (with * a limited look ahead) and extract objects belonging to the same - * page. It builds a detached freelist directly within the given - * page/objects. This can happen without any need for + * slab. It builds a detached freelist directly within the given + * slab/objects. This can happen without any need for * synchronization, because the objects are owned by running process. * The freelist is build up as a single linked list in the objects. 
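build_detached_freelist(), converted in the hunk that follows, groups the pointers handed to kmem_cache_free_bulk() by the slab they belong to and chains each group through the objects' own free pointers, so a whole group can be passed to slab_free() at once. The user-space sketch below shows just that grouping step; the fixed SLAB_BYTES block size, the offset-0 free pointer and the helpers are assumptions made for the example, not the kernel's layout.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SLAB_BYTES 4096u        /* hypothetical: one "slab" per 4 KiB block */

/* In this sketch the free pointer is stored at offset 0 of a freed object. */
static void set_freepointer(void *obj, void *next)
{
        memcpy(obj, &next, sizeof(next));
}

static uintptr_t slab_of(const void *obj)
{
        return (uintptr_t)obj & ~(uintptr_t)(SLAB_BYTES - 1);
}

/*
 * Scan p[] from the end, pick the slab of the last live object and chain
 * every other object from the same slab into a detached freelist.  Consumed
 * slots are cleared, roughly like the kernel helper marks processed objects.
 */
static void *build_detached_freelist(void **p, size_t size, size_t *cnt)
{
        void *head = NULL;
        uintptr_t slab = 0;

        *cnt = 0;
        for (size_t i = size; i-- > 0; ) {
                if (!p[i])
                        continue;               /* already consumed */
                if (!head)
                        slab = slab_of(p[i]);   /* first object picks the slab */
                else if (slab_of(p[i]) != slab)
                        continue;               /* different slab: leave for later */
                set_freepointer(p[i], head);
                head = p[i];
                p[i] = NULL;
                (*cnt)++;
        }
        return head;
}

int main(void)
{
        void *objs[4];
        size_t cnt;
        char *a = aligned_alloc(SLAB_BYTES, SLAB_BYTES);
        char *b = aligned_alloc(SLAB_BYTES, SLAB_BYTES);

        /* Two objects in each of two aligned blocks. */
        objs[0] = a;      objs[1] = b;
        objs[2] = a + 64; objs[3] = b + 64;

        while (build_detached_freelist(objs, 4, &cnt))
                printf("bulk-freeing %zu object(s) from one slab\n", cnt);

        free(a);
        free(b);
        return 0;
}

Each call peels off one slab's worth of objects, which mirrors how kmem_cache_free_bulk() keeps calling the helper until the array is consumed.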
* The idea is, that this detached freelist can then be bulk @@ -3570,10 +3567,11 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, size_t first_skipped_index = 0; int lookahead = 3; void *object; - struct page *page; + struct folio *folio; + struct slab *slab; /* Always re-init detached_freelist */ - df->page = NULL; + df->slab = NULL; do { object = p[--size]; @@ -3583,17 +3581,19 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, if (!object) return 0; - page = virt_to_head_page(object); + folio = virt_to_folio(object); if (!s) { /* Handle kalloc'ed objects */ - if (unlikely(!PageSlab(page))) { - free_nonslab_page(page, object); + if (unlikely(!folio_test_slab(folio))) { + free_large_kmalloc(folio, object); p[size] = NULL; /* mark object processed */ return size; } /* Derive kmem_cache from object */ - df->s = page->slab_cache; + slab = folio_slab(folio); + df->s = slab->slab_cache; } else { + slab = folio_slab(folio); df->s = cache_from_obj(s, object); /* Support for memcg */ } @@ -3605,7 +3605,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, } /* Start new detached freelist */ - df->page = page; + df->slab = slab; set_freepointer(df->s, object, NULL); df->tail = object; df->freelist = object; @@ -3617,8 +3617,8 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, if (!object) continue; /* Skip processed objects */ - /* df->page is always set at this point */ - if (df->page == virt_to_head_page(object)) { + /* df->slab is always set at this point */ + if (df->slab == virt_to_slab(object)) { /* Opportunity build freelist */ set_freepointer(df->s, object, df->freelist); df->freelist = object; @@ -3650,10 +3650,10 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) struct detached_freelist df; size = build_detached_freelist(s, size, p, &df); - if (!df.page) + if (!df.slab) continue; - slab_free(df.s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); + slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); @@ -3787,7 +3787,7 @@ static unsigned int slub_min_objects; * requested a higher minimum order then we start with that one instead of * the smallest order which will fit the object. */ -static inline unsigned int slab_order(unsigned int size, +static inline unsigned int calc_slab_order(unsigned int size, unsigned int min_objects, unsigned int max_order, unsigned int fract_leftover) { @@ -3851,7 +3851,7 @@ static inline int calculate_order(unsigned int size) fraction = 16; while (fraction >= 4) { - order = slab_order(size, min_objects, + order = calc_slab_order(size, min_objects, slub_max_order, fraction); if (order <= slub_max_order) return order; @@ -3864,14 +3864,14 @@ static inline int calculate_order(unsigned int size) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1); + order = calc_slab_order(size, 1, slub_max_order, 1); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. 
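calculate_order() and the renamed calc_slab_order() right below pick the smallest page order whose wasted tail stays under slab_size/fraction, relax the fraction when that fails, and finally fall back to fitting a single object. The following is a simplified model of that heuristic with hard-coded page size, limits and minimum object count; it is meant to show the shape of the search, not the exact kernel policy.

#include <stdio.h>

#define PAGE_SIZE      4096u
#define MAX_ORDER      11u      /* hypothetical upper bound */
#define SLUB_MAX_ORDER 3u

/* Smallest order whose unused tail is at most slab_size/fract_leftover. */
static int calc_slab_order(unsigned int size, unsigned int min_objects,
                           unsigned int max_order, unsigned int fract_leftover)
{
        for (unsigned int order = 0; order <= max_order; order++) {
                unsigned int slab_size = PAGE_SIZE << order;
                unsigned int objects = slab_size / size;
                unsigned int waste = slab_size - objects * size;

                if (objects < min_objects)
                        continue;
                if (waste * fract_leftover <= slab_size)
                        return (int)order;
        }
        return -1;
}

static int calculate_order(unsigned int size)
{
        /* Demand little waste first, then accept more, then one object. */
        for (unsigned int fraction = 16; fraction >= 4; fraction /= 2) {
                int order = calc_slab_order(size, 4, SLUB_MAX_ORDER, fraction);

                if (order >= 0)
                        return order;
        }
        return calc_slab_order(size, 1, MAX_ORDER, 1);
}

int main(void)
{
        unsigned int sizes[] = { 64, 192, 700, 3000 };

        for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("object size %4u -> order %d\n",
                       sizes[i], calculate_order(sizes[i]));
        return 0;
}

In this model, for example, no order up to SLUB_MAX_ORDER keeps a 3000-byte object's waste under 1/16 of the slab, so the search only succeeds once the fraction is relaxed.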
*/ - order = slab_order(size, 1, MAX_ORDER, 1); + order = calc_slab_order(size, 1, MAX_ORDER, 1); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -3923,38 +3923,38 @@ static struct kmem_cache *kmem_cache_node; */ static void early_kmem_cache_node_alloc(int node) { - struct page *page; + struct slab *slab; struct kmem_cache_node *n; BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); - page = new_slab(kmem_cache_node, GFP_NOWAIT, node); + slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); - BUG_ON(!page); - if (page_to_nid(page) != node) { + BUG_ON(!slab); + if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); } - n = page->freelist; + n = slab->freelist; BUG_ON(!n); #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); - page->freelist = get_freepointer(kmem_cache_node, n); - page->inuse = 1; - page->frozen = 0; + slab->freelist = get_freepointer(kmem_cache_node, n); + slab->inuse = 1; + slab->frozen = 0; kmem_cache_node->node[node] = n; init_kmem_cache_node(n); - inc_slabs_node(kmem_cache_node, node, page->objects); + inc_slabs_node(kmem_cache_node, node, slab->objects); /* * No locks need to be taken here as it has just been * initialized and there is no concurrent access. */ - __add_partial(n, page, DEACTIVATE_TO_HEAD); + __add_partial(n, slab, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -4212,7 +4212,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) #endif /* - * The larger the object size is, the more pages we want on the partial + * The larger the object size is, the more slabs we want on the partial * list to avoid pounding the page allocator excessively. 
*/ set_min_partial(s, ilog2(s->size) / 2); @@ -4240,20 +4240,20 @@ error: return -EINVAL; } -static void list_slab_objects(struct kmem_cache *s, struct page *page, +static void list_slab_objects(struct kmem_cache *s, struct slab *slab, const char *text) { #ifdef CONFIG_SLUB_DEBUG - void *addr = page_address(page); + void *addr = slab_address(slab); unsigned long flags; unsigned long *map; void *p; - slab_err(s, page, text, s->name); - slab_lock(page, &flags); + slab_err(s, slab, text, s->name); + slab_lock(slab, &flags); - map = get_map(s, page); - for_each_object(p, s, addr, page->objects) { + map = get_map(s, slab); + for_each_object(p, s, addr, slab->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { pr_err("Object 0x%p @offset=%tu\n", p, p - addr); @@ -4261,7 +4261,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, } } put_map(map); - slab_unlock(page, &flags); + slab_unlock(slab, &flags); #endif } @@ -4273,23 +4273,23 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { LIST_HEAD(discard); - struct page *page, *h; + struct slab *slab, *h; BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); - list_for_each_entry_safe(page, h, &n->partial, slab_list) { - if (!page->inuse) { - remove_partial(n, page); - list_add(&page->slab_list, &discard); + list_for_each_entry_safe(slab, h, &n->partial, slab_list) { + if (!slab->inuse) { + remove_partial(n, slab); + list_add(&slab->slab_list, &discard); } else { - list_slab_objects(s, page, + list_slab_objects(s, slab, "Objects remaining in %s on __kmem_cache_shutdown()"); } } spin_unlock_irq(&n->list_lock); - list_for_each_entry_safe(page, h, &discard, slab_list) - discard_slab(s, page); + list_for_each_entry_safe(slab, h, &discard, slab_list) + discard_slab(s, slab); } bool __kmem_cache_empty(struct kmem_cache *s) @@ -4322,31 +4322,32 @@ int __kmem_cache_shutdown(struct kmem_cache *s) } #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { void *base; int __maybe_unused i; unsigned int objnr; void *objp; void *objp0; - struct kmem_cache *s = page->slab_cache; + struct kmem_cache *s = slab->slab_cache; struct track __maybe_unused *trackp; kpp->kp_ptr = object; - kpp->kp_page = page; + kpp->kp_slab = slab; kpp->kp_slab_cache = s; - base = page_address(page); + base = slab_address(slab); objp0 = kasan_reset_tag(object); #ifdef CONFIG_SLUB_DEBUG objp = restore_red_left(s, objp0); #else objp = objp0; #endif - objnr = obj_to_index(s, page, objp); + objnr = obj_to_index(s, slab, objp); kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp); objp = base + s->size * objnr; kpp->kp_objp = objp; - if (WARN_ON_ONCE(objp < base || objp >= base + page->objects * s->size || (objp - base) % s->size) || + if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size + || (objp - base) % s->size) || !(s->flags & SLAB_STORE_USER)) return; #ifdef CONFIG_SLUB_DEBUG @@ -4484,8 +4485,8 @@ EXPORT_SYMBOL(__kmalloc_node); * Returns NULL if check passes, otherwise const char * to name of cache * to indicate an error. 
*/ -void __check_heap_object(const void *ptr, unsigned long n, struct page *page, - bool to_user) +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user) { struct kmem_cache *s; unsigned int offset; @@ -4494,10 +4495,10 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, ptr = kasan_reset_tag(ptr); /* Find object and usable object size. */ - s = page->slab_cache; + s = slab->slab_cache; /* Reject impossible pointers. */ - if (ptr < page_address(page)) + if (ptr < slab_address(slab)) usercopy_abort("SLUB object not in SLUB page?!", NULL, to_user, 0, n); @@ -4505,7 +4506,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, if (is_kfence) offset = ptr - kfence_object_start(ptr); else - offset = (ptr - page_address(page)) % s->size; + offset = (ptr - slab_address(slab)) % s->size; /* Adjust for redzone and reject if within the redzone. */ if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { @@ -4527,25 +4528,24 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, size_t __ksize(const void *object) { - struct page *page; + struct folio *folio; if (unlikely(object == ZERO_SIZE_PTR)) return 0; - page = virt_to_head_page(object); + folio = virt_to_folio(object); - if (unlikely(!PageSlab(page))) { - WARN_ON(!PageCompound(page)); - return page_size(page); - } + if (unlikely(!folio_test_slab(folio))) + return folio_size(folio); - return slab_ksize(page->slab_cache); + return slab_ksize(folio_slab(folio)->slab_cache); } EXPORT_SYMBOL(__ksize); void kfree(const void *x) { - struct page *page; + struct folio *folio; + struct slab *slab; void *object = (void *)x; trace_kfree(_RET_IP_, x); @@ -4553,12 +4553,13 @@ void kfree(const void *x) if (unlikely(ZERO_OR_NULL_PTR(x))) return; - page = virt_to_head_page(x); - if (unlikely(!PageSlab(page))) { - free_nonslab_page(page, object); + folio = virt_to_folio(x); + if (unlikely(!folio_test_slab(folio))) { + free_large_kmalloc(folio, object); return; } - slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); + slab = folio_slab(folio); + slab_free(slab->slab_cache, slab, object, NULL, 1, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -4578,8 +4579,8 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) int node; int i; struct kmem_cache_node *n; - struct page *page; - struct page *t; + struct slab *slab; + struct slab *t; struct list_head discard; struct list_head promote[SHRINK_PROMOTE_MAX]; unsigned long flags; @@ -4596,22 +4597,22 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) * Build lists of slabs to discard or promote. * * Note that concurrent frees may occur while we hold the - * list_lock. page->inuse here is the upper limit. + * list_lock. slab->inuse here is the upper limit. 
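__kmem_cache_do_shrink(), whose loop follows, makes one pass over a node's partial list: completely free slabs go to a discard list and the rest are bucketed by their number of free objects so that nearly-full slabs end up at the head of the partial list. Below is a small array-based sketch of that bucketing step; SHRINK_PROMOTE_MAX is shrunk and the slab type is faked purely for the example.

#include <stdio.h>

#define SHRINK_PROMOTE_MAX 2    /* simplified for the sketch */
#define NR_SLABS 5

struct fake_slab {
        int objects;
        int inuse;
};

int main(void)
{
        struct fake_slab partial[NR_SLABS] = {
                { 16, 10 }, { 16, 0 }, { 16, 15 }, { 16, 14 }, { 16, 4 },
        };
        int discard[NR_SLABS], promote[SHRINK_PROMOTE_MAX][NR_SLABS];
        int nr_discard = 0, nr_promote[SHRINK_PROMOTE_MAX] = { 0 };

        for (int i = 0; i < NR_SLABS; i++) {
                int free = partial[i].objects - partial[i].inuse;

                if (free == partial[i].objects)
                        discard[nr_discard++] = i;      /* empty: release it */
                else if (free > 0 && free <= SHRINK_PROMOTE_MAX)
                        promote[free - 1][nr_promote[free - 1]++] = i;
                /* slabs with more free objects stay where they are */
        }

        /* Nearly-full slabs (fewest free objects) go to the head first. */
        for (int free = 0; free < SHRINK_PROMOTE_MAX; free++)
                for (int j = 0; j < nr_promote[free]; j++)
                        printf("promote slab %d (%d free)\n",
                               promote[free][j], free + 1);
        for (int j = 0; j < nr_discard; j++)
                printf("discard slab %d (empty)\n", discard[j]);
        return 0;
}

Keeping the fullest slabs at the head means later partial-list allocations tend to fill them up and drop them from the list sooner.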
*/ - list_for_each_entry_safe(page, t, &n->partial, slab_list) { - int free = page->objects - page->inuse; + list_for_each_entry_safe(slab, t, &n->partial, slab_list) { + int free = slab->objects - slab->inuse; - /* Do not reread page->inuse */ + /* Do not reread slab->inuse */ barrier(); /* We do not keep full slabs on the list */ BUG_ON(free <= 0); - if (free == page->objects) { - list_move(&page->slab_list, &discard); + if (free == slab->objects) { + list_move(&slab->slab_list, &discard); n->nr_partial--; } else if (free <= SHRINK_PROMOTE_MAX) - list_move(&page->slab_list, promote + free - 1); + list_move(&slab->slab_list, promote + free - 1); } /* @@ -4624,8 +4625,8 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ - list_for_each_entry_safe(page, t, &discard, slab_list) - discard_slab(s, page); + list_for_each_entry_safe(slab, t, &discard, slab_list) + discard_slab(s, slab); if (slabs_node(s, node)) ret = 1; @@ -4786,7 +4787,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) */ __flush_cpu_slab(s, smp_processor_id()); for_each_kmem_cache_node(s, node, n) { - struct page *p; + struct slab *p; list_for_each_entry(p, &n->partial, slab_list) p->slab_cache = s; @@ -4964,54 +4965,54 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif #ifdef CONFIG_SYSFS -static int count_inuse(struct page *page) +static int count_inuse(struct slab *slab) { - return page->inuse; + return slab->inuse; } -static int count_total(struct page *page) +static int count_total(struct slab *slab) { - return page->objects; + return slab->objects; } #endif #ifdef CONFIG_SLUB_DEBUG -static void validate_slab(struct kmem_cache *s, struct page *page, +static void validate_slab(struct kmem_cache *s, struct slab *slab, unsigned long *obj_map) { void *p; - void *addr = page_address(page); + void *addr = slab_address(slab); unsigned long flags; - slab_lock(page, &flags); + slab_lock(slab, &flags); - if (!check_slab(s, page) || !on_freelist(s, page, NULL)) + if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) goto unlock; /* Now we know that a valid freelist exists */ - __fill_map(obj_map, s, page); - for_each_object(p, s, addr, page->objects) { + __fill_map(obj_map, s, slab); + for_each_object(p, s, addr, slab->objects) { u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ? 
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE; - if (!check_object(s, page, p, val)) + if (!check_object(s, slab, p, val)) break; } unlock: - slab_unlock(page, &flags); + slab_unlock(slab, &flags); } static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n, unsigned long *obj_map) { unsigned long count = 0; - struct page *page; + struct slab *slab; unsigned long flags; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, slab_list) { - validate_slab(s, page, obj_map); + list_for_each_entry(slab, &n->partial, slab_list) { + validate_slab(s, slab, obj_map); count++; } if (count != n->nr_partial) { @@ -5023,8 +5024,8 @@ static int validate_slab_node(struct kmem_cache *s, if (!(s->flags & SLAB_STORE_USER)) goto out; - list_for_each_entry(page, &n->full, slab_list) { - validate_slab(s, page, obj_map); + list_for_each_entry(slab, &n->full, slab_list) { + validate_slab(s, slab, obj_map); count++; } if (count != atomic_long_read(&n->nr_slabs)) { @@ -5190,15 +5191,15 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, } static void process_slab(struct loc_track *t, struct kmem_cache *s, - struct page *page, enum track_item alloc, + struct slab *slab, enum track_item alloc, unsigned long *obj_map) { - void *addr = page_address(page); + void *addr = slab_address(slab); void *p; - __fill_map(obj_map, s, page); + __fill_map(obj_map, s, slab); - for_each_object(p, s, addr, page->objects) + for_each_object(p, s, addr, slab->objects) if (!test_bit(__obj_to_index(s, addr, p), obj_map)) add_location(t, s, get_track(s, p, alloc)); } @@ -5240,35 +5241,37 @@ static ssize_t show_slab_objects(struct kmem_cache *s, struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); int node; - struct page *page; + struct slab *slab; - page = READ_ONCE(c->page); - if (!page) + slab = READ_ONCE(c->slab); + if (!slab) continue; - node = page_to_nid(page); + node = slab_nid(slab); if (flags & SO_TOTAL) - x = page->objects; + x = slab->objects; else if (flags & SO_OBJECTS) - x = page->inuse; + x = slab->inuse; else x = 1; total += x; nodes[node] += x; - page = slub_percpu_partial_read_once(c); - if (page) { - node = page_to_nid(page); +#ifdef CONFIG_SLUB_CPU_PARTIAL + slab = slub_percpu_partial_read_once(c); + if (slab) { + node = slab_nid(slab); if (flags & SO_TOTAL) WARN_ON_ONCE(1); else if (flags & SO_OBJECTS) WARN_ON_ONCE(1); else - x = page->pages; + x = slab->slabs; total += x; nodes[node] += x; } +#endif } } @@ -5467,33 +5470,35 @@ SLAB_ATTR_RO(objects_partial); static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) { int objects = 0; - int pages = 0; - int cpu; + int slabs = 0; + int cpu __maybe_unused; int len = 0; +#ifdef CONFIG_SLUB_CPU_PARTIAL for_each_online_cpu(cpu) { - struct page *page; + struct slab *slab; - page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); + slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - if (page) - pages += page->pages; + if (slab) + slabs += slab->slabs; } +#endif - /* Approximate half-full pages , see slub_set_cpu_partial() */ - objects = (pages * oo_objects(s->oo)) / 2; - len += sysfs_emit_at(buf, len, "%d(%d)", objects, pages); + /* Approximate half-full slabs, see slub_set_cpu_partial() */ + objects = (slabs * oo_objects(s->oo)) / 2; + len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs); -#ifdef CONFIG_SMP +#if defined(CONFIG_SLUB_CPU_PARTIAL) && defined(CONFIG_SMP) for_each_online_cpu(cpu) { - struct page *page; + struct slab *slab; - page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, 
cpu)); - if (page) { - pages = READ_ONCE(page->pages); - objects = (pages * oo_objects(s->oo)) / 2; + slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); + if (slab) { + slabs = READ_ONCE(slab->slabs); + objects = (slabs * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, " C%d=%d(%d)", - cpu, objects, pages); + cpu, objects, slabs); } } #endif @@ -6161,16 +6166,16 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) for_each_kmem_cache_node(s, node, n) { unsigned long flags; - struct page *page; + struct slab *slab; if (!atomic_long_read(&n->nr_slabs)) continue; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, slab_list) - process_slab(t, s, page, alloc, obj_map); - list_for_each_entry(page, &n->full, slab_list) - process_slab(t, s, page, alloc, obj_map); + list_for_each_entry(slab, &n->partial, slab_list) + process_slab(t, s, slab, alloc, obj_map); + list_for_each_entry(slab, &n->full, slab_list) + process_slab(t, s, slab, alloc, obj_map); spin_unlock_irqrestore(&n->list_lock, flags); } diff --git a/mm/sparse.c b/mm/sparse.c index e5c84b0cf0c9..d21c6e5910d0 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -722,7 +722,7 @@ static void free_map_bootmem(struct page *memmap) >> PAGE_SHIFT; for (i = 0; i < nr_pages; i++, page++) { - magic = (unsigned long) page->freelist; + magic = page->index; BUG_ON(magic == NODE_INFO); diff --git a/mm/swap.c b/mm/swap.c index b461814ce0cb..bcf3ac288b56 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1077,24 +1077,24 @@ void __pagevec_lru_add(struct pagevec *pvec) } /** - * pagevec_remove_exceptionals - pagevec exceptionals pruning - * @pvec: The pagevec to prune + * folio_batch_remove_exceptionals() - Prune non-folios from a batch. + * @fbatch: The batch to prune * - * find_get_entries() fills both pages and XArray value entries (aka - * exceptional entries) into the pagevec. This function prunes all - * exceptionals from @pvec without leaving holes, so that it can be - * passed on to page-only pagevec operations. + * find_get_entries() fills a batch with both folios and shadow/swap/DAX + * entries. This function prunes all the non-folio entries from @fbatch + * without leaving holes, so that it can be passed on to folio-only batch + * operations. */ -void pagevec_remove_exceptionals(struct pagevec *pvec) +void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { - int i, j; + unsigned int i, j; - for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!xa_is_value(page)) - pvec->pages[j++] = page; + for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; + if (!xa_is_value(folio)) + fbatch->folios[j++] = folio; } - pvec->nr = j; + fbatch->nr = j; } /** diff --git a/mm/truncate.c b/mm/truncate.c index 41b8249b3b4a..5e243d7269c0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -56,11 +56,11 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, /* * Unconditionally remove exceptional entries. Usually called from truncate - * path. Note that the pagevec may be altered by this function by removing - * exceptional entries similar to what pagevec_remove_exceptionals does. + * path. Note that the folio_batch may be altered by this function by removing + * exceptional entries similar to what folio_batch_remove_exceptionals() does. 
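folio_batch_remove_exceptionals(), introduced in the mm/swap.c hunk above, is a plain in-place compaction: it copies folio pointers forward over the value entries and shrinks the count. A rough standalone model is sketched below; value entries are modelled as pointers with the low bit set, and struct folio, the batch size and the sample entries are stand-ins for illustration only.

/* Standalone model of the folio_batch_remove_exceptionals() compaction
 * loop.  Value (shadow/swap/DAX) entries are modelled as pointers with
 * the low bit set; struct folio, the batch size and the sample entries
 * are stand-ins for illustration.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BATCH_SIZE 15

struct folio { int id; };

struct folio_batch {
	unsigned int nr;
	struct folio *folios[BATCH_SIZE];
};

static bool is_value_entry(const struct folio *entry)
{
	return (uintptr_t)entry & 1;	/* tag bit, like xa_is_value() */
}

static void batch_remove_exceptionals(struct folio_batch *fbatch)
{
	unsigned int i, j;

	/* Keep the folios, squeeze out value entries, leave no holes. */
	for (i = 0, j = 0; i < fbatch->nr; i++) {
		struct folio *folio = fbatch->folios[i];

		if (!is_value_entry(folio))
			fbatch->folios[j++] = folio;
	}
	fbatch->nr = j;
}

int main(void)
{
	struct folio a = { 1 }, b = { 2 };
	struct folio_batch fbatch = {
		.nr = 4,
		.folios = { &a, (struct folio *)(uintptr_t)0x31, &b,
			    (struct folio *)(uintptr_t)0x55 },
	};
	unsigned int i;

	batch_remove_exceptionals(&fbatch);
	for (i = 0; i < fbatch.nr; i++)
		printf("kept folio %d\n", fbatch.folios[i]->id);
	return 0;
}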
*/ -static void truncate_exceptional_pvec_entries(struct address_space *mapping, - struct pagevec *pvec, pgoff_t *indices) +static void truncate_folio_batch_exceptionals(struct address_space *mapping, + struct folio_batch *fbatch, pgoff_t *indices) { int i, j; bool dax; @@ -69,11 +69,11 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, if (shmem_mapping(mapping)) return; - for (j = 0; j < pagevec_count(pvec); j++) - if (xa_is_value(pvec->pages[j])) + for (j = 0; j < folio_batch_count(fbatch); j++) + if (xa_is_value(fbatch->folios[j])) break; - if (j == pagevec_count(pvec)) + if (j == folio_batch_count(fbatch)) return; dax = dax_mapping(mapping); @@ -82,12 +82,12 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, xa_lock_irq(&mapping->i_pages); } - for (i = j; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; + for (i = j; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; pgoff_t index = indices[i]; - if (!xa_is_value(page)) { - pvec->pages[j++] = page; + if (!xa_is_value(folio)) { + fbatch->folios[j++] = folio; continue; } @@ -96,7 +96,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, continue; } - __clear_shadow_entry(mapping, index, page); + __clear_shadow_entry(mapping, index, folio); } if (!dax) { @@ -105,7 +105,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } - pvec->nr = j; + fbatch->nr = j; } /* @@ -177,21 +177,21 @@ void do_invalidatepage(struct page *page, unsigned int offset, * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static void truncate_cleanup_page(struct page *page) +static void truncate_cleanup_folio(struct folio *folio) { - if (page_mapped(page)) - unmap_mapping_page(page); + if (folio_mapped(folio)) + unmap_mapping_folio(folio); - if (page_has_private(page)) - do_invalidatepage(page, 0, thp_size(page)); + if (folio_has_private(folio)) + do_invalidatepage(&folio->page, 0, folio_size(folio)); /* * Some filesystems seem to re-dirty the page even after * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ - cancel_dirty_page(page); - ClearPageMappedToDisk(page); + folio_cancel_dirty(folio); + folio_clear_mappedtodisk(folio); } /* @@ -215,23 +215,75 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return remove_mapping(mapping, page); } -int truncate_inode_page(struct address_space *mapping, struct page *page) +int truncate_inode_folio(struct address_space *mapping, struct folio *folio) { - VM_BUG_ON_PAGE(PageTail(page), page); - - if (page->mapping != mapping) + if (folio->mapping != mapping) return -EIO; - truncate_cleanup_page(page); - delete_from_page_cache(page); + truncate_cleanup_folio(folio); + filemap_remove_folio(folio); return 0; } /* + * Handle partial folios. The folio may be entirely within the + * range if a split has raced with us. If not, we zero the part of the + * folio that's within the [start, end] range, and then split the folio if + * it's large. split_page_range() will discard pages which now lie beyond + * i_size, and we rely on the caller to discard pages which lie within a + * newly created hole. 
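The new truncate_inode_partial_folio() (its body follows in this hunk) zeroes only the part of an edge folio that falls inside [start, end]. The offset/length arithmetic is easy to get wrong at the boundaries, so here is a self-contained worked model of just that computation; the helper name, types and sample offsets below are illustrative stand-ins.

/* Worked model of the offset/length computation used by
 * truncate_inode_partial_folio() for a folio straddling an edge of the
 * truncated byte range [start, end].  Plain integers stand in for the
 * folio and the sample values are illustrative.
 */
#include <assert.h>
#include <stdio.h>

static void partial_range(long long pos, long long size,
			  long long start, long long end,
			  long long *offset, long long *length)
{
	/* Bytes of the folio below 'start' are kept, so zeroing starts
	 * at 'start' when it falls inside the folio. */
	*offset = pos < start ? start - pos : 0;

	/* Zero to the end of the folio, or only up to 'end' (inclusive)
	 * when the range ends inside the folio. */
	if (pos + size <= end)
		*length = size - *offset;
	else
		*length = end + 1 - pos - *offset;
}

int main(void)
{
	long long offset, length;

	/* 64KiB folio at byte 65536; the hole starts at byte 70000. */
	partial_range(65536, 65536, 70000, 1048575, &offset, &length);
	assert(offset == 4464 && length == 61072);

	/* Same folio, but the truncated range ends at byte 70000. */
	partial_range(65536, 65536, 0, 70000, &offset, &length);
	assert(offset == 0 && length == 4465);

	printf("edge folio zeroing ranges check out\n");
	return 0;
}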
+ * + * Returns false if splitting failed so the caller can avoid + * discarding the entire folio which is stubbornly unsplit. + */ +bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) +{ + loff_t pos = folio_pos(folio); + unsigned int offset, length; + + if (pos < start) + offset = start - pos; + else + offset = 0; + length = folio_size(folio); + if (pos + length <= (u64)end) + length = length - offset; + else + length = end + 1 - pos - offset; + + folio_wait_writeback(folio); + if (length == folio_size(folio)) { + truncate_inode_folio(folio->mapping, folio); + return true; + } + + /* + * We may be zeroing pages we're about to discard, but it avoids + * doing a complex calculation here, and then doing the zeroing + * anyway if the page split fails. + */ + folio_zero_range(folio, offset, length); + + cleancache_invalidate_page(folio->mapping, &folio->page); + if (folio_has_private(folio)) + do_invalidatepage(&folio->page, offset, length); + if (!folio_test_large(folio)) + return true; + if (split_huge_page(&folio->page) == 0) + return true; + if (folio_test_dirty(folio)) + return false; + truncate_inode_folio(folio->mapping, folio); + return true; +} + +/* * Used to get rid of pages on hardware memory corruption. */ int generic_error_remove_page(struct address_space *mapping, struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); + if (!mapping) return -EINVAL; /* @@ -240,7 +292,7 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page) */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; - return truncate_inode_page(mapping, page); + return truncate_inode_folio(mapping, page_folio(page)); } EXPORT_SYMBOL(generic_error_remove_page); @@ -291,20 +343,16 @@ void truncate_inode_pages_range(struct address_space *mapping, { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ - unsigned int partial_start; /* inclusive */ - unsigned int partial_end; /* exclusive */ - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; + struct folio *folio; + bool same_folio; if (mapping_empty(mapping)) goto out; - /* Offsets within partial pages */ - partial_start = lstart & (PAGE_SIZE - 1); - partial_end = (lend + 1) & (PAGE_SIZE - 1); - /* * 'start' and 'end' always covers the range of pages to be fully * truncated. 
Partial pages are covered with 'partial_start' at the @@ -322,64 +370,49 @@ void truncate_inode_pages_range(struct address_space *mapping, else end = (lend + 1) >> PAGE_SHIFT; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, - &pvec, indices)) { - index = indices[pagevec_count(&pvec) - 1] + 1; - truncate_exceptional_pvec_entries(mapping, &pvec, indices); - for (i = 0; i < pagevec_count(&pvec); i++) - truncate_cleanup_page(pvec.pages[i]); - delete_from_page_cache_batch(mapping, &pvec); - for (i = 0; i < pagevec_count(&pvec); i++) - unlock_page(pvec.pages[i]); - pagevec_release(&pvec); + &fbatch, indices)) { + index = indices[folio_batch_count(&fbatch) - 1] + 1; + truncate_folio_batch_exceptionals(mapping, &fbatch, indices); + for (i = 0; i < folio_batch_count(&fbatch); i++) + truncate_cleanup_folio(fbatch.folios[i]); + delete_from_page_cache_batch(mapping, &fbatch); + for (i = 0; i < folio_batch_count(&fbatch); i++) + folio_unlock(fbatch.folios[i]); + folio_batch_release(&fbatch); cond_resched(); } - if (partial_start) { - struct page *page = find_lock_page(mapping, start - 1); - if (page) { - unsigned int top = PAGE_SIZE; - if (start > end) { - /* Truncation within a single page */ - top = partial_end; - partial_end = 0; - } - wait_on_page_writeback(page); - zero_user_segment(page, partial_start, top); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, partial_start, - top - partial_start); - unlock_page(page); - put_page(page); + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page *page = find_lock_page(mapping, end); - if (page) { - wait_on_page_writeback(page); - zero_user_segment(page, 0, partial_end); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, 0, - partial_end); - unlock_page(page); - put_page(page); - } + + if (!same_folio) + folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT, + FGP_LOCK, 0); + if (folio) { + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - /* - * If the truncation happened within a single page no pages - * will be released, just zeroed, so we can bail out now. 
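The rewritten truncate_inode_pages_range() above first removes the pages that lie wholly inside [lstart, lend] and then handles at most two partial edge folios, using same_folio to avoid looking up a second edge when both byte offsets land in the same folio. A small model of how the byte offsets map onto that page range and onto the initial same_folio test is sketched below; PAGE_SHIFT and the offsets are illustrative, and the kernel later refines same_folio against the size of the folio it actually finds.

/* Model of how truncate_inode_pages_range() turns byte offsets into a
 * fully truncated page range plus the same_folio test.  PAGE_SHIFT and
 * the example offsets are illustrative, not taken from a real config.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)

int main(void)
{
	unsigned long long lstart = 5000;	/* first byte to truncate */
	unsigned long long lend = 20000;	/* last byte to truncate */
	unsigned long long start, end;
	int same_folio;

	/* Pages lying wholly inside [lstart, lend] are simply removed. */
	start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	end = (lend + 1) >> PAGE_SHIFT;

	/*
	 * If both edges land in the same base page they can only be in
	 * the same folio, so a single partial folio needs zeroing; the
	 * kernel re-checks this against the folio it looks up, which
	 * may be larger than one page.
	 */
	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);

	printf("fully truncated pages: [%llu, %llu)\n", start, end);
	printf("edges share a page: %s\n", same_folio ? "yes" : "no");
	return 0;
}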
- */ - if (start >= end) - goto out; index = start; - for ( ; ; ) { + while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &pvec, + if (!find_get_entries(mapping, index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) @@ -389,23 +422,24 @@ void truncate_inode_pages_range(struct address_space *mapping, continue; } - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - lock_page(page); - WARN_ON(page_to_index(page) != index); - wait_on_page_writeback(page); - truncate_inode_page(mapping, page); - unlock_page(page); + folio_lock(folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + folio_wait_writeback(folio); + truncate_inode_folio(mapping, folio); + folio_unlock(folio); + index = folio_index(folio) + folio_nr_pages(folio) - 1; } - truncate_exceptional_pvec_entries(mapping, &pvec, indices); - pagevec_release(&pvec); + truncate_folio_batch_exceptionals(mapping, &fbatch, indices); + folio_batch_release(&fbatch); index++; } @@ -476,16 +510,16 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; - pagevec_init(&pvec); - while (find_lock_entries(mapping, index, end, &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + folio_batch_init(&fbatch); + while (find_lock_entries(mapping, index, end, &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct page *page = &fbatch.folios[i]->page; /* We rely upon deletion not changing page->index */ index = indices[i]; @@ -512,8 +546,8 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, } count += ret; } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } @@ -565,31 +599,29 @@ void invalidate_mapping_pagevec(struct address_space *mapping, * shrink_page_list() has a temp ref on them, or because they're transiently * sitting in the lru_cache_add() pagevecs. 
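invalidate_complete_folio2(), whose conversion follows, only detaches a folio when doing so cannot lose data: the folio must still belong to the mapping, its private data must be releasable, and it must be clean. The toy model below captures just that decision; the mapping, folio and array-backed cache are stand-ins for illustration, and the kernel's locking, private-data release and freepage callback are omitted.

/* Toy model of the rule enforced by invalidate_complete_folio2(): a
 * folio still attached to its mapping is dropped only when it is not
 * dirty, so no data can be lost.  All types here are simplified
 * stand-ins for the kernel ones.
 */
#include <stdbool.h>
#include <stdio.h>

struct mapping;

struct folio {
	bool dirty;
	struct mapping *mapping;	/* NULL once removed */
};

struct mapping {
	struct folio *slots[8];		/* toy page cache */
};

static int invalidate_complete(struct mapping *mapping, struct folio *folio,
			       int slot)
{
	if (folio->mapping != mapping)
		return 0;		/* already truncated or migrated */
	if (folio->dirty)
		return 0;		/* unwritten data: refuse */

	mapping->slots[slot] = NULL;	/* models __filemap_remove_folio() */
	folio->mapping = NULL;
	return 1;
}

int main(void)
{
	struct mapping m = { { NULL } };
	struct folio clean = { .dirty = false, .mapping = &m };
	struct folio dirty = { .dirty = true, .mapping = &m };

	m.slots[0] = &clean;
	m.slots[1] = &dirty;

	printf("clean folio removed: %d\n", invalidate_complete(&m, &clean, 0));
	printf("dirty folio removed: %d\n", invalidate_complete(&m, &dirty, 1));
	return 0;
}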
*/ -static int -invalidate_complete_page2(struct address_space *mapping, struct page *page) +static int invalidate_complete_folio2(struct address_space *mapping, + struct folio *folio) { - if (page->mapping != mapping) + if (folio->mapping != mapping) return 0; - if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) + if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_KERNEL)) return 0; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); - if (PageDirty(page)) + if (folio_test_dirty(folio)) goto failed; - BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL); + BUG_ON(folio_has_private(folio)); + __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - if (mapping->a_ops->freepage) - mapping->a_ops->freepage(page); - - put_page(page); /* pagecache ref */ + filemap_free_folio(mapping, folio); return 1; failed: xa_unlock_irq(&mapping->i_pages); @@ -597,13 +629,13 @@ failed: return 0; } -static int do_launder_page(struct address_space *mapping, struct page *page) +static int do_launder_folio(struct address_space *mapping, struct folio *folio) { - if (!PageDirty(page)) + if (!folio_test_dirty(folio)) return 0; - if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + if (folio->mapping != mapping || mapping->a_ops->launder_page == NULL) return 0; - return mapping->a_ops->launder_page(page); + return mapping->a_ops->launder_page(&folio->page); } /** @@ -621,7 +653,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index; int i; int ret = 0; @@ -631,25 +663,25 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (mapping_empty(mapping)) goto out; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = start; - while (find_get_entries(mapping, index, end, &pvec, indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + while (find_get_entries(mapping, index, end, &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; - /* We rely upon deletion not changing page->index */ + /* We rely upon deletion not changing folio->index */ index = indices[i]; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (!invalidate_exceptional_entry2(mapping, - index, page)) + index, folio)) ret = -EBUSY; continue; } - if (!did_range_unmap && page_mapped(page)) { + if (!did_range_unmap && folio_mapped(folio)) { /* - * If page is mapped, before taking its lock, + * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. 
*/ unmap_mapping_pages(mapping, index, @@ -657,29 +689,29 @@ int invalidate_inode_pages2_range(struct address_space *mapping, did_range_unmap = 1; } - lock_page(page); - WARN_ON(page_to_index(page) != index); - if (page->mapping != mapping) { - unlock_page(page); + folio_lock(folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + if (folio->mapping != mapping) { + folio_unlock(folio); continue; } - wait_on_page_writeback(page); + folio_wait_writeback(folio); - if (page_mapped(page)) - unmap_mapping_page(page); - BUG_ON(page_mapped(page)); + if (folio_mapped(folio)) + unmap_mapping_folio(folio); + BUG_ON(folio_mapped(folio)); - ret2 = do_launder_page(mapping, page); + ret2 = do_launder_folio(mapping, folio); if (ret2 == 0) { - if (!invalidate_complete_page2(mapping, page)) + if (!invalidate_complete_folio2(mapping, folio)) ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; - unlock_page(page); + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } diff --git a/mm/usercopy.c b/mm/usercopy.c index b3de3c4eefba..d0d268135d96 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -20,6 +20,7 @@ #include <linux/atomic.h> #include <linux/jump_label.h> #include <asm/sections.h> +#include "slab.h" /* * Checks if a given pointer and length is contained by the current @@ -223,7 +224,7 @@ static inline void check_page_span(const void *ptr, unsigned long n, static inline void check_heap_object(const void *ptr, unsigned long n, bool to_user) { - struct page *page; + struct folio *folio; if (!virt_addr_valid(ptr)) return; @@ -231,16 +232,16 @@ static inline void check_heap_object(const void *ptr, unsigned long n, /* * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the * highmem page or fallback to virt_to_page(). The following - * is effectively a highmem-aware virt_to_head_page(). + * is effectively a highmem-aware virt_to_slab(). */ - page = compound_head(kmap_to_page((void *)ptr)); + folio = page_folio(kmap_to_page((void *)ptr)); - if (PageSlab(page)) { + if (folio_test_slab(folio)) { /* Check slab allocator for flags and size. */ - __check_heap_object(ptr, n, page, to_user); + __check_heap_object(ptr, n, folio_slab(folio), to_user); } else { /* Verify object does not incorrectly span multiple pages. */ - check_page_span(ptr, n, page, to_user); + check_page_span(ptr, n, folio_page(folio, 0), to_user); } } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b897ce3b399a..0d3b65939016 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -17,10 +17,10 @@ * * Usage of struct page fields: * page->private: points to zspage - * page->freelist(index): links together all component pages of a zspage + * page->index: links together all component pages of a zspage * For the huge page, this is always 0, so we use this field * to store handle. 
- * page->units: first object offset in a subpage of zspage + * page->page_type: first object offset in a subpage of zspage * * Usage of struct page flags: * PG_private: identifies the first component page @@ -489,12 +489,12 @@ static inline struct page *get_first_page(struct zspage *zspage) static inline int get_first_obj_offset(struct page *page) { - return page->units; + return page->page_type; } static inline void set_first_obj_offset(struct page *page, int offset) { - page->units = offset; + page->page_type = offset; } static inline unsigned int get_freeobj(struct zspage *zspage) @@ -827,7 +827,7 @@ static struct page *get_next_page(struct page *page) if (unlikely(PageHugeObject(page))) return NULL; - return page->freelist; + return (struct page *)page->index; } /** @@ -901,7 +901,7 @@ static void reset_page(struct page *page) set_page_private(page, 0); page_mapcount_reset(page); ClearPageHugeObject(page); - page->freelist = NULL; + page->index = 0; } static int trylock_zspage(struct zspage *zspage) @@ -1027,7 +1027,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, /* * Allocate individual pages and link them together as: - * 1. all pages are linked together using page->freelist + * 1. all pages are linked together using page->index * 2. each sub-page point to zspage using page->private * * we set PG_private to identify the first page (i.e. no other sub-page @@ -1036,7 +1036,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, for (i = 0; i < nr_pages; i++) { page = pages[i]; set_page_private(page, (unsigned long)zspage); - page->freelist = NULL; + page->index = 0; if (i == 0) { zspage->first_page = page; SetPagePrivate(page); @@ -1044,7 +1044,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, class->pages_per_zspage == 1)) SetPageHugeObject(page); } else { - prev_page->freelist = page; + prev_page->index = (unsigned long)page; } prev_page = page; } |
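The zsmalloc hunk above repurposes page->index to chain the component pages of a zspage (previously page->freelist) and page->page_type to hold the first object offset (previously page->units). A reduced userspace model of that linkage is sketched below; struct page here carries only the two fields under discussion and the offsets are made up, so it illustrates the chaining, not zsmalloc itself.

/* Reduced userspace model of the zsmalloc bookkeeping after this
 * patch: component pages chain through page->index and the first
 * object offset lives in page->page_type.
 */
#include <stdio.h>

struct page {
	unsigned long index;		/* next component page in the chain */
	unsigned int page_type;		/* first object offset in this page */
};

static struct page *get_next_page(struct page *page)
{
	return (struct page *)page->index;
}

static void set_first_obj_offset(struct page *page, unsigned int offset)
{
	page->page_type = offset;
}

int main(void)
{
	struct page pages[3] = { { 0 } };
	struct page *p;
	int i;

	/* As in create_page_chain(): link the pages through ->index,
	 * with the last page terminating the chain at 0. */
	for (i = 0; i < 2; i++)
		pages[i].index = (unsigned long)&pages[i + 1];
	pages[2].index = 0;

	set_first_obj_offset(&pages[0], 0);
	set_first_obj_offset(&pages[1], 72);
	set_first_obj_offset(&pages[2], 144);

	for (p = &pages[0]; p; p = get_next_page(p))
		printf("page %ld: first object at offset %u\n",
		       (long)(p - pages), p->page_type);
	return 0;
}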