From aa464ba9a1e444d5ef95bb63ee3b2ef26fc96ed7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 3 Feb 2023 17:18:34 +1000 Subject: lazy tlb: introduce lazy tlb mm refcount helper functions Add explicit _lazy_tlb annotated functions for lazy tlb mm refcounting. This makes the lazy tlb mm references more obvious, and allows the refcounting scheme to be modified in later changes. There is no functional change with this patch. Link: https://lkml.kernel.org/r/20230203071837.1136453-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Acked-by: Linus Torvalds Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Michael Ellerman Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 7c44d0c65b1b..87cf3a2f0e9a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1034,7 +1034,7 @@ static int exec_mmap(struct mm_struct *mm) mmput(old_mm); return 0; } - mmdrop(active_mm); + mmdrop_lazy_tlb(active_mm); return 0; } -- cgit From 8b8d9a2d328164083b84e5b2515fa4d040bcdcdd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2023 15:31:23 +0100 Subject: ufs: don't flush page immediately for DIRSYNC directories Patch series "remove most callers of write_one_page", v4. This series removes most users of the write_one_page API. These helpers internally call ->writepage which we are gradually removing from the kernel. This patch (of 3): We do not need to writeout modified directory blocks immediately when modifying them while the page is locked. It is enough to do the flush somewhat later which has the added benefit that inode times can be flushed as well. It also allows us to stop depending on write_one_page() function. Ported from an ext2 patch by Jan Kara. Link: https://lkml.kernel.org/r/20230307143125.27778-1-hch@lst.de Link: https://lkml.kernel.org/r/20230307143125.27778-2-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Dave Kleikamp Cc: Evgeniy Dushistov Cc: Jan Kara via Ocfs2-devel Cc: Joel Becker Cc: Matthew Wilcox (Oracle) Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Jan Kara Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ufs/dir.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 391efaf1d528..379d75796a5c 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -42,11 +42,10 @@ static inline int ufs_match(struct super_block *sb, int len, return !memcmp(name, de->d_name, len); } -static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - int err = 0; inode_inc_iversion(dir); block_write_end(NULL, mapping, pos, len, len, page, NULL); @@ -54,10 +53,16 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) i_size_write(dir, pos+len); mark_inode_dirty(dir); } - if (IS_DIRSYNC(dir)) - err = write_one_page(page); - else - unlock_page(page); + unlock_page(page); +} + +static int ufs_handle_dirsync(struct inode *dir) +{ + int err; + + err = filemap_write_and_wait(dir->i_mapping); + if (!err) + err = sync_inode_metadata(dir, 1); return err; } @@ -99,11 +104,12 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); ufs_set_de_type(dir->i_sb, de, inode->i_mode); - err = ufs_commit_chunk(page, pos, len); + ufs_commit_chunk(page, pos, len); ufs_put_page(page); if (update_times) dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); + ufs_handle_dirsync(dir); } @@ -390,10 +396,11 @@ got_it: de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); - err = ufs_commit_chunk(page, pos, rec_len); + ufs_commit_chunk(page, pos, rec_len); dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); + err = ufs_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: ufs_put_page(page); @@ -531,9 +538,10 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, if (pde) pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; - err = ufs_commit_chunk(page, pos, to - from); + ufs_commit_chunk(page, pos, to - from); inode->i_ctime = inode->i_mtime = current_time(inode); mark_inode_dirty(inode); + err = ufs_handle_dirsync(inode); out: ufs_put_page(page); UFSD("EXIT\n"); @@ -579,7 +587,8 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) strcpy (de->d_name, ".."); kunmap(page); - err = ufs_commit_chunk(page, 0, chunk_size); + ufs_commit_chunk(page, 0, chunk_size); + err = ufs_handle_dirsync(inode); fail: put_page(page); return err; -- cgit From a0d50b11bff644a6e4339874ba48f4bc02b842bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2023 15:31:24 +0100 Subject: ocfs2: don't use write_one_page in ocfs2_duplicate_clusters_by_page Use filemap_write_and_wait_range to write back the range of the dirty page instead of write_one_page in preparation of removing write_one_page and eventually ->writepage. Link: https://lkml.kernel.org/r/20230307143125.27778-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Dave Kleikamp Cc: Evgeniy Dushistov Cc: Gang He Cc: Jan Kara via Ocfs2-devel Cc: Joel Becker Cc: Joseph Qi Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/ocfs2/refcounttree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 5a656dc683f1..564ab48d03ef 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2952,10 +2952,11 @@ retry: */ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { if (PageDirty(page)) { - /* - * write_on_page will unlock the page on return - */ - ret = write_one_page(page); + unlock_page(page); + put_page(page); + + ret = filemap_write_and_wait_range(mapping, + offset, map_end - 1); goto retry; } } -- cgit From 452a8f40728065800b5a5b81f1152e9a16d39656 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2023 15:31:25 +0100 Subject: mm,jfs: move write_one_page/folio_write_one to jfs The last remaining user of folio_write_one through the write_one_page wrapper is jfs, so move the functionality there and hard code the call to metapage_writepage. Note that the use of the pagecache by the JFS 'metapage' buffer cache is a bit odd, and we could probably do without VM-level dirty tracking at all, but that's a change for another time. Link: https://lkml.kernel.org/r/20230307143125.27778-4-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Dave Kleikamp Cc: Changwei Ge Cc: Evgeniy Dushistov Cc: Gang He Cc: Jan Kara Cc: Jan Kara via Ocfs2-devel Cc: Joel Becker Cc: Joseph Qi Cc: Joseph Qi Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/jfs/jfs_metapage.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 2e8461ce74de..961569c11159 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -691,6 +691,35 @@ void grab_metapage(struct metapage * mp) unlock_page(mp->page); } +static int metapage_write_one(struct page *page) +{ + struct folio *folio = page_folio(page); + struct address_space *mapping = folio->mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = folio_nr_pages(folio), + }; + int ret = 0; + + BUG_ON(!folio_test_locked(folio)); + + folio_wait_writeback(folio); + + if (folio_clear_dirty_for_io(folio)) { + folio_get(folio); + ret = metapage_writepage(page, &wbc); + if (ret == 0) + folio_wait_writeback(folio); + folio_put(folio); + } else { + folio_unlock(folio); + } + + if (!ret) + ret = filemap_check_errors(mapping); + return ret; +} + void force_metapage(struct metapage *mp) { struct page *page = mp->page; @@ -700,8 +729,8 @@ void force_metapage(struct metapage *mp) get_page(page); lock_page(page); set_page_dirty(page); - if (write_one_page(page)) - jfs_error(mp->sb, "write_one_page() failed\n"); + if (metapage_write_one(page)) + jfs_error(mp->sb, "metapage_write_one() failed\n"); clear_bit(META_forcewrite, &mp->flag); put_page(page); } @@ -746,9 +775,9 @@ void release_metapage(struct metapage * mp) set_page_dirty(page); if (test_bit(META_sync, &mp->flag)) { clear_bit(META_sync, &mp->flag); - if (write_one_page(page)) - jfs_error(mp->sb, "write_one_page() failed\n"); - lock_page(page); /* write_one_page unlocks the page */ + if (metapage_write_one(page)) + jfs_error(mp->sb, "metapage_write_one() failed\n"); + lock_page(page); } } else if (mp->lsn) /* discard_metapage doesn't remove it */ remove_from_logsync(mp); -- cgit From cf2e309ebca7bb0916771839f9b580b06c778530 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 13 Mar 2023 19:28:19 +0800 Subject: mm: shrinkers: convert shrinker_rwsem to mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now there are no readers of shrinker_rwsem, so we can simply replace it with mutex lock. Link: https://lkml.kernel.org/r/20230313112819.38938-9-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Cc: Christian König Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Paul E. McKenney Cc: Shakeel Butt Cc: Sultan Alsawaf Cc: Tetsuo Handa Cc: Yang Shi Signed-off-by: Andrew Morton --- fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 04bc62ab7dfe..34afe411cf2b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, -- cgit From 66dabbb65d673aef40dd17bf62c042be8f6d4a4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2023 15:34:10 +0100 Subject: mm: return an ERR_PTR from __filemap_get_folio Instead of returning NULL for all errors, distinguish between: - no entry found and not asked to allocated (-ENOENT) - failed to allocate memory (-ENOMEM) - would block (-EAGAIN) so that callers don't have to guess the error based on the passed in flags. Also pass through the error through the direct callers: filemap_get_folio, filemap_lock_folio filemap_grab_folio and filemap_get_incore_folio. [hch@lst.de: fix null-pointer deref] Link: https://lkml.kernel.org/r/20230310070023.GA13563@lst.de Link: https://lkml.kernel.org/r/20230310043137.GA1624890@u2004 Link: https://lkml.kernel.org/r/20230307143410.28031-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Ryusuke Konishi [nilfs2] Cc: Andreas Gruenbacher Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- fs/afs/dir.c | 10 +++++----- fs/afs/dir_edit.c | 2 +- fs/afs/write.c | 4 ++-- fs/ext4/inode.c | 2 +- fs/ext4/move_extent.c | 8 ++++---- fs/hugetlbfs/inode.c | 2 +- fs/iomap/buffered-io.c | 11 ++--------- fs/netfs/buffered_read.c | 4 ++-- fs/nfs/file.c | 4 ++-- fs/nilfs2/page.c | 6 +++--- 10 files changed, 23 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 82690d1dd49a..f92b9e62d567 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -319,16 +319,16 @@ expand: struct folio *folio; folio = filemap_get_folio(mapping, i); - if (!folio) { + if (IS_ERR(folio)) { if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) afs_stat_v(dvnode, n_inval); - - ret = -ENOMEM; folio = __filemap_get_folio(mapping, i, FGP_LOCK | FGP_CREAT, mapping->gfp_mask); - if (!folio) + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto error; + } folio_attach_private(folio, (void *)1); folio_unlock(folio); } @@ -524,7 +524,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, */ folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE, FGP_ACCESSED, 0); - if (!folio) { + if (IS_ERR(folio)) { ret = afs_bad(dvnode, afs_file_error_dir_missing_page); break; } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 0ab7752d1b75..f0eddccbdd95 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -115,7 +115,7 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping->gfp_mask); - if (!folio) + if (IS_ERR(folio)) clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); else if (folio && !folio_test_private(folio)) folio_attach_private(folio, (void *)1); diff --git a/fs/afs/write.c b/fs/afs/write.c index 571f3b9a417e..c822d6006033 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -232,7 +232,7 @@ static void afs_kill_pages(struct address_space *mapping, _debug("kill %lx (to %lx)", index, last); folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { next = index + 1; continue; } @@ -270,7 +270,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, _debug("redirty %llx @%llx", len, start); folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { next = index + 1; continue; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf0b7dea4900..d7973743417b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5395,7 +5395,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); - if (!folio) + if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2de9829aed63..7bf6d069199c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -141,18 +141,18 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, flags = memalloc_nofs_save(); folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, mapping_gfp_mask(mapping[0])); - if (!folio[0]) { + if (IS_ERR(folio[0])) { memalloc_nofs_restore(flags); - return -ENOMEM; + return PTR_ERR(folio[0]); } folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); - if (!folio[1]) { + if (IS_ERR(folio[1])) { folio_unlock(folio[0]); folio_put(folio[0]); - return -ENOMEM; + return PTR_ERR(folio[1]); } /* * __filemap_get_folio() may not wait on folio's writeback if diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9062da6da567..702d79639c0d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -697,7 +697,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h, struct folio *folio; folio = filemap_lock_folio(mapping, idx); - if (!folio) + if (IS_ERR(folio)) return; start = start & ~huge_page_mask(h); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 6f4c97a6d7e9..96bb56c203f4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -468,19 +468,12 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) { unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; - struct folio *folio; if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; - folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, + return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); - if (folio) - return folio; - - if (iter->flags & IOMAP_NOWAIT) - return ERR_PTR(-EAGAIN); - return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(iomap_get_folio); @@ -911,7 +904,7 @@ static int iomap_write_delalloc_scan(struct inode *inode, /* grab locked page */ folio = filemap_lock_folio(inode->i_mapping, start_byte >> PAGE_SHIFT); - if (!folio) { + if (IS_ERR(folio)) { start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + PAGE_SIZE; continue; diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 7679a68e8193..209726a9cfdb 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -350,8 +350,8 @@ int netfs_write_begin(struct netfs_inode *ctx, retry: folio = __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); if (ctx->ops->check_write_begin) { /* Allow the netfs (eg. ceph) to flush conflicts. */ diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 893625eacab9..1d03406e6c03 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -336,8 +336,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, start: folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); *pagep = &folio->page; ret = nfs_flush_incompatible(file, folio); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 41ccd43cd979..5cf30827f244 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -259,10 +259,10 @@ repeat: NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state"); dfolio = filemap_grab_folio(dmap, folio->index); - if (unlikely(!dfolio)) { + if (unlikely(IS_ERR(dfolio))) { /* No empty page is added to the page cache */ - err = -ENOMEM; folio_unlock(folio); + err = PTR_ERR(dfolio); break; } if (unlikely(!folio_buffers(folio))) @@ -311,7 +311,7 @@ repeat: folio_lock(folio); dfolio = filemap_lock_folio(dmap, index); - if (dfolio) { + if (!IS_ERR(dfolio)) { /* overwrite existing folio in the destination cache */ WARN_ON(folio_test_dirty(dfolio)); nilfs_copy_page(&dfolio->page, &folio->page, 0); -- cgit From 2bad466cc9d9b4c3b4b16eb9c03c919b59561316 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 9 Mar 2023 17:37:10 -0500 Subject: mm/uffd: UFFD_FEATURE_WP_UNPOPULATED Patch series "mm/uffd: Add feature bit UFFD_FEATURE_WP_UNPOPULATED", v4. The new feature bit makes anonymous memory acts the same as file memory on userfaultfd-wp in that it'll also wr-protect none ptes. It can be useful in two cases: (1) Uffd-wp app that needs to wr-protect none ptes like QEMU snapshot, so pre-fault can be replaced by enabling this flag and speed up protections (2) It helps to implement async uffd-wp mode that Muhammad is working on [1] It's debatable whether this is the most ideal solution because with the new feature bit set, wr-protect none pte needs to pre-populate the pgtables to the last level (PAGE_SIZE). But it seems fine so far to service either purpose above, so we can leave optimizations for later. The series brings pte markers to anonymous memory too. There's some change in the common mm code path in the 1st patch, great to have some eye looking at it, but hopefully they're still relatively straightforward. This patch (of 2): This is a new feature that controls how uffd-wp handles none ptes. When it's set, the kernel will handle anonymous memory the same way as file memory, by allowing the user to wr-protect unpopulated ptes. File memories handles none ptes consistently by allowing wr-protecting of none ptes because of the unawareness of page cache being exist or not. For anonymous it was not as persistent because we used to assume that we don't need protections on none ptes or known zero pages. One use case of such a feature bit was VM live snapshot, where if without wr-protecting empty ptes the snapshot can contain random rubbish in the holes of the anonymous memory, which can cause misbehave of the guest when the guest OS assumes the pages should be all zeros. QEMU worked it around by pre-populate the section with reads to fill in zero page entries before starting the whole snapshot process [1]. Recently there's another need raised on using userfaultfd wr-protect for detecting dirty pages (to replace soft-dirty in some cases) [2]. In that case if without being able to wr-protect none ptes by default, the dirty info can get lost, since we cannot treat every none pte to be dirty (the current design is identify a page dirty based on uffd-wp bit being cleared). In general, we want to be able to wr-protect empty ptes too even for anonymous. This patch implements UFFD_FEATURE_WP_UNPOPULATED so that it'll make uffd-wp handling on none ptes being consistent no matter what the memory type is underneath. It doesn't have any impact on file memories so far because we already have pte markers taking care of that. So it only affects anonymous. The feature bit is by default off, so the old behavior will be maintained. Sometimes it may be wanted because the wr-protect of none ptes will contain overheads not only during UFFDIO_WRITEPROTECT (by applying pte markers to anonymous), but also on creating the pgtables to store the pte markers. So there's potentially less chance of using thp on the first fault for a none pmd or larger than a pmd. The major implementation part is teaching the whole kernel to understand pte markers even for anonymously mapped ranges, meanwhile allowing the UFFDIO_WRITEPROTECT ioctl to apply pte markers for anonymous too when the new feature bit is set. Note that even if the patch subject starts with mm/uffd, there're a few small refactors to major mm path of handling anonymous page faults. But they should be straightforward. With WP_UNPOPUATED, application like QEMU can avoid pre-read faults all the memory before wr-protect during taking a live snapshot. Quotting from Muhammad's test result here [3] based on a simple program [4]: (1) With huge page disabled echo madvise > /sys/kernel/mm/transparent_hugepage/enabled ./uffd_wp_perf Test DEFAULT: 4 Test PRE-READ: 1111453 (pre-fault 1101011) Test MADVISE: 278276 (pre-fault 266378) Test WP-UNPOPULATE: 11712 (2) With Huge page enabled echo always > /sys/kernel/mm/transparent_hugepage/enabled ./uffd_wp_perf Test DEFAULT: 4 Test PRE-READ: 22521 (pre-fault 22348) Test MADVISE: 4909 (pre-fault 4743) Test WP-UNPOPULATE: 14448 There'll be a great perf boost for no-thp case, while for thp enabled with extreme case of all-thp-zero WP_UNPOPULATED can be slower than MADVISE, but that's low possibility in reality, also the overhead was not reduced but postponed until a follow up write on any huge zero thp, so potentially it is faster by making the follow up writes slower. [1] https://lore.kernel.org/all/20210401092226.102804-4-andrey.gruzdev@virtuozzo.com/ [2] https://lore.kernel.org/all/Y+v2HJ8+3i%2FKzDBu@x1n/ [3] https://lore.kernel.org/all/d0eb0a13-16dc-1ac1-653a-78b7273781e3@collabora.com/ [4] https://github.com/xzpeter/clibs/blob/master/uffd-test/uffd-wp-perf.c [peterx@redhat.com: comment changes, oneliner fix to khugepaged] Link: https://lkml.kernel.org/r/ZB2/8jPhD3fpx5U8@x1n Link: https://lkml.kernel.org/r/20230309223711.823547-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20230309223711.823547-2-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Muhammad Usama Anjum Cc: Nadav Amit Cc: Paul Gofman Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 44d1ee429eb0..881e9c82b9d1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +/* + * Whether WP_UNPOPULATED is enabled on the uffd context. It is only + * meaningful when userfaultfd_wp()==true on the vma and when it's + * anonymous. + */ +bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) +{ + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx) + return false; + + return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; +} + static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, vm_flags_t flags) { @@ -1971,6 +1986,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #endif #ifndef CONFIG_PTE_MARKER_UFFD_WP uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; -- cgit From 23baf831a32c04f9a968812511540b1b3e648bf5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Mar 2023 14:31:33 +0300 Subject: mm, treewide: redefine MAX_ORDER sanely MAX_ORDER currently defined as number of orders page allocator supports: user can ask buddy allocator for page order between 0 and MAX_ORDER-1. This definition is counter-intuitive and lead to number of bugs all over the kernel. Change the definition of MAX_ORDER to be inclusive: the range of orders user can ask from buddy allocator is 0..MAX_ORDER now. [kirill@shutemov.name: fix min() warning] Link: https://lkml.kernel.org/r/20230315153800.32wib3n5rickolvh@box [akpm@linux-foundation.org: fix another min_t warning] [kirill@shutemov.name: fixups per Zi Yan] Link: https://lkml.kernel.org/r/20230316232144.b7ic4cif4kjiabws@box.shutemov.name [akpm@linux-foundation.org: fix underlining in docs] Link: https://lore.kernel.org/oe-kbuild-all/202303191025.VRCTk6mP-lkp@intel.com/ Link: https://lkml.kernel.org/r/20230315113133.11326-11-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Michael Ellerman [powerpc] Cc: "Kirill A. Shutemov" Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/ramfs/file-nommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2f67516bb9bf..9fbb9b5256f7 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* make various checks */ order = get_order(newsize); - if (unlikely(order >= MAX_ORDER)) + if (unlikely(order > MAX_ORDER)) return -EFBIG; ret = inode_newsize_ok(inode, newsize); -- cgit From a734991ccaec1985fff42fb26bb6d789d35defb4 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 14 Mar 2023 15:12:47 -0700 Subject: mm: userfaultfd: rename functions for clarity + consistency Patch series "mm: userfaultfd: refactor and add UFFDIO_CONTINUE_MODE_WP", v5. - Commits 1-3 refactor userfaultfd ioctl code without behavior changes, with the main goal of improving consistency and reducing the number of function args. - Commit 4 adds UFFDIO_CONTINUE_MODE_WP. This patch (of 4): The basic problem is, over time we've added new userfaultfd ioctls, and we've refactored the code so functions which used to handle only one case are now re-used to deal with several cases. While this happened, we didn't bother to rename the functions. Similarly, as we added new functions, we cargo-culted pieces of the now-inconsistent naming scheme, so those functions too ended up with names that don't make a lot of sense. A key point here is, "copy" in most userfaultfd code refers specifically to UFFDIO_COPY, where we allocate a new page and copy its contents from userspace. There are many functions with "copy" in the name that don't actually do this (at least in some cases). So, rename things into a consistent scheme. The high level idea is that the call stack for userfaultfd ioctls becomes: userfaultfd_ioctl -> userfaultfd_(particular ioctl) -> mfill_atomic_(particular kind of fill operation) -> mfill_atomic /* loops over pages in range */ -> mfill_atomic_pte /* deals with single pages */ -> mfill_atomic_pte_(particular kind of fill operation) -> mfill_atomic_install_pte There are of course some special cases (shmem, hugetlb), but this is the general structure which all function names now adhere to. Link: https://lkml.kernel.org/r/20230314221250.682452-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20230314221250.682452-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Acked-by: Mike Rapoport (IBM) Cc: Al Viro Cc: Hugh Dickins Cc: James Houghton Cc: Jan Kara Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Shuah Khan Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 881e9c82b9d1..4aedfd98e3f5 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1756,9 +1756,9 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (mmget_not_zero(ctx->mm)) { - ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing, - uffdio_copy.mode); + ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len, &ctx->mmap_changing, + uffdio_copy.mode); mmput(ctx->mm); } else { return -ESRCH; @@ -1808,9 +1808,9 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { - ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len, - &ctx->mmap_changing); + ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len, + &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1918,9 +1918,9 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) goto out; if (mmget_not_zero(ctx->mm)) { - ret = mcopy_continue(ctx->mm, uffdio_continue.range.start, - uffdio_continue.range.len, - &ctx->mmap_changing); + ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, + uffdio_continue.range.len, + &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; -- cgit From 61c5004022f56c443b86800e8985d8803f3a22aa Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 14 Mar 2023 15:12:48 -0700 Subject: mm: userfaultfd: don't pass around both mm and vma Quite a few userfaultfd functions took both mm and vma pointers as arguments. Since the mm is trivially accessible via vma->vm_mm, there's no reason to pass both; it just needlessly extends the already long argument list. Get rid of the mm pointer, where possible, to shorten the argument list. Link: https://lkml.kernel.org/r/20230314221250.682452-3-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Acked-by: Mike Rapoport (IBM) Cc: Al Viro Cc: Hugh Dickins Cc: James Houghton Cc: Jan Kara Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Shuah Khan Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4aedfd98e3f5..d8d432ca81e6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1644,7 +1644,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, /* Reset ptes for the whole vma range if wr-protected */ if (userfaultfd_wp(vma)) - uffd_wp_range(mm, vma, start, vma_end - start, false); + uffd_wp_range(vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, -- cgit From d9712937037e0ce887920f321429826e9dbfd960 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 14 Mar 2023 15:12:49 -0700 Subject: mm: userfaultfd: combine 'mode' and 'wp_copy' arguments Many userfaultfd ioctl functions take both a 'mode' and a 'wp_copy' argument. In future commits we plan to plumb the flags through to more places, so we'd be proliferating the very long argument list even further. Let's take the time to simplify the argument list. Combine the two arguments into one - and generalize, so when we add more flags in the future, it doesn't imply more function arguments. Since the modes (copy, zeropage, continue) are mutually exclusive, store them as an integer value (0, 1, 2) in the low bits. Place combine-able flag bits in the high bits. This is quite similar to an earlier patch proposed by Nadav Amit ("userfaultfd: introduce uffd_flags" [1]). The main difference is that patch only handled flags, whereas this patch *also* combines the "mode" argument into the same type to shorten the argument list. [1]: https://lore.kernel.org/all/20220619233449.181323-2-namit@vmware.com/ Link: https://lkml.kernel.org/r/20230314221250.682452-4-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: James Houghton Acked-by: Peter Xu Acked-by: Mike Rapoport (IBM) Cc: Al Viro Cc: Hugh Dickins Cc: Jan Kara Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Shuah Khan Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d8d432ca81e6..8971c3613cc6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1729,6 +1729,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, struct uffdio_copy uffdio_copy; struct uffdio_copy __user *user_uffdio_copy; struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; user_uffdio_copy = (struct uffdio_copy __user *) arg; @@ -1755,10 +1756,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; + if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) + flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, uffdio_copy.len, &ctx->mmap_changing, - uffdio_copy.mode); + flags); mmput(ctx->mm); } else { return -ESRCH; -- cgit From 0289184476c845968ad6ac9083c96cc0f75ca505 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 14 Mar 2023 15:12:50 -0700 Subject: mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new PTE to resolve a missing fault, one can install a write-protected one. This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in combination. This was motivated by testing HugeTLB HGM [1], and in particular its interaction with userfaultfd features. Existing userfaultfd code supports using WP and MINOR modes together (i.e. you can register an area with both enabled), but without this CONTINUE flag the combination is in practice unusable. So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing as UFFDIO_COPY_MODE_WP, but for *minor* faults. Update the selftest to do some very basic exercising of the new flag. Update Documentation/ to describe how these flags are used (neither the COPY nor the new CONTINUE versions of this mode flag were described there before). [1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/ Link: https://lkml.kernel.org/r/20230314221250.682452-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Acked-by: Mike Rapoport (IBM) Cc: Al Viro Cc: Hugh Dickins Cc: Jan Kara Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Shuah Khan Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8971c3613cc6..8395605790f6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1893,6 +1893,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) struct uffdio_continue uffdio_continue; struct uffdio_continue __user *user_uffdio_continue; struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; user_uffdio_continue = (struct uffdio_continue __user *)arg; @@ -1917,13 +1918,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) uffdio_continue.range.start) { goto out; } - if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) + if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | + UFFDIO_CONTINUE_MODE_WP)) goto out; + if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) + flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, uffdio_continue.range.len, - &ctx->mmap_changing); + &ctx->mmap_changing, flags); mmput(ctx->mm); } else { return -ESRCH; -- cgit From bd23024b9774e681cbe6cc3afcb24244dfcb2390 Mon Sep 17 00:00:00 2001 From: Tomas Mudrunka Date: Tue, 21 Mar 2023 11:34:30 +0100 Subject: mm/memtest: add results of early memtest to /proc/meminfo Currently the memtest results were only presented in dmesg. When running a large fleet of devices without ECC RAM it's currently not easy to do bulk monitoring for memory corruption. You have to parse dmesg, but that's a ring buffer so the error might disappear after some time. In general I do not consider dmesg to be a great API to query RAM status. In several companies I've seen such errors remain undetected and cause issues for way too long. So I think it makes sense to provide a monitoring API, so that we can safely detect and act upon them. This adds /proc/meminfo entry which can be easily used by scripts. Link: https://lkml.kernel.org/r/20230321103430.7130-1-tomas.mudrunka@gmail.com Signed-off-by: Tomas Mudrunka Cc: Jonathan Corbet Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- fs/proc/meminfo.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 440960110a42..b43d0bd42762 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -131,6 +132,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); +#ifdef CONFIG_MEMTEST + if (early_memtest_done) { + unsigned long early_memtest_bad_size_kb; + + early_memtest_bad_size_kb = early_memtest_bad_size>>10; + if (early_memtest_bad_size && !early_memtest_bad_size_kb) + early_memtest_bad_size_kb = 1; + /* When 0 is reported, it means there actually was a successful test */ + seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb); + } +#endif + #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)); -- cgit From 2e1c0170771e6bf31bc785ea43a44e6e85e36268 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 22 Mar 2023 18:57:01 +0000 Subject: fs/proc/kcore: avoid bounce buffer for ktext data Patch series "convert read_kcore(), vread() to use iterators", v8. While reviewing Baoquan's recent changes to permit vread() access to vm_map_ram regions of vmalloc allocations, Willy pointed out [1] that it would be nice to refactor vread() as a whole, since its only user is read_kcore() and the existing form of vread() necessitates the use of a bounce buffer. This patch series does exactly that, as well as adjusting how we read the kernel text section to avoid the use of a bounce buffer in this case as well. This has been tested against the test case which motivated Baoquan's changes in the first place [2] which continues to function correctly, as do the vmalloc self tests. This patch (of 4): Commit df04abfd181a ("fs/proc/kcore.c: Add bounce buffer for ktext data") introduced the use of a bounce buffer to retrieve kernel text data for /proc/kcore in order to avoid failures arising from hardened user copies enabled by CONFIG_HARDENED_USERCOPY in check_kernel_text_object(). We can avoid doing this if instead of copy_to_user() we use _copy_to_user() which bypasses the hardening check. This is more efficient than using a bounce buffer and simplifies the code. We do so as part an overall effort to eliminate bounce buffer usage in the function with an eye to converting it an iterator read. Link: https://lkml.kernel.org/r/cover.1679566220.git.lstoakes@gmail.com Link: https://lore.kernel.org/all/Y8WfDSRkc%2FOHP3oD@casper.infradead.org/ [1] Link: https://lore.kernel.org/all/87ilk6gos2.fsf@oracle.com/T/#u [2] Link: https://lkml.kernel.org/r/fd39b0bfa7edc76d360def7d034baaee71d90158.1679511146.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: David Hildenbrand Reviewed-by: Baoquan He Cc: Alexander Viro Cc: Jens Axboe Cc: Jiri Olsa Cc: Liu Shixin Cc: Matthew Wilcox (Oracle) Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 71157ee35c1a..556f310d6aa4 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -541,19 +541,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) case KCORE_VMEMMAP: case KCORE_TEXT: /* - * Using bounce buffer to bypass the - * hardened user copy kernel text checks. + * We use _copy_to_user() to bypass usermode hardening + * which would otherwise prevent this operation. */ - if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { - if (clear_user(buffer, tsz)) { - ret = -EFAULT; - goto out; - } - } else { - if (copy_to_user(buffer, buf, tsz)) { - ret = -EFAULT; - goto out; - } + if (_copy_to_user(buffer, (char *)start, tsz)) { + ret = -EFAULT; + goto out; } break; default: -- cgit From 46c0d6d0904a10785faabee53fe53ee1aa718fea Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 22 Mar 2023 18:57:02 +0000 Subject: fs/proc/kcore: convert read_kcore() to read_kcore_iter() For the time being we still use a bounce buffer for vread(), however in the next patch we will convert this to interact directly with the iterator and eliminate the bounce buffer altogether. Link: https://lkml.kernel.org/r/ebe12c8d70eebd71f487d80095605f3ad0d1489c.1679511146.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: David Hildenbrand Reviewed-by: Baoquan He Cc: Alexander Viro Cc: Jens Axboe Cc: Jiri Olsa Cc: Liu Shixin Cc: Matthew Wilcox (Oracle) Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 556f310d6aa4..08b795fd80b4 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -308,9 +308,12 @@ static void append_kcore_note(char *notes, size_t *i, const char *name, } static ssize_t -read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) +read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) { + struct file *file = iocb->ki_filp; char *buf = file->private_data; + loff_t *fpos = &iocb->ki_pos; + size_t phdrs_offset, notes_offset, data_offset; size_t page_offline_frozen = 1; size_t phdrs_len, notes_len; @@ -318,6 +321,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) size_t tsz; int nphdr; unsigned long start; + size_t buflen = iov_iter_count(iter); size_t orig_buflen = buflen; int ret = 0; @@ -356,12 +360,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) }; tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos); - if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) { + if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) { ret = -EFAULT; goto out; } - buffer += tsz; buflen -= tsz; *fpos += tsz; } @@ -398,15 +401,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos); - if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset, - tsz)) { + if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz, + iter) != tsz) { kfree(phdrs); ret = -EFAULT; goto out; } kfree(phdrs); - buffer += tsz; buflen -= tsz; *fpos += tsz; } @@ -448,14 +450,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) min(vmcoreinfo_size, notes_len - i)); tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); - if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) { + if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) { kfree(notes); ret = -EFAULT; goto out; } kfree(notes); - buffer += tsz; buflen -= tsz; *fpos += tsz; } @@ -497,7 +498,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } if (!m) { - if (clear_user(buffer, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -508,14 +509,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) case KCORE_VMALLOC: vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, buf, tsz)) { + if (copy_to_iter(buf, tsz, iter) != tsz) { ret = -EFAULT; goto out; } break; case KCORE_USER: /* User page is handled prior to normal kernel page: */ - if (copy_to_user(buffer, (char *)start, tsz)) { + if (copy_to_iter((char *)start, tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -531,7 +532,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) */ if (!page || PageOffline(page) || is_page_hwpoison(page) || !pfn_is_ram(pfn)) { - if (clear_user(buffer, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -541,17 +542,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) case KCORE_VMEMMAP: case KCORE_TEXT: /* - * We use _copy_to_user() to bypass usermode hardening + * We use _copy_to_iter() to bypass usermode hardening * which would otherwise prevent this operation. */ - if (_copy_to_user(buffer, (char *)start, tsz)) { + if (_copy_to_iter((char *)start, tsz, iter) != tsz) { ret = -EFAULT; goto out; } break; default: pr_warn_once("Unhandled KCORE type: %d\n", m->type); - if (clear_user(buffer, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -559,7 +560,6 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) skip: buflen -= tsz; *fpos += tsz; - buffer += tsz; start += tsz; tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen); } @@ -603,7 +603,7 @@ static int release_kcore(struct inode *inode, struct file *file) } static const struct proc_ops kcore_proc_ops = { - .proc_read = read_kcore, + .proc_read_iter = read_kcore_iter, .proc_open = open_kcore, .proc_release = release_kcore, .proc_lseek = default_llseek, -- cgit From 4c91c07c93bbbdd7f2d9de2beb7ee5c2a48ad8e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 22 Mar 2023 18:57:04 +0000 Subject: mm: vmalloc: convert vread() to vread_iter() Having previously laid the foundation for converting vread() to an iterator function, pull the trigger and do so. This patch attempts to provide minimal refactoring and to reflect the existing logic as best we can, for example we continue to zero portions of memory not read, as before. Overall, there should be no functional difference other than a performance improvement in /proc/kcore access to vmalloc regions. Now we have eliminated the need for a bounce buffer in read_kcore_iter(), we dispense with it, and try to write to user memory optimistically but with faults disabled via copy_page_to_iter_nofault(). We already have preemption disabled by holding a spin lock. We continue faulting in until the operation is complete. Additionally, we must account for the fact that at any point a copy may fail (most likely due to a fault not being able to occur), we exit indicating fewer bytes retrieved than expected. [sfr@canb.auug.org.au: fix sparc64 warning] Link: https://lkml.kernel.org/r/20230320144721.663280c3@canb.auug.org.au [lstoakes@gmail.com: redo Stephen's sparc build fix] Link: https://lkml.kernel.org/r/8506cbc667c39205e65a323f750ff9c11a463798.1679566220.git.lstoakes@gmail.com [akpm@linux-foundation.org: unbreak uio.h includes] Link: https://lkml.kernel.org/r/941f88bc5ab928e6656e1e2593b91bf0f8c81e1b.1679511146.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Signed-off-by: Stephen Rothwell Reviewed-by: Baoquan He Cc: Alexander Viro Cc: David Hildenbrand Cc: Jens Axboe Cc: Jiri Olsa Cc: Liu Shixin Cc: Matthew Wilcox (Oracle) Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 08b795fd80b4..25b44b303b35 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -307,13 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name, *i = ALIGN(*i + descsz, 4); } -static ssize_t -read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) +static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct file *file = iocb->ki_filp; - char *buf = file->private_data; loff_t *fpos = &iocb->ki_pos; - size_t phdrs_offset, notes_offset, data_offset; size_t page_offline_frozen = 1; size_t phdrs_len, notes_len; @@ -507,13 +503,30 @@ read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) switch (m->type) { case KCORE_VMALLOC: - vread(buf, (char *)start, tsz); - /* we have to zero-fill user buffer even if no read */ - if (copy_to_iter(buf, tsz, iter) != tsz) { - ret = -EFAULT; - goto out; + { + const char *src = (char *)start; + size_t read = 0, left = tsz; + + /* + * vmalloc uses spinlocks, so we optimistically try to + * read memory. If this fails, fault pages in and try + * again until we are done. + */ + while (true) { + read += vread_iter(iter, src, left); + if (read == tsz) + break; + + src += read; + left -= read; + + if (fault_in_iov_iter_writeable(iter, left)) { + ret = -EFAULT; + goto out; + } } break; + } case KCORE_USER: /* User page is handled prior to normal kernel page: */ if (copy_to_iter((char *)start, tsz, iter) != tsz) { @@ -582,10 +595,6 @@ static int open_kcore(struct inode *inode, struct file *filp) if (ret) return ret; - filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { @@ -596,16 +605,9 @@ static int open_kcore(struct inode *inode, struct file *filp) return 0; } -static int release_kcore(struct inode *inode, struct file *file) -{ - kfree(file->private_data); - return 0; -} - static const struct proc_ops kcore_proc_ops = { .proc_read_iter = read_kcore_iter, .proc_open = open_kcore, - .proc_release = release_kcore, .proc_lseek = default_llseek, }; -- cgit From 945ea457b5a57cd34cbacce6f1f9e4a2e88432c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Mar 2023 18:45:13 +0100 Subject: xfs: remove xfs_filemap_map_pages() wrapper Patch series "Prevent ->map_pages from sleeping", v2. In preparation for a larger patch series which will handle (some, easy) page faults protected only by RCU, change the two filesystems which have sleeping locks to not take them and hold the RCU lock around calls to ->map_page to prevent other filesystems from adding sleeping locks. This patch (of 3): XFS doesn't actually need to be holding the XFS_MMAPLOCK_SHARED to do this. filemap_map_pages() cannot bring new folios into the page cache and the folio lock is taken during filemap_map_pages() which provides sufficient protection against a truncation or hole punch. Link: https://lkml.kernel.org/r/20230327174515.1811532-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230327174515.1811532-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Dave Chinner Cc: Darrick J. Wong Cc: David Howells Signed-off-by: Andrew Morton --- fs/xfs/xfs_file.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 705250f9f90a..528fc538b6b9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1388,25 +1388,10 @@ xfs_filemap_pfn_mkwrite( return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } -static vm_fault_t -xfs_filemap_map_pages( - struct vm_fault *vmf, - pgoff_t start_pgoff, - pgoff_t end_pgoff) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - vm_fault_t ret; - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = filemap_map_pages(vmf, start_pgoff, end_pgoff); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - return ret; -} - static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, .huge_fault = xfs_filemap_huge_fault, - .map_pages = xfs_filemap_map_pages, + .map_pages = filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, .pfn_mkwrite = xfs_filemap_pfn_mkwrite, }; -- cgit From 0050d7f5ee532f92e8ab1efcec6547bfac527973 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Mar 2023 18:45:14 +0100 Subject: afs: split afs_pagecache_valid() out of afs_validate() For the map_pages() method, we need a test that does not sleep. The page fault handler will continue to call the fault() method where we can sleep and do the full revalidation there. Link: https://lkml.kernel.org/r/20230327174515.1811532-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Howells Tested-by: David Howells Cc: Darrick J. Wong Cc: Dave Chinner Signed-off-by: Andrew Morton --- fs/afs/file.c | 14 ++------------ fs/afs/inode.c | 27 +++++++++++++++++++-------- fs/afs/internal.h | 1 + 3 files changed, 22 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index 68d6d5dc608d..719b31374879 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -569,20 +569,10 @@ static void afs_vm_close(struct vm_area_struct *vma) static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file)); - struct afs_file *af = vmf->vma->vm_file->private_data; - switch (afs_validate(vnode, af->key)) { - case 0: + if (afs_pagecache_valid(vnode)) return filemap_map_pages(vmf, start_pgoff, end_pgoff); - case -ENOMEM: - return VM_FAULT_OOM; - case -EINTR: - case -ERESTARTSYS: - return VM_FAULT_RETRY; - case -ESTALE: - default: - return VM_FAULT_SIGBUS; - } + return 0; } static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0167e96e5198..b1bdffd5e888 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -667,6 +667,24 @@ bool afs_check_validity(struct afs_vnode *vnode) return false; } +/* + * Returns true if the pagecache is still valid. Does not sleep. + */ +bool afs_pagecache_valid(struct afs_vnode *vnode) +{ + if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { + if (vnode->netfs.inode.i_nlink) + clear_nlink(&vnode->netfs.inode); + return true; + } + + if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && + afs_check_validity(vnode)) + return true; + + return false; +} + /* * validate a vnode/inode * - there are several things we need to check @@ -684,14 +702,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) vnode->fid.vid, vnode->fid.vnode, vnode->flags, key_serial(key)); - if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { - if (vnode->netfs.inode.i_nlink) - clear_nlink(&vnode->netfs.inode); - goto valid; - } - - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && - afs_check_validity(vnode)) + if (afs_pagecache_valid(vnode)) goto valid; down_write(&vnode->validate_lock); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index ad8523d0d038..5c95df6621f9 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1171,6 +1171,7 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern bool afs_check_validity(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); +bool afs_pagecache_valid(struct afs_vnode *); extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); -- cgit From b4aca54792e76556bb9f8661bab07b4bf2eb4e5f Mon Sep 17 00:00:00 2001 From: Steven Price Date: Wed, 5 Apr 2023 11:38:19 +0100 Subject: smaps: fix defined but not used smaps_shmem_walk_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When !CONFIG_SHMEM smaps_shmem_walk_ops is defined but not used, triggering a compiler warning. To avoid the warning remove the #ifdef around the usage. This has no effect because shmem_mapping() is a stub returning false when !CONFIG_SHMEM so the code will be compiled out, however we now need to also provide a stub for shmem_swap_usage(). Link: https://lkml.kernel.org/r/20230405103819.151246-1-steven.price@arm.com Fixes: 7b86ac3371b7 ("pagewalk: separate function pointers from iterator data") Signed-off-by: Steven Price Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202304031749.UiyJpxzF-lkp@intel.com/ Reviewed-by: Jason Gunthorpe Cc: Christoph Hellwig Cc: Alexey Dobriyan Cc: Thomas Hellström Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6a96e1713fd5..cb49479acd2e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -782,7 +782,6 @@ static void smap_gather_stats(struct vm_area_struct *vma, if (start >= vma->vm_end) return; -#ifdef CONFIG_SHMEM if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -803,7 +802,7 @@ static void smap_gather_stats(struct vm_area_struct *vma, ops = &smaps_shmem_walk_ops; } } -#endif + /* mmap_lock is held in m_start */ if (!start) walk_page_vma(vma, ops, mss); -- cgit From cd01049d9ca3788a9d1537b64aec26edc96ad0bd Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Tue, 11 Apr 2023 14:29:18 +0200 Subject: orangefs: use folios in orangefs_readahead Patch series "remove page_endio()", v3. It was decided to remove the page_endio() as per the previous RFC discussion[1] of this series and move that functionality into the caller itself. One of the side benefit of doing that is the callers have been modified to directly work on folios as page_endio() already worked on folios. As Christoph is doing ZRAM cleanups[4] which will get rid of page_endio() function usage, I removed the final patch that removes page_endio()[5]. I will send it separately after rc-1 once the zram cleanups are merged. mpage changes were tested with a simple boot testing and running a fio workload on ext2 filesystem. orangefs was tested by Mike Marshall (No code changes since he tested). This patch (of 3): Convert orangefs_readahead() from using struct page to struct folio. This conversion removes the call to page_endio() which is soon to be removed, and simplifies the final page handling. The page error flags is not required to be set in the error case as orangefs doesn't depend on them. Link: https://lkml.kernel.org/r/20230411122920.30134-1-p.raghav@samsung.com Link: https://lkml.kernel.org/r/20230411122920.30134-2-p.raghav@samsung.com Link: https://lore.kernel.org/linux-mm/ZBHcl8Pz2ULb4RGD@infradead.org/ [1] Link: https://lore.kernel.org/linux-mm/20230322135013.197076-1-p.raghav@samsung.com/ [2] Link: https://lore.kernel.org/linux-mm/8adb0770-6124-e11f-2551-6582db27ed32@samsung.com/ [3] Link: https://lore.kernel.org/linux-block/20230404150536.2142108-1-hch@lst.de/T/#t [4] Link: https://lore.kernel.org/lkml/20230403132221.94921-6-p.raghav@samsung.com/ [5] Signed-off-by: Pankaj Raghav Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Tested-by: Mike Marshall Cc: Alexander Viro Cc: Christian Brauner Cc: Jens Axboe Cc: Luis Chamberlain Cc: Martin Brandenburg Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- fs/orangefs/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index aefdf1d3be7c..9014bbcc8031 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -244,7 +244,7 @@ static void orangefs_readahead(struct readahead_control *rac) struct iov_iter iter; struct inode *inode = rac->mapping->host; struct xarray *i_pages; - struct page *page; + struct folio *folio; loff_t new_start = readahead_pos(rac); int ret; size_t new_len = 0; @@ -275,9 +275,10 @@ static void orangefs_readahead(struct readahead_control *rac) ret = 0; /* clean up. */ - while ((page = readahead_page(rac))) { - page_endio(page, false, ret); - put_page(page); + while ((folio = readahead_folio(rac))) { + if (!ret) + folio_mark_uptodate(folio); + folio_unlock(folio); } } -- cgit From f0d6ca46d68670d04a43116fe07309643bac596e Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Tue, 11 Apr 2023 14:29:19 +0200 Subject: mpage: split submit_bio and bio end_io handler for reads and writes Split the submit_bio() and bio end_io handler for reads and writes similar to other aops. This is a prep patch before we convert end_io handlers to use folios. Link: https://lkml.kernel.org/r/20230411122920.30134-3-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Cc: Alexander Viro Cc: Christian Brauner Cc: Jens Axboe Cc: Luis Chamberlain Cc: Martin Brandenburg Cc: Matthew Wilcox (Oracle) Cc: Mike Marshall Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- fs/mpage.c | 54 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/mpage.c b/fs/mpage.c index 22b9de5ddd68..d9540c1b7427 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -43,23 +43,41 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ -static void mpage_end_io(struct bio *bio) +static void mpage_read_end_io(struct bio *bio) { struct bio_vec *bv; struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - page_endio(page, bio_op(bio), + bio_for_each_segment_all(bv, bio, iter_all) + page_endio(bv->bv_page, REQ_OP_READ, blk_status_to_errno(bio->bi_status)); - } bio_put(bio); } -static struct bio *mpage_bio_submit(struct bio *bio) +static void mpage_write_end_io(struct bio *bio) +{ + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bv, bio, iter_all) + page_endio(bv->bv_page, REQ_OP_WRITE, + blk_status_to_errno(bio->bi_status)); + + bio_put(bio); +} + +static struct bio *mpage_bio_submit_read(struct bio *bio) +{ + bio->bi_end_io = mpage_read_end_io; + guard_bio_eod(bio); + submit_bio(bio); + return NULL; +} + +static struct bio *mpage_bio_submit_write(struct bio *bio) { - bio->bi_end_io = mpage_end_io; + bio->bi_end_io = mpage_write_end_io; guard_bio_eod(bio); submit_bio(bio); return NULL; @@ -265,7 +283,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) * This folio will go to BIO. Do we need to send this BIO off first? */ if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); alloc_new: if (args->bio == NULL) { @@ -278,7 +296,7 @@ alloc_new: length = first_hole << blkbits; if (!bio_add_folio(args->bio, folio, length, 0)) { - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); goto alloc_new; } @@ -286,7 +304,7 @@ alloc_new: nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); else args->last_block_in_bio = blocks[blocks_per_page - 1]; out: @@ -294,7 +312,7 @@ out: confused: if (args->bio) - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); if (!folio_test_uptodate(folio)) block_read_full_folio(folio, args->get_block); else @@ -356,7 +374,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) args.bio = do_mpage_readpage(&args); } if (args.bio) - mpage_bio_submit(args.bio); + mpage_bio_submit_read(args.bio); } EXPORT_SYMBOL(mpage_readahead); @@ -373,7 +391,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block) args.bio = do_mpage_readpage(&args); if (args.bio) - mpage_bio_submit(args.bio); + mpage_bio_submit_read(args.bio); return 0; } EXPORT_SYMBOL(mpage_read_folio); @@ -577,7 +595,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); alloc_new: if (bio == NULL) { @@ -596,7 +614,7 @@ alloc_new: wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); goto alloc_new; } @@ -606,7 +624,7 @@ alloc_new: folio_start_writeback(folio); folio_unlock(folio); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -618,7 +636,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); /* * The caller has a ref on the inode, so *mapping is stable @@ -652,7 +670,7 @@ mpage_writepages(struct address_space *mapping, blk_start_plug(&plug); ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) - mpage_bio_submit(mpd.bio); + mpage_bio_submit_write(mpd.bio); blk_finish_plug(&plug); return ret; } -- cgit From 09a607c9cd23d9521e7be3ab5eb94f217d7a37f5 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Tue, 11 Apr 2023 14:29:20 +0200 Subject: mpage: use folios in bio end_io handler Use folios in the bio end_io handler. This conversion does the appropriate handling on the folios in the respective end_io callback and removes the call to page_endio(), which is soon to be removed. Link: https://lkml.kernel.org/r/20230411122920.30134-4-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Reviewed-by: Christoph Hellwig Cc: Alexander Viro Cc: Christian Brauner Cc: Jens Axboe Cc: Luis Chamberlain Cc: Martin Brandenburg Cc: Matthew Wilcox (Oracle) Cc: Mike Marshall Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- fs/mpage.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/mpage.c b/fs/mpage.c index d9540c1b7427..242e213ee064 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -45,24 +45,32 @@ */ static void mpage_read_end_io(struct bio *bio) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bv, bio, iter_all) - page_endio(bv->bv_page, REQ_OP_READ, - blk_status_to_errno(bio->bi_status)); + struct folio_iter fi; + int err = blk_status_to_errno(bio->bi_status); + + bio_for_each_folio_all(fi, bio) { + if (err) + folio_set_error(fi.folio); + else + folio_mark_uptodate(fi.folio); + folio_unlock(fi.folio); + } bio_put(bio); } static void mpage_write_end_io(struct bio *bio) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; + int err = blk_status_to_errno(bio->bi_status); - bio_for_each_segment_all(bv, bio, iter_all) - page_endio(bv->bv_page, REQ_OP_WRITE, - blk_status_to_errno(bio->bi_status)); + bio_for_each_folio_all(fi, bio) { + if (err) { + folio_set_error(fi.folio); + mapping_set_error(fi.folio->mapping, err); + } + folio_end_writeback(fi.folio); + } bio_put(bio); } -- cgit From 0b376f1e0ff555435597fa16823ae0f30b2883e3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 12 Apr 2023 10:30:25 +0530 Subject: mm/hugetlb_vmemmap: rename ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP Now we use ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP config option to indicate devdax and hugetlb vmemmap optimization support. Hence rename that to a generic ARCH_WANT_OPTIMIZE_VMEMMAP Link: https://lkml.kernel.org/r/20230412050025.84346-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Muchun Song Cc: Joao Martins Cc: Dan Williams Cc: Mike Kravetz Cc: Tarun Sahu Signed-off-by: Andrew Morton --- fs/Kconfig | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index e99830c65033..cc07a0cd3172 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -250,16 +250,9 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS -# -# Select this config option from the architecture Kconfig, if it is preferred -# to enable the feature of HugeTLB Vmemmap Optimization (HVO). -# -config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP - bool - config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + depends on ARCH_WANT_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON -- cgit From c7b23b68e2aa93f86a206222d23ccd9a21f5982a Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 13 Apr 2023 10:40:34 +0000 Subject: mm: vmscan: refactor updating current->reclaim_state During reclaim, we keep track of pages reclaimed from other means than LRU-based reclaim through scan_control->reclaim_state->reclaimed_slab, which we stash a pointer to in current task_struct. However, we keep track of more than just reclaimed slab pages through this. We also use it for clean file pages dropped through pruned inodes, and xfs buffer pages freed. Rename reclaimed_slab to reclaimed, and add a helper function that wraps updating it through current, so that future changes to this logic are contained within include/linux/swap.h. Link: https://lkml.kernel.org/r/20230413104034.1086717-4-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Michal Hocko Cc: Alexander Viro Cc: Christoph Lameter Cc: Darrick J. Wong Cc: Dave Chinner Cc: David Hildenbrand Cc: David Rientjes Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Miaohe Lin Cc: NeilBrown Cc: Peter Xu Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Chen Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- fs/inode.c | 3 +-- fs/xfs/xfs_buf.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 4558dc2f1355..e60fcc41faf1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += reap; + mm_account_reclaimed_pages(reap); } iput(inode); spin_lock(lru_lock); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 54c774af6e1c..15d1e5a7c2d3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -286,8 +286,7 @@ xfs_buf_free_pages( if (bp->b_pages[i]) __free_page(bp->b_pages[i]); } - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += bp->b_page_count; + mm_account_reclaimed_pages(bp->b_page_count); if (bp->b_pages != bp->b_page_array) kmem_free(bp->b_pages); -- cgit From 465e5e6a1698f3325f5d2262d2875779b06b50c7 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 17 Apr 2023 14:36:15 +0200 Subject: fs/buffer: add folio_set_bh helper Patch series "convert create_page_buffers to folio_create_buffers". One of the first kernel panic we hit when we try to increase the block size > 4k is inside create_page_buffers()[1]. Even though buffer.c function do not support large folios (folios > PAGE_SIZE) at the moment, these changes are required when we want to remove that constraint. This patch (of 4): The folio version of set_bh_page(). This is required to convert create_page_buffers() to folio_create_buffers() later in the series. Link: https://lkml.kernel.org/r/20230417123618.22094-1-p.raghav@samsung.com Link: https://lkml.kernel.org/r/20230417123618.22094-2-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Hannes Reinecke Cc: Alexander Viro Cc: Christian Brauner Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- fs/buffer.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 9e1e2add541e..009d950fbf42 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1485,6 +1485,21 @@ void set_bh_page(struct buffer_head *bh, } EXPORT_SYMBOL(set_bh_page); +void folio_set_bh(struct buffer_head *bh, struct folio *folio, + unsigned long offset) +{ + bh->b_folio = folio; + BUG_ON(offset >= folio_size(folio)); + if (folio_test_highmem(folio)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = folio_address(folio) + offset; +} +EXPORT_SYMBOL(folio_set_bh); + /* * Called when truncating a buffer on a page completely. */ -- cgit From c71124a8afa441af203d2001a92c91f114446687 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 17 Apr 2023 14:36:16 +0200 Subject: buffer: add folio_alloc_buffers() helper Folio version of alloc_page_buffers() helper. This is required to convert create_page_buffers() to folio_create_buffers() later in the series. alloc_page_buffers() has been modified to call folio_alloc_buffers() which adds one call to compound_head() but folio_alloc_buffers() removes one call to compound_head() compared to the existing alloc_page_buffers() implementation. Link: https://lkml.kernel.org/r/20230417123618.22094-3-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Hannes Reinecke Cc: Alexander Viro Cc: Christian Brauner Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- fs/buffer.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 009d950fbf42..bbd4aa670262 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -843,7 +843,7 @@ int remove_inode_buffers(struct inode *inode) } /* - * Create the appropriate buffers when given a page for data area and + * Create the appropriate buffers when given a folio for data area and * the size of each buffer.. Use the bh->b_this_page linked list to * follow the buffers created. Return NULL if unable to create more * buffers. @@ -851,8 +851,8 @@ int remove_inode_buffers(struct inode *inode) * The retry flag is used to differentiate async IO (paging, swapping) * which may not fail from ordinary buffer allocations. */ -struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - bool retry) +struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, + bool retry) { struct buffer_head *bh, *head; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; @@ -862,12 +862,12 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, if (retry) gfp |= __GFP_NOFAIL; - /* The page lock pins the memcg */ - memcg = page_memcg(page); + /* The folio lock pins the memcg */ + memcg = folio_memcg(folio); old_memcg = set_active_memcg(memcg); head = NULL; - offset = PAGE_SIZE; + offset = folio_size(folio); while ((offset -= size) >= 0) { bh = alloc_buffer_head(gfp); if (!bh) @@ -879,8 +879,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bh->b_size = size; - /* Link the buffer to its page */ - set_bh_page(bh, page, offset); + /* Link the buffer to its folio */ + folio_set_bh(bh, folio, offset); } out: set_active_memcg(old_memcg); @@ -899,6 +899,13 @@ no_grow: goto out; } +EXPORT_SYMBOL_GPL(folio_alloc_buffers); + +struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, + bool retry) +{ + return folio_alloc_buffers(page_folio(page), size, retry); +} EXPORT_SYMBOL_GPL(alloc_page_buffers); static inline void -- cgit From 8e2e17560bed73c2a73c94cbe378719f6cf850ee Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 17 Apr 2023 14:36:17 +0200 Subject: fs/buffer: add folio_create_empty_buffers helper Folio version of create_empty_buffers(). This is required to convert create_page_buffers() to folio_create_buffers() later in the series. It removes several calls to compound_head() as it works directly on folio compared to create_empty_buffers(). Hence, create_empty_buffers() has been modified to call folio_create_empty_buffers(). Link: https://lkml.kernel.org/r/20230417123618.22094-4-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Hannes Reinecke Cc: Alexander Viro Cc: Christian Brauner Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- fs/buffer.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index bbd4aa670262..e6266bf7eeea 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1594,18 +1594,17 @@ out: } EXPORT_SYMBOL(block_invalidate_folio); - /* * We attach and possibly dirty the buffers atomically wrt * block_dirty_folio() via private_lock. try_to_free_buffers - * is already excluded via the page lock. + * is already excluded via the folio lock. */ -void create_empty_buffers(struct page *page, - unsigned long blocksize, unsigned long b_state) +void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, + unsigned long b_state) { struct buffer_head *bh, *head, *tail; - head = alloc_page_buffers(page, blocksize, true); + head = folio_alloc_buffers(folio, blocksize, true); bh = head; do { bh->b_state |= b_state; @@ -1614,19 +1613,26 @@ void create_empty_buffers(struct page *page, } while (bh); tail->b_this_page = head; - spin_lock(&page->mapping->private_lock); - if (PageUptodate(page) || PageDirty(page)) { + spin_lock(&folio->mapping->private_lock); + if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { bh = head; do { - if (PageDirty(page)) + if (folio_test_dirty(folio)) set_buffer_dirty(bh); - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); bh = bh->b_this_page; } while (bh != head); } - attach_page_private(page, head); - spin_unlock(&page->mapping->private_lock); + folio_attach_private(folio, head); + spin_unlock(&folio->mapping->private_lock); +} +EXPORT_SYMBOL(folio_create_empty_buffers); + +void create_empty_buffers(struct page *page, + unsigned long blocksize, unsigned long b_state) +{ + folio_create_empty_buffers(page_folio(page), blocksize, b_state); } EXPORT_SYMBOL(create_empty_buffers); -- cgit From c6c8c3e7b47d7d20952202e7568389a5e3b043dd Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 17 Apr 2023 14:36:18 +0200 Subject: fs/buffer: convert create_page_buffers to folio_create_buffers fs/buffer do not support large folios as there are many assumptions on the folio size to be the host page size. This conversion is one step towards removing that assumption. Also this conversion will reduce calls to compound_head() if folio_create_buffers() calls folio_create_empty_buffers(). Link: https://lkml.kernel.org/r/20230417123618.22094-5-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Hannes Reinecke Cc: Alexander Viro Cc: Christian Brauner Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- fs/buffer.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index e6266bf7eeea..b0b8c5fba691 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1723,14 +1723,17 @@ static inline int block_size_bits(unsigned int blocksize) return ilog2(blocksize); } -static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state) +static struct buffer_head *folio_create_buffers(struct folio *folio, + struct inode *inode, + unsigned int b_state) { - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits), - b_state); - return page_buffers(page); + if (!folio_buffers(folio)) + folio_create_empty_buffers(folio, + 1 << READ_ONCE(inode->i_blkbits), + b_state); + return folio_buffers(folio); } /* @@ -1774,8 +1777,8 @@ int __block_write_full_page(struct inode *inode, struct page *page, int nr_underway = 0; blk_opf_t write_flags = wbc_to_write_flags(wbc); - head = create_page_buffers(page, inode, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + head = folio_create_buffers(page_folio(page), inode, + (1 << BH_Dirty) | (1 << BH_Uptodate)); /* * Be very careful. We have no exclusion from block_dirty_folio @@ -2038,7 +2041,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); - head = create_page_buffers(&folio->page, inode, 0); + head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; bbits = block_size_bits(blocksize); @@ -2324,7 +2327,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - head = create_page_buffers(&folio->page, inode, 0); + head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; bbits = block_size_bits(blocksize); -- cgit From d21077fbc2fc987c2e593c34dc3b4d84e546dc9f Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 17 Apr 2023 22:13:41 -0700 Subject: mm: add new KSM process and sysfs knobs This adds the general_profit KSM sysfs knob and the process profit metric knobs to ksm_stat. 1) expose general_profit metric The documentation mentions a general profit metric, however this metric is not calculated. In addition the formula depends on the size of internal structures, which makes it more difficult for an administrator to make the calculation. Adding the metric for a better user experience. 2) document general_profit sysfs knob 3) calculate ksm process profit metric The ksm documentation mentions the process profit metric and how to calculate it. This adds the calculation of the metric. 4) mm: expose ksm process profit metric in ksm_stat This exposes the ksm process profit metric in /proc//ksm_stat. The documentation mentions the formula for the ksm process profit metric, however it does not calculate it. In addition the formula depends on the size of internal structures. So it makes sense to expose it. 5) document new procfs ksm knobs Link: https://lkml.kernel.org/r/20230418051342.1919757-3-shr@devkernel.io Signed-off-by: Stefan Roesch Reviewed-by: Bagas Sanjaya Acked-by: David Hildenbrand Cc: David Hildenbrand Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton --- fs/proc/base.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 5e0e0ccd47aa..96a6a08c8235 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -3207,6 +3208,8 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); + seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); } -- cgit From 6b008640db7355d8de6ac18f74cedd7ccc92684f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Apr 2023 17:40:09 -0400 Subject: mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area() Instead of having callers care about the mmap_min_addr logic for the lowest valid mapping address (and some of them getting it wrong), just move the logic into vm_unmapped_area() itself. One less thing for various architecture cases (and generic helpers) to worry about. We should really try to make much more of this be common code, but baby steps.. Without this, vm_unmapped_area() could return an address below mmap_min_addr (because some caller forgot about that). That then causes the mmap machinery to think it has found a workable address, but then later security_mmap_addr(addr) is unhappy about it and the mmap() returns with a nonsensical error (EPERM). The proper action is to either return ENOMEM (if the virtual address space is exhausted), or try to find another address (ie do a bottom-up search for free addresses after the top-down one failed). See commit 2afc745f3e30 ("mm: ensure get_unmapped_area() returns higher address than mmap_min_addr"), which fixed this for one call site (the generic arch_get_unmapped_area_topdown() fallback) but left other cases alone. Link: https://lkml.kernel.org/r/20230418214009.1142926-1-Liam.Howlett@oracle.com Signed-off-by: Linus Torvalds Signed-off-by: Liam R. Howlett Cc: Russell King Cc: Liam Howlett Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 702d79639c0d..ecfdfb2529a3 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -208,7 +208,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; -- cgit