From ae3fab5bcc725271a50843e5e284ee20d8b3532b Mon Sep 17 00:00:00 2001 From: Chenyuan Mi Date: Fri, 5 Nov 2021 13:34:45 -0700 Subject: ocfs2: fix handle refcount leak in two exception handling paths The reference counting issue happens in two exception handling paths of ocfs2_replay_truncate_records(). When executing these two exception handling paths, the function forgets to decrease the refcount of handle increased by ocfs2_start_trans(), causing a refcount leak. Fix this issue by using ocfs2_commit_trans() to decrease the refcount of handle in two handling paths. Link: https://lkml.kernel.org/r/20210908102055.10168-1-cymi20@fudan.edu.cn Signed-off-by: Chenyuan Mi Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Reviewed-by: Joseph Qi Cc: Wengang Wang Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 5d9ae17bd443..1550f18be451 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5940,6 +5940,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { + ocfs2_commit_trans(osb, handle); mlog_errno(status); goto bail; } @@ -5964,6 +5965,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, data_alloc_bh, start_blk, num_clusters); if (status < 0) { + ocfs2_commit_trans(osb, handle); mlog_errno(status); goto bail; } -- cgit From da5e7c87827e8caa6a1eeec6d95dcf74ab592a01 Mon Sep 17 00:00:00 2001 From: Valentin Vidic Date: Fri, 5 Nov 2021 13:34:49 -0700 Subject: ocfs2: cleanup journal init and shutdown Allocate and free struct ocfs2_journal in ocfs2_journal_init and ocfs2_journal_shutdown. Init and release of system inodes references the journal so reorder calls to make sure they work correctly. Link: https://lkml.kernel.org/r/20211009145006.3478-1-vvidic@valentin-vidic.from.hr Signed-off-by: Valentin Vidic Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 4 ++-- fs/ocfs2/journal.c | 26 +++++++++++++++++++++----- fs/ocfs2/journal.h | 3 +-- fs/ocfs2/super.c | 40 +++------------------------------------- 4 files changed, 27 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index bc8f32fab964..6c2411c2afcf 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -125,7 +125,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; - journal_t *journal = OCFS2_SB(sb)->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -172,10 +171,11 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ - if (journal) { + if (osb->journal) { transaction_t *transaction; tid_t tid; struct ocfs2_inode_info *oi = OCFS2_I(inode); + journal_t *journal = osb->journal->j_journal; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4f15750aac5d..b9c339335a53 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -810,19 +810,34 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) write_unlock(&journal->j_state_lock); } -int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) { int status = -1; struct inode *inode = NULL; /* the journal inode */ journal_t *j_journal = NULL; + struct ocfs2_journal *journal = NULL; struct ocfs2_dinode *di = NULL; struct buffer_head *bh = NULL; - struct ocfs2_super *osb; int inode_lock = 0; - BUG_ON(!journal); + /* initialize our journal structure */ + journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); + if (!journal) { + mlog(ML_ERROR, "unable to alloc journal\n"); + status = -ENOMEM; + goto done; + } + osb->journal = journal; + journal->j_osb = osb; - osb = journal->j_osb; + atomic_set(&journal->j_num_trans, 0); + init_rwsem(&journal->j_trans_barrier); + init_waitqueue_head(&journal->j_checkpointed); + spin_lock_init(&journal->j_lock); + journal->j_trans_id = 1UL; + INIT_LIST_HEAD(&journal->j_la_cleanups); + INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); + journal->j_state = OCFS2_JOURNAL_FREE; /* already have the inode for our journal */ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, @@ -1028,9 +1043,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) journal->j_state = OCFS2_JOURNAL_FREE; -// up_write(&journal->j_trans_barrier); done: iput(inode); + kfree(journal); + osb->journal = NULL; } static void ocfs2_clear_journal_error(struct super_block *sb, diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index d158acb8b38a..8dcb2f2cadbc 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -167,8 +167,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. */ void ocfs2_set_journal_params(struct ocfs2_super *osb); -int ocfs2_journal_init(struct ocfs2_journal *journal, - int *dirty); +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty); void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 5c914ce9b3ac..1286b88b6fa1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1894,8 +1894,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); - ocfs2_journal_shutdown(osb); - ocfs2_sync_blockdev(sb); ocfs2_purge_refcount_trees(osb); @@ -1918,6 +1916,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_release_system_inodes(osb); + ocfs2_journal_shutdown(osb); + /* * If we're dismounting due to mount error, mount.ocfs2 will clean * up heartbeat. If we're a local mount, there is no heartbeat. @@ -2016,7 +2016,6 @@ static int ocfs2_initialize_super(struct super_block *sb, int i, cbits, bbits; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; - struct ocfs2_journal *journal; struct ocfs2_super *osb; u64 total_blocks; @@ -2197,33 +2196,6 @@ static int ocfs2_initialize_super(struct super_block *sb, get_random_bytes(&osb->s_next_generation, sizeof(u32)); - /* FIXME - * This should be done in ocfs2_journal_init(), but unknown - * ordering issues will cause the filesystem to crash. - * If anyone wants to figure out what part of the code - * refers to osb->journal before ocfs2_journal_init() is run, - * be my guest. - */ - /* initialize our journal structure */ - - journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); - if (!journal) { - mlog(ML_ERROR, "unable to alloc journal\n"); - status = -ENOMEM; - goto bail; - } - osb->journal = journal; - journal->j_osb = osb; - - atomic_set(&journal->j_num_trans, 0); - init_rwsem(&journal->j_trans_barrier); - init_waitqueue_head(&journal->j_checkpointed); - spin_lock_init(&journal->j_lock); - journal->j_trans_id = (unsigned long) 1; - INIT_LIST_HEAD(&journal->j_la_cleanups); - INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); - journal->j_state = OCFS2_JOURNAL_FREE; - INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); @@ -2404,7 +2376,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) * ourselves. */ /* Init our journal object. */ - status = ocfs2_journal_init(osb->journal, &dirty); + status = ocfs2_journal_init(osb, &dirty); if (status < 0) { mlog(ML_ERROR, "Could not initialize journal!\n"); goto finally; @@ -2513,12 +2485,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) kfree(osb->osb_orphan_wipes); kfree(osb->slot_recovery_generations); - /* FIXME - * This belongs in journal shutdown, but because we have to - * allocate osb->journal at the start of ocfs2_initialize_osb(), - * we free it here. - */ - kfree(osb->journal); kfree(osb->local_alloc_copy); kfree(osb->uuid_str); kfree(osb->vol_label); -- cgit From 848be75d154daf3d2a69a12f97bbe10248e75bf5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 5 Nov 2021 13:34:52 -0700 Subject: ocfs2/dlm: remove redundant assignment of variable ret The variable ret is being assigned a value that is never read, it is updated later on with a different value. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Link: https://lkml.kernel.org/r/20211007233452.30815-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 0e7aad1b11cc..5cd5f7511dac 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2698,7 +2698,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) continue; } retry: - ret = -EINVAL; mlog(0, "attempting to send begin reco msg to %d\n", nodenum); ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, -- cgit From 839b63860eb3835da165642923120d305925561d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Nov 2021 13:34:55 -0700 Subject: ocfs2: fix data corruption on truncate Patch series "ocfs2: Truncate data corruption fix". As further testing has shown, commit 5314454ea3f ("ocfs2: fix data corruption after conversion from inline format") didn't fix all the data corruption issues the customer started observing after 6dbf7bb55598 ("fs: Don't invalidate page buffers in block_write_full_page()") This time I have tracked them down to two bugs in ocfs2 truncation code. One bug (truncating page cache before clearing tail cluster and setting i_size) could cause data corruption even before 6dbf7bb55598, but before that commit it needed a race with page fault, after 6dbf7bb55598 it started to be pretty deterministic. Another bug (zeroing pages beyond old i_size) used to be harmless inefficiency before commit 6dbf7bb55598. But after commit 6dbf7bb55598 in combination with the first bug it resulted in deterministic data corruption. Although fixing only the first problem is needed to stop data corruption, I've fixed both issues to make the code more robust. This patch (of 2): ocfs2_truncate_file() did unmap invalidate page cache pages before zeroing partial tail cluster and setting i_size. Thus some pages could be left (and likely have left if the cluster zeroing happened) in the page cache beyond i_size after truncate finished letting user possibly see stale data once the file was extended again. Also the tail cluster zeroing was not guaranteed to finish before truncate finished causing possible stale data exposure. The problem started to be particularly easy to hit after commit 6dbf7bb55598 "fs: Don't invalidate page buffers in block_write_full_page()" stopped invalidation of pages beyond i_size from page writeback path. Fix these problems by unmapping and invalidating pages in the page cache after the i_size is reduced and tail cluster is zeroed out. Link: https://lkml.kernel.org/r/20211025150008.29002-1-jack@suse.cz Link: https://lkml.kernel.org/r/20211025151332.11301-1-jack@suse.cz Fixes: ccd979bdbce9 ("[PATCH] OCFS2: The Second Oracle Cluster Filesystem") Signed-off-by: Jan Kara Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 54d7843c0211..fc5f780fa235 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -476,10 +476,11 @@ int ocfs2_truncate_file(struct inode *inode, * greater than page size, so we have to truncate them * anyway. */ - unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(inode->i_mapping, new_i_size); if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + unmap_mapping_range(inode->i_mapping, + new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); status = ocfs2_truncate_inline(inode, di_bh, new_i_size, i_size_read(inode), 1); if (status) @@ -498,6 +499,9 @@ int ocfs2_truncate_file(struct inode *inode, goto bail_unlock_sem; } + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + status = ocfs2_commit_truncate(osb, inode, di_bh); if (status < 0) { mlog_errno(status); -- cgit From c7c14a369de936957946b3843aae050d92451472 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Nov 2021 13:34:58 -0700 Subject: ocfs2: do not zero pages beyond i_size ocfs2_zero_range_for_truncate() can try to zero pages beyond current inode size despite the fact that underlying blocks should be already zeroed out and writeback will skip writing such pages anyway. Avoid the pointless work. Link: https://lkml.kernel.org/r/20211025151332.11301-2-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 1550f18be451..bb247bc349e4 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6923,13 +6923,12 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, } /* - * Zero the area past i_size but still within an allocated - * cluster. This avoids exposing nonzero data on subsequent file - * extends. + * Zero partial cluster for a hole punch or truncate. This avoids exposing + * nonzero data on subsequent file extends. * * We need to call this before i_size is updated on the inode because * otherwise block_write_full_page() will skip writeout of pages past - * i_size. The new_i_size parameter is passed for this reason. + * i_size. */ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, u64 range_start, u64 range_end) @@ -6947,6 +6946,15 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) return 0; + /* + * Avoid zeroing pages fully beyond current i_size. It is pointless as + * underlying blocks of those pages should be already zeroed out and + * page writeback will skip them anyway. + */ + range_end = min_t(u64, range_end, i_size_read(inode)); + if (range_start >= range_end) + return 0; + pages = kcalloc(ocfs2_pages_per_cluster(sb), sizeof(struct page *), GFP_NOFS); if (pages == NULL) { @@ -6955,9 +6963,6 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, goto out; } - if (range_start == range_end) - goto out; - ret = ocfs2_extent_map_get_blocks(inode, range_start >> sb->s_blocksize_bits, &phys, NULL, &ext_flags); -- cgit From d1cef29adc22938d778b4652af34e72facd8184b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Nov 2021 13:35:01 -0700 Subject: fs/posix_acl.c: avoid -Wempty-body warning The fallthrough comment for an ignored cmpxchg() return value produces a harmless warning with 'make W=1': fs/posix_acl.c: In function 'get_acl': fs/posix_acl.c:127:36: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body] 127 | /* fall through */ ; | ^ Simplify it as a step towards a clean W=1 build. As all architectures define cmpxchg() as a statement expression these days, it is no longer necessary to evaluate its return code, and the if() can just be droped. Link: https://lkml.kernel.org/r/20210927102410.1863853-1-arnd@kernel.org Link: https://lore.kernel.org/all/20210322132103.qiun2rjilnlgztxe@wittgenstein/ Signed-off-by: Arnd Bergmann Reviewed-by: Christian Brauner Cc: Alexander Viro Cc: James Morris Cc: Serge Hallyn Cc: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/posix_acl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index f5c25f580dd9..9323a854a60a 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -134,8 +134,7 @@ struct posix_acl *get_acl(struct inode *inode, int type) * to just call ->get_acl to fetch the ACL ourself. (This is going to * be an unlikely race.) */ - if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) - /* fall through */ ; + cmpxchg(p, ACL_NOT_CACHED, sentinel); /* * Normally, the ACL returned by ->get_acl will be cached. -- cgit From d41b60359ffb24a39c93ea1f4bffaafd651118c3 Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 5 Nov 2021 13:35:04 -0700 Subject: d_path: fix Kernel doc validator complaining Kernel doc validator complains: Function parameter or member 'p' not described in 'prepend_name' Excess function parameter 'buffer' description in 'prepend_name' Link: https://lkml.kernel.org/r/20211011005614.26189-1-justin.he@arm.com Fixes: ad08ae586586 ("d_path: introduce struct prepend_buffer") Signed-off-by: Jia He Reviewed-by: Andy Shevchenko Acked-by: Randy Dunlap Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/d_path.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/d_path.c b/fs/d_path.c index cd60c7535181..e4e0ebad1f15 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -77,9 +77,8 @@ static bool prepend(struct prepend_buffer *p, const char *str, int namelen) /** * prepend_name - prepend a pathname in front of current buffer pointer - * @buffer: buffer pointer - * @buflen: allocated length of the buffer - * @name: name string and length qstr structure + * @p: prepend buffer which contains buffer pointer and allocated length + * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are @@ -141,8 +140,7 @@ static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry - * @buffer: pointer to the end of the buffer - * @buflen: pointer to buffer length + * @p: prepend buffer which contains buffer pointer and allocated length * * The function will first try to write out the pathname without taking any * lock other than the RCU read lock to make sure that dentries won't go away. -- cgit From 10c848c8b4805ff026018bfb2b3dedd90e7b0cea Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:35:59 -0700 Subject: mm/smaps: fix shmem pte hole swap calculation Patch series "mm/smaps: Fixes and optimizations on shmem swap handling". This patch (of 3): The shmem swap calculation on the privately writable mappings are using wrong parameters as spotted by Vlastimil. Fix them. This was introduced in commit 48131e03ca4e ("mm, proc: reduce cost of /proc/pid/smaps for unpopulated shmem mappings"), when shmem_swap_usage was reworked to shmem_partial_swap_usage. Test program: void main(void) { char *buffer, *p; int i, fd; fd = memfd_create("test", 0); assert(fd > 0); /* isize==2M*3, fill in pages, swap them out */ ftruncate(fd, SIZE_2M * 3); buffer = mmap(NULL, SIZE_2M * 3, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); assert(buffer); for (i = 0, p = buffer; i < SIZE_2M * 3 / 4096; i++) { *p = 1; p += 4096; } madvise(buffer, SIZE_2M * 3, MADV_PAGEOUT); munmap(buffer, SIZE_2M * 3); /* * Remap with private+writtable mappings on partial of the inode (<= 2M*3), * while the size must also be >= 2M*2 to make sure there's a none pmd so * smaps_pte_hole will be triggered. */ buffer = mmap(NULL, SIZE_2M * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); printf("pid=%d, buffer=%p\n", getpid(), buffer); /* Check /proc/$PID/smap_rollup, should see 4MB swap */ sleep(1000000); } Before the patch, smaps_rollup shows <4MB swap and the number will be random depending on the alignment of the buffer of mmap() allocated. After this patch, it'll show 4MB. Link: https://lkml.kernel.org/r/20210917164756.8586-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210917164756.8586-2-peterx@redhat.com Fixes: 48131e03ca4e ("mm, proc: reduce cost of /proc/pid/smaps for unpopulated shmem mappings") Signed-off-by: Peter Xu Reported-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index cf25be3e0321..2197f669e17b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -478,9 +478,11 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end, __always_unused int depth, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; - mss->swap += shmem_partial_swap_usage( - walk->vma->vm_file->f_mapping, addr, end); + mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping, + linear_page_index(vma, addr), + linear_page_index(vma, end)); return 0; } -- cgit From 230100321518a4e3a027b3b1b7e93b3e02324def Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:36:05 -0700 Subject: mm/smaps: simplify shmem handling of pte holes Firstly, check_shmem_swap variable is actually not necessary, because it's always set with pte_hole hook; checking each would work. Meanwhile, the check within smaps_pte_entry is not easy to follow. E.g., pte_none() check is not needed as "!pte_present && !is_swap_pte" is the same. Since at it, use the pte_hole() helper rather than dup the page cache lookup. Still keep the CONFIG_SHMEM part so the code can be optimized to nop for !SHMEM. There will be a very slight functional change in smaps_pte_entry(), that for !SHMEM we'll return early for pte_none (before checking page==NULL), but that's even nicer. Link: https://lkml.kernel.org/r/20210917164756.8586-4-peterx@redhat.com Signed-off-by: Peter Xu Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2197f669e17b..ad667dbc96f5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -397,7 +397,6 @@ struct mem_size_stats { u64 pss_shmem; u64 pss_locked; u64 swap_pss; - bool check_shmem_swap; }; static void smaps_page_accumulate(struct mem_size_stats *mss, @@ -490,6 +489,16 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end, #define smaps_pte_hole NULL #endif /* CONFIG_SHMEM */ +static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) +{ +#ifdef CONFIG_SHMEM + if (walk->ops->pte_hole) { + /* depth is not used */ + smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk); + } +#endif +} + static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { @@ -518,12 +527,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_pfn_swap_entry(swpent)) page = pfn_swap_entry_to_page(swpent); - } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap - && pte_none(*pte))) { - page = xa_load(&vma->vm_file->f_mapping->i_pages, - linear_page_index(vma, addr)); - if (xa_is_value(page)) - mss->swap += PAGE_SIZE; + } else { + smaps_pte_hole_lookup(addr, walk); return; } @@ -737,8 +742,6 @@ static void smap_gather_stats(struct vm_area_struct *vma, return; #ifdef CONFIG_SHMEM - /* In case of smaps_rollup, reset the value from previous vma */ - mss->check_shmem_swap = false; if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -756,7 +759,6 @@ static void smap_gather_stats(struct vm_area_struct *vma, !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { - mss->check_shmem_swap = true; ops = &smaps_shmem_walk_ops; } } -- cgit From 0b3ea0926afb8dde70cfab00316ae0a70b93a7cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:36:58 -0700 Subject: fs: explicitly unregister per-superblock BDIs Add a new SB_I_ flag to mark superblocks that have an ephemeral bdi associated with them, and unregister it when the superblock is shut down. Link: https://lkml.kernel.org/r/20211021124441.668816-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index bcef3a6f4c4b..3bfc0f8fbd5b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -476,6 +476,8 @@ void generic_shutdown_super(struct super_block *sb) spin_unlock(&sb_lock); up_write(&sb->s_umount); if (sb->s_bdi != &noop_backing_dev_info) { + if (sb->s_iflags & SB_I_PERSB_BDI) + bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } @@ -1562,6 +1564,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; + sb->s_iflags |= SB_I_PERSB_BDI; return 0; } -- cgit From 55fc0d91746759c71bc165bba62a2db64ac98e35 Mon Sep 17 00:00:00 2001 From: Rongwei Wang Date: Fri, 5 Nov 2021 13:43:41 -0700 Subject: mm, thp: lock filemap when truncating page cache Patch series "fix two bugs for file THP". This patch (of 2): Transparent huge page has supported read-only non-shmem files. The file- backed THP is collapsed by khugepaged and truncated when written (for shared libraries). However, there is a race when multiple writers truncate the same page cache concurrently. In that case, subpage(s) of file THP can be revealed by find_get_entry in truncate_inode_pages_range, which will trigger PageTail BUG_ON in truncate_inode_page, as follows: page:000000009e420ff2 refcount:1 mapcount:0 mapping:0000000000000000 index:0x7ff pfn:0x50c3ff head:0000000075ff816d order:9 compound_mapcount:0 compound_pincount:0 flags: 0x37fffe0000010815(locked|uptodate|lru|arch_1|head) raw: 37fffe0000000000 fffffe0013108001 dead000000000122 dead000000000400 raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 head: 37fffe0000010815 fffffe001066bd48 ffff000404183c20 0000000000000000 head: 0000000000000600 0000000000000000 00000001ffffffff ffff000c0345a000 page dumped because: VM_BUG_ON_PAGE(PageTail(page)) ------------[ cut here ]------------ kernel BUG at mm/truncate.c:213! Internal error: Oops - BUG: 0 [#1] SMP Modules linked in: xfs(E) libcrc32c(E) rfkill(E) ... CPU: 14 PID: 11394 Comm: check_madvise_d Kdump: ... Hardware name: ECS, BIOS 0.0.0 02/06/2015 pstate: 60400005 (nZCv daif +PAN -UAO -TCO BTYPE=--) Call trace: truncate_inode_page+0x64/0x70 truncate_inode_pages_range+0x550/0x7e4 truncate_pagecache+0x58/0x80 do_dentry_open+0x1e4/0x3c0 vfs_open+0x38/0x44 do_open+0x1f0/0x310 path_openat+0x114/0x1dc do_filp_open+0x84/0x134 do_sys_openat2+0xbc/0x164 __arm64_sys_openat+0x74/0xc0 el0_svc_common.constprop.0+0x88/0x220 do_el0_svc+0x30/0xa0 el0_svc+0x20/0x30 el0_sync_handler+0x1a4/0x1b0 el0_sync+0x180/0x1c0 Code: aa0103e0 900061e1 910ec021 9400d300 (d4210000) This patch mainly to lock filemap when one enter truncate_pagecache(), avoiding truncating the same page cache concurrently. Link: https://lkml.kernel.org/r/20211025092134.18562-1-rongwei.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/20211025092134.18562-2-rongwei.wang@linux.alibaba.com Fixes: eb6ecbed0aa2 ("mm, thp: relax the VM_DENYWRITE constraint on file-backed THPs") Signed-off-by: Xu Yu Signed-off-by: Rongwei Wang Suggested-by: Matthew Wilcox (Oracle) Tested-by: Song Liu Cc: Collin Fijalkovich Cc: Hugh Dickins Cc: Mike Kravetz Cc: William Kucharski Cc: Yang Shi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index daa324606a41..9ec3cfca3b1a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -856,8 +856,11 @@ static int do_dentry_open(struct file *f, * of THPs into the page cache will fail. */ smp_mb(); - if (filemap_nr_thps(inode->i_mapping)) + if (filemap_nr_thps(inode->i_mapping)) { + filemap_invalidate_lock(inode->i_mapping); truncate_pagecache(inode, 0); + filemap_invalidate_unlock(inode->i_mapping); + } } return 0; -- cgit From 8468e937df1f31411d1e127fa38db064af051fe5 Mon Sep 17 00:00:00 2001 From: Rongwei Wang Date: Fri, 5 Nov 2021 13:43:44 -0700 Subject: mm, thp: fix incorrect unmap behavior for private pages When truncating pagecache on file THP, the private pages of a process should not be unmapped mapping. This incorrect behavior on a dynamic shared libraries which will cause related processes to happen core dump. A simple test for a DSO (Prerequisite is the DSO mapped in file THP): int main(int argc, char *argv[]) { int fd; fd = open(argv[1], O_WRONLY); if (fd < 0) { perror("open"); } close(fd); return 0; } The test only to open a target DSO, and do nothing. But this operation will lead one or more process to happen core dump. This patch mainly to fix this bug. Link: https://lkml.kernel.org/r/20211025092134.18562-3-rongwei.wang@linux.alibaba.com Fixes: eb6ecbed0aa2 ("mm, thp: relax the VM_DENYWRITE constraint on file-backed THPs") Signed-off-by: Rongwei Wang Tested-by: Xu Yu Cc: Matthew Wilcox (Oracle) Cc: Song Liu Cc: William Kucharski Cc: Hugh Dickins Cc: Yang Shi Cc: Mike Kravetz Cc: Collin Fijalkovich Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 9ec3cfca3b1a..e0df1536eb69 100644 --- a/fs/open.c +++ b/fs/open.c @@ -857,8 +857,17 @@ static int do_dentry_open(struct file *f, */ smp_mb(); if (filemap_nr_thps(inode->i_mapping)) { + struct address_space *mapping = inode->i_mapping; + filemap_invalidate_lock(inode->i_mapping); - truncate_pagecache(inode, 0); + /* + * unmap_mapping_range just need to be called once + * here, because the private pages is not need to be + * unmapped mapping (e.g. data segment of dynamic + * shared libraries here). + */ + unmap_mapping_range(mapping, 0, 0, 0); + truncate_inode_pages(mapping, 0); filemap_invalidate_unlock(inode->i_mapping); } } -- cgit