From cf264e1329fb0307e044f7675849f9f38b44c11a Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 2 May 2023 18:36:07 -0700 Subject: cachestat: implement cachestat syscall There is currently no good way to query the page cache state of large file sets and directory trees. There is mincore(), but it scales poorly: the kernel writes out a lot of bitmap data that userspace has to aggregate, when the user really doesn not care about per-page information in that case. The user also needs to mmap and unmap each file as it goes along, which can be quite slow as well. Some use cases where this information could come in handy: * Allowing database to decide whether to perform an index scan or direct table queries based on the in-memory cache state of the index. * Visibility into the writeback algorithm, for performance issues diagnostic. * Workload-aware writeback pacing: estimating IO fulfilled by page cache (and IO to be done) within a range of a file, allowing for more frequent syncing when and where there is IO capacity, and batching when there is not. * Computing memory usage of large files/directory trees, analogous to the du tool for disk usage. More information about these use cases could be found in the following thread: https://lore.kernel.org/lkml/20230315170934.GA97793@cmpxchg.org/ This patch implements a new syscall that queries cache state of a file and summarizes the number of cached pages, number of dirty pages, number of pages marked for writeback, number of (recently) evicted pages, etc. in a given range. Currently, the syscall is only wired in for x86 architecture. NAME cachestat - query the page cache statistics of a file. SYNOPSIS #include struct cachestat_range { __u64 off; __u64 len; }; struct cachestat { __u64 nr_cache; __u64 nr_dirty; __u64 nr_writeback; __u64 nr_evicted; __u64 nr_recently_evicted; }; int cachestat(unsigned int fd, struct cachestat_range *cstat_range, struct cachestat *cstat, unsigned int flags); DESCRIPTION cachestat() queries the number of cached pages, number of dirty pages, number of pages marked for writeback, number of evicted pages, number of recently evicted pages, in the bytes range given by `off` and `len`. An evicted page is a page that is previously in the page cache but has been evicted since. A page is recently evicted if its last eviction was recent enough that its reentry to the cache would indicate that it is actively being used by the system, and that there is memory pressure on the system. These values are returned in a cachestat struct, whose address is given by the `cstat` argument. The `off` and `len` arguments must be non-negative integers. If `len` > 0, the queried range is [`off`, `off` + `len`]. If `len` == 0, we will query in the range from `off` to the end of the file. The `flags` argument is unused for now, but is included for future extensibility. User should pass 0 (i.e no flag specified). Currently, hugetlbfs is not supported. Because the status of a page can change after cachestat() checks it but before it returns to the application, the returned values may contain stale information. RETURN VALUE On success, cachestat returns 0. On error, -1 is returned, and errno is set to indicate the error. ERRORS EFAULT cstat or cstat_args points to an invalid address. EINVAL invalid flags. EBADF invalid file descriptor. EOPNOTSUPP file descriptor is of a hugetlbfs file [nphamcs@gmail.com: replace rounddown logic with the existing helper] Link: https://lkml.kernel.org/r/20230504022044.3675469-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20230503013608.2431726-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Cc: Brian Foster Cc: Matthew Wilcox (Oracle) Cc: Michael Kerrisk Signed-off-by: Andrew Morton --- mm/filemap.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index b4c9bd368b7e..2d3d70c64dfd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ #include +#include "swap.h" + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -4119,3 +4122,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); + +#ifdef CONFIG_CACHESTAT_SYSCALL +/** + * filemap_cachestat() - compute the page cache statistics of a mapping + * @mapping: The mapping to compute the statistics for. + * @first_index: The starting page cache index. + * @last_index: The final page index (inclusive). + * @cs: the cachestat struct to write the result to. + * + * This will query the page cache statistics of a mapping in the + * page range of [first_index, last_index] (inclusive). The statistics + * queried include: number of dirty pages, number of pages marked for + * writeback, and the number of (recently) evicted pages. + */ +static void filemap_cachestat(struct address_space *mapping, + pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) +{ + XA_STATE(xas, &mapping->i_pages, first_index); + struct folio *folio; + + rcu_read_lock(); + xas_for_each(&xas, folio, last_index) { + unsigned long nr_pages; + pgoff_t folio_first_index, folio_last_index; + + if (xas_retry(&xas, folio)) + continue; + + if (xa_is_value(folio)) { + /* page is evicted */ + void *shadow = (void *)folio; + bool workingset; /* not used */ + int order = xa_get_order(xas.xa, xas.xa_index); + + nr_pages = 1 << order; + folio_first_index = round_down(xas.xa_index, 1 << order); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + cs->nr_evicted += nr_pages; + +#ifdef CONFIG_SWAP /* implies CONFIG_MMU */ + if (shmem_mapping(mapping)) { + /* shmem file - in swap cache */ + swp_entry_t swp = radix_to_swp_entry(folio); + + shadow = get_shadow_from_swap_cache(swp); + } +#endif + if (workingset_test_recent(shadow, true, &workingset)) + cs->nr_recently_evicted += nr_pages; + + goto resched; + } + + nr_pages = folio_nr_pages(folio); + folio_first_index = folio_pgoff(folio); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + /* page is in cache */ + cs->nr_cache += nr_pages; + + if (folio_test_dirty(folio)) + cs->nr_dirty += nr_pages; + + if (folio_test_writeback(folio)) + cs->nr_writeback += nr_pages; + +resched: + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * The cachestat(2) system call. + * + * cachestat() returns the page cache statistics of a file in the + * bytes range specified by `off` and `len`: number of cached pages, + * number of dirty pages, number of pages marked for writeback, + * number of evicted pages, and number of recently evicted pages. + * + * An evicted page is a page that is previously in the page cache + * but has been evicted since. A page is recently evicted if its last + * eviction was recent enough that its reentry to the cache would + * indicate that it is actively being used by the system, and that + * there is memory pressure on the system. + * + * `off` and `len` must be non-negative integers. If `len` > 0, + * the queried range is [`off`, `off` + `len`]. If `len` == 0, + * we will query in the range from `off` to the end of the file. + * + * The `flags` argument is unused for now, but is included for future + * extensibility. User should pass 0 (i.e no flag specified). + * + * Currently, hugetlbfs is not supported. + * + * Because the status of a page can change after cachestat() checks it + * but before it returns to the application, the returned values may + * contain stale information. + * + * return values: + * zero - success + * -EFAULT - cstat or cstat_range points to an illegal address + * -EINVAL - invalid flags + * -EBADF - invalid file descriptor + * -EOPNOTSUPP - file descriptor is of a hugetlbfs file + */ +SYSCALL_DEFINE4(cachestat, unsigned int, fd, + struct cachestat_range __user *, cstat_range, + struct cachestat __user *, cstat, unsigned int, flags) +{ + struct fd f = fdget(fd); + struct address_space *mapping; + struct cachestat_range csr; + struct cachestat cs; + pgoff_t first_index, last_index; + + if (!f.file) + return -EBADF; + + if (copy_from_user(&csr, cstat_range, + sizeof(struct cachestat_range))) { + fdput(f); + return -EFAULT; + } + + /* hugetlbfs is not supported */ + if (is_file_hugepages(f.file)) { + fdput(f); + return -EOPNOTSUPP; + } + + if (flags != 0) { + fdput(f); + return -EINVAL; + } + + first_index = csr.off >> PAGE_SHIFT; + last_index = + csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; + memset(&cs, 0, sizeof(struct cachestat)); + mapping = f.file->f_mapping; + filemap_cachestat(mapping, first_index, last_index, &cs); + fdput(f); + + if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) + return -EFAULT; + + return 0; +} +#endif /* CONFIG_CACHESTAT_SYSCALL */ -- cgit From c963901197188189e85b4d768a059fe1bbc2a502 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Wed, 10 May 2023 14:47:16 +0200 Subject: filemap: remove page_endio() page_endio() is not used anymore. Remove it. Link: https://lkml.kernel.org/r/20230510124716.73655-1-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Reviewed-by: Christoph Hellwig Acked-by: Matthew Wilcox (Oracle) Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- mm/filemap.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 2d3d70c64dfd..570bc8c3db87 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1628,36 +1628,6 @@ void folio_end_writeback(struct folio *folio) } EXPORT_SYMBOL(folio_end_writeback); -/* - * After completing I/O on a page, call this routine to update the page - * flags appropriately - */ -void page_endio(struct page *page, bool is_write, int err) -{ - struct folio *folio = page_folio(page); - - if (!is_write) { - if (!err) { - folio_mark_uptodate(folio); - } else { - folio_clear_uptodate(folio); - folio_set_error(folio); - } - folio_unlock(folio); - } else { - if (err) { - struct address_space *mapping; - - folio_set_error(folio); - mapping = folio_mapping(folio); - if (mapping) - mapping_set_error(mapping, err); - } - folio_end_writeback(folio); - } -} -EXPORT_SYMBOL_GPL(page_endio); - /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. * @folio: The folio to lock -- cgit From 0d625446d0a451a683a357799912b9e688629707 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:53 +0200 Subject: backing_dev: remove current->backing_dev_info Patch series "cleanup the filemap / direct I/O interaction", v4. This series cleans up some of the generic write helper calling conventions and the page cache writeback / invalidation for direct I/O. This is a spinoff from the no-bufferhead kernel project, for which we'll want to an use iomap based buffered write path in the block layer. This patch (of 12): The last user of current->backing_dev_info disappeared in commit b9b1335e6403 ("remove bdi_congested() and wb_congested() and related functions"). Remove the field and all assignments to it. Link: https://lkml.kernel.org/r/20230601145904.1385409-1-hch@lst.de Link: https://lkml.kernel.org/r/20230601145904.1385409-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Darrick J. Wong Acked-by: Theodore Ts'o Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- mm/filemap.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 570bc8c3db87..0d371ed91a68 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3964,8 +3964,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t err; ssize_t status; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) goto out; @@ -4026,7 +4024,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos += written; } out: - current->backing_dev_info = NULL; return written ? written : err; } EXPORT_SYMBOL(__generic_file_write_iter); -- cgit From 182c25e9c157f37bd0ab5a82fe2417e2223df459 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:55 +0200 Subject: filemap: update ki_pos in generic_perform_write All callers of generic_perform_write need to updated ki_pos, move it into common code. Link: https://lkml.kernel.org/r/20230601145904.1385409-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Xiubo Li Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Theodore Ts'o Acked-by: Darrick J. Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton --- mm/filemap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 0d371ed91a68..3a80a69fa9fa 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3930,7 +3930,10 @@ again: balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); - return written ? written : status; + if (!written) + return status; + iocb->ki_pos += written; + return written; } EXPORT_SYMBOL(generic_perform_write); @@ -4007,7 +4010,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) endbyte = pos + status - 1; err = filemap_write_and_wait_range(mapping, pos, endbyte); if (err == 0) { - iocb->ki_pos = endbyte + 1; written += status; invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, @@ -4020,8 +4022,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } } else { written = generic_perform_write(iocb, from); - if (likely(written > 0)) - iocb->ki_pos += written; } out: return written ? written : err; -- cgit From 3c435a0fe35c220bec442dffff04a64aacf952b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:56 +0200 Subject: filemap: add a kiocb_write_and_wait helper Factor out a helper that does filemap_write_and_wait_range for the range covered by a read kiocb, or returns -EAGAIN if the kiocb is marked as nowait and there would be pages to write. Link: https://lkml.kernel.org/r/20230601145904.1385409-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Darrick J. Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- mm/filemap.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 3a80a69fa9fa..5566e10ca1a7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2735,6 +2735,21 @@ put_folios: } EXPORT_SYMBOL_GPL(filemap_read); +int kiocb_write_and_wait(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos; + loff_t end = pos + count - 1; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, pos, end)) + return -EAGAIN; + return 0; + } + + return filemap_write_and_wait_range(mapping, pos, end); +} + /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block @@ -2770,18 +2785,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1)) - return -EAGAIN; - } else { - retval = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (retval < 0) - return retval; - } - + retval = kiocb_write_and_wait(iocb, count); + if (retval < 0) + return retval; file_accessed(file); retval = mapping->a_ops->direct_IO(iocb, iter); -- cgit From e003f74afbd2feadbb9ffbf9135e2d2fb5d320a5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:57 +0200 Subject: filemap: add a kiocb_invalidate_pages helper Factor out a helper that calls filemap_write_and_wait_range and invalidate_inode_pages2_range for the range covered by a write kiocb or returns -EAGAIN if the kiocb is marked as nowait and there would be pages to write or invalidate. Link: https://lkml.kernel.org/r/20230601145904.1385409-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Darrick J. Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- mm/filemap.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 5566e10ca1a7..6ba6233c4bbb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2750,6 +2750,33 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count) return filemap_write_and_wait_range(mapping, pos, end); } +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos; + loff_t end = pos + count - 1; + int ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* we could block if there are any pages in the range */ + if (filemap_range_has_page(mapping, pos, end)) + return -EAGAIN; + } else { + ret = filemap_write_and_wait_range(mapping, pos, end); + if (ret) + return ret; + } + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT); +} + /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block @@ -3793,30 +3820,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_SHIFT; - if (iocb->ki_flags & IOCB_NOWAIT) { - /* If there are pages to writeback, return */ - if (filemap_range_has_page(file->f_mapping, pos, - pos + write_len - 1)) - return -EAGAIN; - } else { - written = filemap_write_and_wait_range(mapping, pos, - pos + write_len - 1); - if (written) - goto out; - } - - /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. We do this *before* the write so that we can return - * without clobbering -EIOCBQUEUED from ->direct_IO(). - */ - written = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); /* * If a page can not be invalidated, return 0 to fall back * to buffered write. */ + written = kiocb_invalidate_pages(iocb, write_len); if (written) { if (written == -EBUSY) return 0; -- cgit From c402a9a9430b670926decbb284b756ee6f47c1ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:58 +0200 Subject: filemap: add a kiocb_invalidate_post_direct_write helper Add a helper to invalidate page cache after a dio write. Link: https://lkml.kernel.org/r/20230601145904.1385409-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Darrick J. Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- mm/filemap.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 6ba6233c4bbb..b45506f74133 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3789,7 +3789,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); /* * Warn about a page cache invalidation failure during a direct I/O write. */ -void dio_warn_stale_pagecache(struct file *filp) +static void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; @@ -3806,19 +3806,23 @@ void dio_warn_stale_pagecache(struct file *filp) } } +void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + + if (mapping->nrpages && + invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, + (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) + dio_warn_stale_pagecache(iocb->ki_filp); +} + ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - loff_t pos = iocb->ki_pos; - ssize_t written; - size_t write_len; - pgoff_t end; - - write_len = iov_iter_count(from); - end = (pos + write_len - 1) >> PAGE_SHIFT; + struct address_space *mapping = iocb->ki_filp->f_mapping; + size_t write_len = iov_iter_count(from); + ssize_t written; /* * If a page can not be invalidated, return 0 to fall back @@ -3828,7 +3832,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (written) { if (written == -EBUSY) return 0; - goto out; + return written; } written = mapping->a_ops->direct_IO(iocb, from); @@ -3850,11 +3854,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * * Skip invalidation for async writes or if mapping has no pages. */ - if (written > 0 && mapping->nrpages && - invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) - dio_warn_stale_pagecache(file); - if (written > 0) { + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos; + + kiocb_invalidate_post_direct_write(iocb, written); pos += written; write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { @@ -3865,7 +3869,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) } if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from)); -out: return written; } EXPORT_SYMBOL(generic_file_direct_write); -- cgit From 44fff0fa08ec5a6d9d5fb05443a36d854d0ece4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:59:01 +0200 Subject: fs: factor out a direct_write_fallback helper Add a helper dealing with handling the syncing of a buffered write fallback for direct I/O. Link: https://lkml.kernel.org/r/20230601145904.1385409-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Miklos Szeredi Reviewed-by: Darrick J. Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Hannes Reinecke Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- mm/filemap.c | 66 ++++++++++++++---------------------------------------------- 1 file changed, 15 insertions(+), 51 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index b45506f74133..916b7c6444fe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3979,23 +3979,19 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t written = 0; - ssize_t err; - ssize_t status; + struct inode *inode = mapping->host; + ssize_t ret; - err = file_remove_privs(file); - if (err) - goto out; + ret = file_remove_privs(file); + if (ret) + return ret; - err = file_update_time(file); - if (err) - goto out; + ret = file_update_time(file); + if (ret) + return ret; if (iocb->ki_flags & IOCB_DIRECT) { - loff_t pos, endbyte; - - written = generic_file_direct_write(iocb, from); + ret = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to @@ -4003,45 +3999,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ - if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) - goto out; - - pos = iocb->ki_pos; - status = generic_perform_write(iocb, from); - /* - * If generic_perform_write() returned a synchronous error - * then we want to return the number of bytes which were - * direct-written, or the error code if that was zero. Note - * that this differs from normal direct-io semantics, which - * will return -EFOO even if some bytes were written. - */ - if (unlikely(status < 0)) { - err = status; - goto out; - } - /* - * We need to ensure that the page cache pages are written to - * disk and invalidated to preserve the expected O_DIRECT - * semantics. - */ - endbyte = pos + status - 1; - err = filemap_write_and_wait_range(mapping, pos, endbyte); - if (err == 0) { - written += status; - invalidate_mapping_pages(mapping, - pos >> PAGE_SHIFT, - endbyte >> PAGE_SHIFT); - } else { - /* - * We don't know how much we wrote, so just return - * the number of bytes which were direct-written - */ - } - } else { - written = generic_perform_write(iocb, from); + if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) + return ret; + return direct_write_fallback(iocb, from, ret, + generic_perform_write(iocb, from)); } -out: - return written ? written : err; + + return generic_perform_write(iocb, from); } EXPORT_SYMBOL(__generic_file_write_iter); -- cgit From 0cb8fd4d14165a7e654048e43983d86f75b90879 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:08:20 -0700 Subject: mm/migrate: remove cruft from migration_entry_wait()s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migration_entry_wait_on_locked() does not need to take a mapped pte pointer, its callers can do the unmap first. Annotate it with __releases(ptl) to reduce sparse warnings. Fold __migration_entry_wait_huge() into migration_entry_wait_huge(). Fold __migration_entry_wait() into migration_entry_wait(), preferring the tighter pte_offset_map_lock() to pte_offset_map() and pte_lockptr(). Link: https://lkml.kernel.org/r/b0e2a532-cdf2-561b-e999-f3b13b8d6d3@google.com Signed-off-by: Hugh Dickins Reviewed-by: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- mm/filemap.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 916b7c6444fe..e0259fb823a5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1362,8 +1362,6 @@ repeat: /** * migration_entry_wait_on_locked - Wait for a migration entry to be removed * @entry: migration swap entry. - * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required - * for pte entries, pass NULL for pmd entries. * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is @@ -1372,13 +1370,13 @@ repeat: * should be called while holding the ptl for the migration entry referencing * the page. * - * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock(). + * Returns after unlocking the ptl. * * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ -void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, - spinlock_t *ptl) +void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) + __releases(ptl) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; @@ -1412,10 +1410,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. */ - if (ptep) - pte_unmap_unlock(ptep, ptl); - else - spin_unlock(ptl); + spin_unlock(ptl); for (;;) { unsigned int flags; -- cgit From 65747aaf42b7db6acb8e57a2b8e9959928f404dd Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:11:29 -0700 Subject: mm/filemap: allow pte_offset_map_lock() to fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit filemap_map_pages() allow pte_offset_map_lock() to fail; and remove the pmd_devmap_trans_unstable() check from filemap_map_pmd(), which can safely return to filemap_map_pages() and let pte_offset_map_lock() discover that. Link: https://lkml.kernel.org/r/54607cf4-ddb6-7ef3-043-1d2de1a9a71@google.com Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- mm/filemap.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index e0259fb823a5..1893048ec9ff 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3414,13 +3414,6 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, if (pmd_none(*vmf->pmd)) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); - /* See comment in handle_pte_fault() */ - if (pmd_devmap_trans_unstable(vmf->pmd)) { - folio_unlock(folio); - folio_put(folio); - return true; - } - return false; } @@ -3507,6 +3500,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); + if (!vmf->pte) { + folio_unlock(folio); + folio_put(folio); + goto out; + } do { again: page = folio_file_page(folio, xas.xa_index); -- cgit From c33c794828f21217f72ce6fc140e0d34e0d56bff Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 12 Jun 2023 16:15:45 +0100 Subject: mm: ptep_get() conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert all instances of direct pte_t* dereferencing to instead use ptep_get() helper. This means that by default, the accesses change from a C dereference to a READ_ONCE(). This is technically the correct thing to do since where pgtables are modified by HW (for access/dirty) they are volatile and therefore we should always ensure READ_ONCE() semantics. But more importantly, by always using the helper, it can be overridden by the architecture to fully encapsulate the contents of the pte. Arch code is deliberately not converted, as the arch code knows best. It is intended that arch code (arm64) will override the default with its own implementation that can (e.g.) hide certain bits from the core code, or determine young/dirty status by mixing in state from another source. Conversion was done using Coccinelle: ---- // $ make coccicheck \ // COCCI=ptepget.cocci \ // SPFLAGS="--include-headers" \ // MODE=patch virtual patch @ depends on patch @ pte_t *v; @@ - *v + ptep_get(v) ---- Then reviewed and hand-edited to avoid multiple unnecessary calls to ptep_get(), instead opting to store the result of a single call in a variable, where it is correct to do so. This aims to negate any cost of READ_ONCE() and will benefit arch-overrides that may be more complex. Included is a fix for an issue in an earlier version of this patch that was pointed out by kernel test robot. The issue arose because config MMU=n elides definition of the ptep helper functions, including ptep_get(). HUGETLB_PAGE=n configs still define a simple huge_ptep_clear_flush() for linking purposes, which dereferences the ptep. So when both configs are disabled, this caused a build error because ptep_get() is not defined. Fix by continuing to do a direct dereference when MMU=n. This is safe because for this config the arch code cannot be trying to virtualize the ptes because none of the ptep helpers are defined. Link: https://lkml.kernel.org/r/20230612151545.3317766-4-ryan.roberts@arm.com Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202305120142.yXsNEo6H-lkp@intel.com/ Signed-off-by: Ryan Roberts Cc: Adrian Hunter Cc: Alexander Potapenko Cc: Alexander Shishkin Cc: Alex Williamson Cc: Al Viro Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christian Brauner Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dave Airlie Cc: Dimitri Sivanich Cc: Dmitry Vyukov Cc: Ian Rogers Cc: Jason Gunthorpe Cc: Jérôme Glisse Cc: Jiri Olsa Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Mark Rutland Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Namhyung Kim Cc: Naoya Horiguchi Cc: Oleksandr Tyshchenko Cc: Pavel Tatashin Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Uladzislau Rezki (Sony) Cc: Vincenzo Frascino Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 1893048ec9ff..00933089b8b6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3523,7 +3523,7 @@ again: * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(*vmf->pte)) + if (!pte_none(ptep_get(vmf->pte))) goto unlock; /* We're about to handle the fault */ -- cgit From 6c77b607ee26472fb945aa41734281c39d06d68f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 14 Jun 2023 22:36:12 +0800 Subject: mm: kill lock|unlock_page_memcg() Since commit c7c3dec1c9db ("mm: rmap: remove lock_page_memcg()"), no more user, kill lock_page_memcg() and unlock_page_memcg(). Link: https://lkml.kernel.org/r/20230614143612.62575-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Johannes Weiner Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 00933089b8b6..758bbdf300e7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -117,7 +117,7 @@ * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) + * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) -- cgit From 16f8eb3eea9eb2a1568279d64ca4dc977e7aa538 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 21 Jun 2023 14:24:02 -0700 Subject: Revert "page cache: fix page_cache_next/prev_miss off by one" This reverts commit 9425c591e06a9ab27a145ba655fb50532cf0bcc9 The reverted commit fixed up routines primarily used by readahead code such that they could also be used by hugetlb. Unfortunately, this caused a performance regression as pointed out by the Closes: tag. The hugetlb code which uses page_cache_next_miss will be addressed in a subsequent patch. Link: https://lkml.kernel.org/r/20230621212403.174710-1-mike.kravetz@oracle.com Fixes: 9425c591e06a ("page cache: fix page_cache_next/prev_miss off by one") Signed-off-by: Mike Kravetz Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202306211346.1e9ff03e-oliver.sang@intel.com Reviewed-by: Sidhartha Kumar Cc: Ackerley Tng Cc: Erdem Aktas Cc: Greg Kroah-Hartman Cc: Matthew Wilcox Cc: Muchun Song Cc: Vishal Annapurve Signed-off-by: Andrew Morton --- mm/filemap.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index c20e0b1997e8..758bbdf300e7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1728,9 +1728,7 @@ bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'return - index >= max_scan' will be true). - * In the rare case of index wrap-around, 0 will be returned. 0 will also - * be returned if index == 0 and there is a gap at the index. We can not - * wrap-around if passed index == 0. + * In the rare case of index wrap-around, 0 will be returned. */ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) @@ -1740,13 +1738,12 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) - return xas.xa_index; - if (xas.xa_index == 0 && index != 0) - return xas.xa_index; + break; + if (xas.xa_index == 0) + break; } - /* No gaps in range and no wrap-around, return index beyond range */ - return xas.xa_index + 1; + return xas.xa_index; } EXPORT_SYMBOL(page_cache_next_miss); @@ -1767,9 +1764,7 @@ EXPORT_SYMBOL(page_cache_next_miss); * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'index - return >= max_scan' will be true). - * In the rare case of wrap-around, ULONG_MAX will be returned. ULONG_MAX - * will also be returned if index == ULONG_MAX and there is a gap at the - * index. We can not wrap-around if passed index == ULONG_MAX. + * In the rare case of wrap-around, ULONG_MAX will be returned. */ pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) @@ -1779,13 +1774,12 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) - return xas.xa_index; - if (xas.xa_index == ULONG_MAX && index != ULONG_MAX) - return xas.xa_index; + break; + if (xas.xa_index == ULONG_MAX) + break; } - /* No gaps in range and no wrap-around, return index beyond range */ - return xas.xa_index - 1; + return xas.xa_index; } EXPORT_SYMBOL(page_cache_prev_miss); -- cgit