summaryrefslogtreecommitdiff
path: root/mm/filemap.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-06-28 10:28:11 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-06-28 10:28:11 -0700
commit6e17c6de3ddf3073741d9c91a796ee696914d8a0 (patch)
tree2c425707f78642625dbe2c824c7fded2021e3dc7 /mm/filemap.c
parent6aeadf7896bff4ca230702daba8788455e6b866e (diff)
parentacc72d59c7509540c27c49625cb4b5a8db1f1a84 (diff)
Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull mm updates from Andrew Morton: - Yosry Ahmed brought back some cgroup v1 stats in OOM logs - Yosry has also eliminated cgroup's atomic rstat flushing - Nhat Pham adds the new cachestat() syscall. It provides userspace with the ability to query pagecache status - a similar concept to mincore() but more powerful and with improved usability - Mel Gorman provides more optimizations for compaction, reducing the prevalence of page rescanning - Lorenzo Stoakes has done some maintenance work on the get_user_pages() interface - Liam Howlett continues with cleanups and maintenance work to the maple tree code. Peng Zhang also does some work on maple tree - Johannes Weiner has done some cleanup work on the compaction code - David Hildenbrand has contributed additional selftests for get_user_pages() - Thomas Gleixner has contributed some maintenance and optimization work for the vmalloc code - Baolin Wang has provided some compaction cleanups, - SeongJae Park continues maintenance work on the DAMON code - Huang Ying has done some maintenance on the swap code's usage of device refcounting - Christoph Hellwig has some cleanups for the filemap/directio code - Ryan Roberts provides two patch series which yield some rationalization of the kernel's access to pte entries - use the provided APIs rather than open-coding accesses - Lorenzo Stoakes has some fixes to the interaction between pagecache and directio access to file mappings - John Hubbard has a series of fixes to the MM selftesting code - ZhangPeng continues the folio conversion campaign - Hugh Dickins has been working on the pagetable handling code, mainly with a view to reducing the load on the mmap_lock - Catalin Marinas has reduced the arm64 kmalloc() minimum alignment from 128 to 8 - Domenico Cerasuolo has improved the zswap reclaim mechanism by reorganizing the LRU management - Matthew Wilcox provides some fixups to make gfs2 work better with the buffer_head code - Vishal Moola also has done some folio conversion 
work - Matthew Wilcox has removed the remnants of the pagevec code - their functionality is migrated over to struct folio_batch * tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits) mm/hugetlb: remove hugetlb_set_page_subpool() mm: nommu: correct the range of mmap_sem_read_lock in task_mem() hugetlb: revert use of page_cache_next_miss() Revert "page cache: fix page_cache_next/prev_miss off by one" mm/vmscan: fix root proactive reclaim unthrottling unbalanced node mm: memcg: rename and document global_reclaim() mm: kill [add|del]_page_to_lru_list() mm: compaction: convert to use a folio in isolate_migratepages_block() mm: zswap: fix double invalidate with exclusive loads mm: remove unnecessary pagevec includes mm: remove references to pagevec mm: rename invalidate_mapping_pagevec to mapping_try_invalidate mm: remove struct pagevec net: convert sunrpc from pagevec to folio_batch i915: convert i915_gpu_error to use a folio_batch pagevec: rename fbatch_count() mm: remove check_move_unevictable_pages() drm: convert drm_gem_put_pages() to use a folio_batch i915: convert shmem_sg_free_table() to use a folio_batch scatterlist: add sg_set_folio() ...
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--mm/filemap.c450
1 files changed, 278 insertions, 172 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 00f01d8ead47..9e44a49bbd74 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -22,6 +22,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
@@ -58,6 +59,8 @@
#include <asm/mman.h>
+#include "swap.h"
+
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -114,7 +117,7 @@
* ->i_pages lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
+ * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->block_dirty_folio)
@@ -1359,8 +1362,6 @@ repeat:
/**
* migration_entry_wait_on_locked - Wait for a migration entry to be removed
* @entry: migration swap entry.
- * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required
- * for pte entries, pass NULL for pmd entries.
* @ptl: already locked ptl. This function will drop the lock.
*
* Wait for a migration entry referencing the given page to be removed. This is
@@ -1369,13 +1370,13 @@ repeat:
* should be called while holding the ptl for the migration entry referencing
* the page.
*
- * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock().
+ * Returns after unlocking the ptl.
*
* This follows the same logic as folio_wait_bit_common() so see the comments
* there.
*/
-void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
- spinlock_t *ptl)
+void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
+ __releases(ptl)
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
@@ -1409,10 +1410,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
* a valid reference to the page, and it must take the ptl to remove the
* migration entry. So the page is valid until the ptl is dropped.
*/
- if (ptep)
- pte_unmap_unlock(ptep, ptl);
- else
- spin_unlock(ptl);
+ spin_unlock(ptl);
for (;;) {
unsigned int flags;
@@ -1625,36 +1623,6 @@ void folio_end_writeback(struct folio *folio)
}
EXPORT_SYMBOL(folio_end_writeback);
-/*
- * After completing I/O on a page, call this routine to update the page
- * flags appropriately
- */
-void page_endio(struct page *page, bool is_write, int err)
-{
- struct folio *folio = page_folio(page);
-
- if (!is_write) {
- if (!err) {
- folio_mark_uptodate(folio);
- } else {
- folio_clear_uptodate(folio);
- folio_set_error(folio);
- }
- folio_unlock(folio);
- } else {
- if (err) {
- struct address_space *mapping;
-
- folio_set_error(folio);
- mapping = folio_mapping(folio);
- if (mapping)
- mapping_set_error(mapping, err);
- }
- folio_end_writeback(folio);
- }
-}
-EXPORT_SYMBOL_GPL(page_endio);
-
/**
* __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
* @folio: The folio to lock
@@ -1760,9 +1728,7 @@ bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'return - index >= max_scan' will be true).
- * In the rare case of index wrap-around, 0 will be returned. 0 will also
- * be returned if index == 0 and there is a gap at the index. We can not
- * wrap-around if passed index == 0.
+ * In the rare case of index wrap-around, 0 will be returned.
*/
pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
@@ -1772,13 +1738,12 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
while (max_scan--) {
void *entry = xas_next(&xas);
if (!entry || xa_is_value(entry))
- return xas.xa_index;
- if (xas.xa_index == 0 && index != 0)
- return xas.xa_index;
+ break;
+ if (xas.xa_index == 0)
+ break;
}
- /* No gaps in range and no wrap-around, return index beyond range */
- return xas.xa_index + 1;
+ return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_next_miss);
@@ -1799,9 +1764,7 @@ EXPORT_SYMBOL(page_cache_next_miss);
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'index - return >= max_scan' will be true).
- * In the rare case of wrap-around, ULONG_MAX will be returned. ULONG_MAX
- * will also be returned if index == ULONG_MAX and there is a gap at the
- * index. We can not wrap-around if passed index == ULONG_MAX.
+ * In the rare case of wrap-around, ULONG_MAX will be returned.
*/
pgoff_t page_cache_prev_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
@@ -1811,13 +1774,12 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
while (max_scan--) {
void *entry = xas_prev(&xas);
if (!entry || xa_is_value(entry))
- return xas.xa_index;
- if (xas.xa_index == ULONG_MAX && index != ULONG_MAX)
- return xas.xa_index;
+ break;
+ if (xas.xa_index == ULONG_MAX)
+ break;
}
- /* No gaps in range and no wrap-around, return index beyond range */
- return xas.xa_index - 1;
+ return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);
@@ -2767,6 +2729,48 @@ put_folios:
}
EXPORT_SYMBOL_GPL(filemap_read);
+int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ loff_t pos = iocb->ki_pos;
+ loff_t end = pos + count - 1;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_needs_writeback(mapping, pos, end))
+ return -EAGAIN;
+ return 0;
+ }
+
+ return filemap_write_and_wait_range(mapping, pos, end);
+}
+
+int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ loff_t pos = iocb->ki_pos;
+ loff_t end = pos + count - 1;
+ int ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* we could block if there are any pages in the range */
+ if (filemap_range_has_page(mapping, pos, end))
+ return -EAGAIN;
+ } else {
+ ret = filemap_write_and_wait_range(mapping, pos, end);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+}
+
/**
* generic_file_read_iter - generic filesystem read routine
* @iocb: kernel I/O control block
@@ -2802,18 +2806,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1))
- return -EAGAIN;
- } else {
- retval = filemap_write_and_wait_range(mapping,
- iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (retval < 0)
- return retval;
- }
-
+ retval = kiocb_write_and_wait(iocb, count);
+ if (retval < 0)
+ return retval;
file_accessed(file);
retval = mapping->a_ops->direct_IO(iocb, iter);
@@ -3436,13 +3431,6 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
if (pmd_none(*vmf->pmd))
pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
- /* See comment in handle_pte_fault() */
- if (pmd_devmap_trans_unstable(vmf->pmd)) {
- folio_unlock(folio);
- folio_put(folio);
- return true;
- }
-
return false;
}
@@ -3529,6 +3517,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
+ if (!vmf->pte) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
+ }
do {
again:
page = folio_file_page(folio, xas.xa_index);
@@ -3547,7 +3540,7 @@ again:
* handled in the specific fault path, and it'll prohibit the
* fault-around logic.
*/
- if (!pte_none(*vmf->pte))
+ if (!pte_none(ptep_get(vmf->pte)))
goto unlock;
/* We're about to handle the fault */
@@ -3806,7 +3799,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
/*
* Warn about a page cache invalidation failure during a direct I/O write.
*/
-void dio_warn_stale_pagecache(struct file *filp)
+static void dio_warn_stale_pagecache(struct file *filp)
{
static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
char pathname[128];
@@ -3823,48 +3816,33 @@ void dio_warn_stale_pagecache(struct file *filp)
}
}
-ssize_t
-generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- loff_t pos = iocb->ki_pos;
- ssize_t written;
- size_t write_len;
- pgoff_t end;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
- write_len = iov_iter_count(from);
- end = (pos + write_len - 1) >> PAGE_SHIFT;
+ if (mapping->nrpages &&
+ invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT,
+ (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
+ dio_warn_stale_pagecache(iocb->ki_filp);
+}
- if (iocb->ki_flags & IOCB_NOWAIT) {
- /* If there are pages to writeback, return */
- if (filemap_range_has_page(file->f_mapping, pos,
- pos + write_len - 1))
- return -EAGAIN;
- } else {
- written = filemap_write_and_wait_range(mapping, pos,
- pos + write_len - 1);
- if (written)
- goto out;
- }
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ size_t write_len = iov_iter_count(from);
+ ssize_t written;
/*
- * After a write we want buffered reads to be sure to go to disk to get
- * the new data. We invalidate clean cached page from the region we're
- * about to write. We do this *before* the write so that we can return
- * without clobbering -EIOCBQUEUED from ->direct_IO().
- */
- written = invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
- /*
* If a page can not be invalidated, return 0 to fall back
* to buffered write.
*/
+ written = kiocb_invalidate_pages(iocb, write_len);
if (written) {
if (written == -EBUSY)
return 0;
- goto out;
+ return written;
}
written = mapping->a_ops->direct_IO(iocb, from);
@@ -3886,11 +3864,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
*
* Skip invalidation for async writes or if mapping has no pages.
*/
- if (written > 0 && mapping->nrpages &&
- invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
- dio_warn_stale_pagecache(file);
-
if (written > 0) {
+ struct inode *inode = mapping->host;
+ loff_t pos = iocb->ki_pos;
+
+ kiocb_invalidate_post_direct_write(iocb, written);
pos += written;
write_len -= written;
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -3901,7 +3879,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
}
if (written != -EIOCBQUEUED)
iov_iter_revert(from, write_len - iov_iter_count(from));
-out:
return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
@@ -3980,7 +3957,10 @@ again:
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
- return written ? written : status;
+ if (!written)
+ return status;
+ iocb->ki_pos += written;
+ return written;
}
EXPORT_SYMBOL(generic_perform_write);
@@ -4009,25 +3989,19 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t written = 0;
- ssize_t err;
- ssize_t status;
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = inode_to_bdi(inode);
- err = file_remove_privs(file);
- if (err)
- goto out;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
- err = file_update_time(file);
- if (err)
- goto out;
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
- if (iocb->ki_flags & IOCB_DIRECT) {
- loff_t pos, endbyte;
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
- written = generic_file_direct_write(iocb, from);
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = generic_file_direct_write(iocb, from);
/*
* If the write stopped short of completing, fall back to
* buffered writes. Some filesystems do this for writes to
@@ -4035,49 +4009,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* not succeed (even if it did, DAX does not handle dirty
* page-cache pages correctly).
*/
- if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
- goto out;
-
- pos = iocb->ki_pos;
- status = generic_perform_write(iocb, from);
- /*
- * If generic_perform_write() returned a synchronous error
- * then we want to return the number of bytes which were
- * direct-written, or the error code if that was zero. Note
- * that this differs from normal direct-io semantics, which
- * will return -EFOO even if some bytes were written.
- */
- if (unlikely(status < 0)) {
- err = status;
- goto out;
- }
- /*
- * We need to ensure that the page cache pages are written to
- * disk and invalidated to preserve the expected O_DIRECT
- * semantics.
- */
- endbyte = pos + status - 1;
- err = filemap_write_and_wait_range(mapping, pos, endbyte);
- if (err == 0) {
- iocb->ki_pos = endbyte + 1;
- written += status;
- invalidate_mapping_pages(mapping,
- pos >> PAGE_SHIFT,
- endbyte >> PAGE_SHIFT);
- } else {
- /*
- * We don't know how much we wrote, so just return
- * the number of bytes which were direct-written
- */
- }
- } else {
- written = generic_perform_write(iocb, from);
- if (likely(written > 0))
- iocb->ki_pos += written;
+ if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
+ return ret;
+ return direct_write_fallback(iocb, from, ret,
+ generic_perform_write(iocb, from));
}
-out:
- current->backing_dev_info = NULL;
- return written ? written : err;
+
+ return generic_perform_write(iocb, from);
}
EXPORT_SYMBOL(__generic_file_write_iter);
@@ -4142,3 +4080,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp)
return try_to_free_buffers(folio);
}
EXPORT_SYMBOL(filemap_release_folio);
+
+#ifdef CONFIG_CACHESTAT_SYSCALL
+/**
+ * filemap_cachestat() - compute the page cache statistics of a mapping
+ * @mapping: The mapping to compute the statistics for.
+ * @first_index: The starting page cache index.
+ * @last_index: The final page index (inclusive).
+ * @cs: the cachestat struct to write the result to.
+ *
+ * This will query the page cache statistics of a mapping in the
+ * page range of [first_index, last_index] (inclusive). The statistics
+ * queried include: number of dirty pages, number of pages marked for
+ * writeback, and the number of (recently) evicted pages.
+ */
+static void filemap_cachestat(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
+{
+ XA_STATE(xas, &mapping->i_pages, first_index);
+ struct folio *folio;
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_index) {
+ unsigned long nr_pages;
+ pgoff_t folio_first_index, folio_last_index;
+
+ if (xas_retry(&xas, folio))
+ continue;
+
+ if (xa_is_value(folio)) {
+ /* page is evicted */
+ void *shadow = (void *)folio;
+ bool workingset; /* not used */
+ int order = xa_get_order(xas.xa, xas.xa_index);
+
+ nr_pages = 1 << order;
+ folio_first_index = round_down(xas.xa_index, 1 << order);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
+ cs->nr_evicted += nr_pages;
+
+#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
+ if (shmem_mapping(mapping)) {
+ /* shmem file - in swap cache */
+ swp_entry_t swp = radix_to_swp_entry(folio);
+
+ shadow = get_shadow_from_swap_cache(swp);
+ }
+#endif
+ if (workingset_test_recent(shadow, true, &workingset))
+ cs->nr_recently_evicted += nr_pages;
+
+ goto resched;
+ }
+
+ nr_pages = folio_nr_pages(folio);
+ folio_first_index = folio_pgoff(folio);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
+ /* page is in cache */
+ cs->nr_cache += nr_pages;
+
+ if (folio_test_dirty(folio))
+ cs->nr_dirty += nr_pages;
+
+ if (folio_test_writeback(folio))
+ cs->nr_writeback += nr_pages;
+
+resched:
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * The cachestat(2) system call.
+ *
+ * cachestat() returns the page cache statistics of a file in the
+ * bytes range specified by `off` and `len`: number of cached pages,
+ * number of dirty pages, number of pages marked for writeback,
+ * number of evicted pages, and number of recently evicted pages.
+ *
+ * An evicted page is a page that was previously in the page cache
+ * but has been evicted since. A page is recently evicted if its last
+ * eviction was recent enough that its reentry to the cache would
+ * indicate that it is actively being used by the system, and that
+ * there is memory pressure on the system.
+ *
+ * `off` and `len` must be non-negative integers. If `len` > 0,
+ * the queried byte range is [`off`, `off` + `len`). If `len` == 0,
+ * we will query in the range from `off` to the end of the file.
+ *
+ * The `flags` argument is unused for now, but is included for future
+ * extensibility. Users should pass 0 (i.e. no flags specified).
+ *
+ * Currently, hugetlbfs is not supported.
+ *
+ * Because the status of a page can change after cachestat() checks it
+ * but before it returns to the application, the returned values may
+ * contain stale information.
+ *
+ * return values:
+ * zero - success
+ * -EFAULT - cstat or cstat_range points to an illegal address
+ * -EINVAL - invalid flags
+ * -EBADF - invalid file descriptor
+ * -EOPNOTSUPP - file descriptor is of a hugetlbfs file
+ */
+SYSCALL_DEFINE4(cachestat, unsigned int, fd,
+ struct cachestat_range __user *, cstat_range,
+ struct cachestat __user *, cstat, unsigned int, flags)
+{
+ struct fd f = fdget(fd);
+ struct address_space *mapping;
+ struct cachestat_range csr;
+ struct cachestat cs;
+ pgoff_t first_index, last_index;
+
+ if (!f.file)
+ return -EBADF;
+
+ if (copy_from_user(&csr, cstat_range,
+ sizeof(struct cachestat_range))) {
+ fdput(f);
+ return -EFAULT;
+ }
+
+ /* hugetlbfs is not supported */
+ if (is_file_hugepages(f.file)) {
+ fdput(f);
+ return -EOPNOTSUPP;
+ }
+
+ if (flags != 0) {
+ fdput(f);
+ return -EINVAL;
+ }
+
+ first_index = csr.off >> PAGE_SHIFT;
+ last_index =
+ csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
+ memset(&cs, 0, sizeof(struct cachestat));
+ mapping = f.file->f_mapping;
+ filemap_cachestat(mapping, first_index, last_index, &cs);
+ fdput(f);
+
+ if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
+ return -EFAULT;
+
+ return 0;
+}
+#endif /* CONFIG_CACHESTAT_SYSCALL */