author    Linus Torvalds <torvalds@linux-foundation.org>  2024-11-21 09:17:33 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>  2024-11-21 09:17:33 -0800
commit    90a19b744de3a4fb51aee2edd8f2b9a4b14c9878
tree      97dc6486202ac774a0904f4ce36d152a6674a45a
parent    fcc79e1714e8c2b8e216dc3149812edd37884eef
parent    0bc8061ffc733a0a246b8689b2d32a3e9204f43c
Merge tag 'erofs-for-6.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs
Pull erofs updates from Gao Xiang:
 "There is no outstanding feature for this cycle. The most useful
  changes are SEEK_{DATA,HOLE} support and some decompression
  micro-optimization. Other than those, there are some bugfixes and
  cleanups as usual:

   - Add SEEK_{DATA,HOLE} support

   - Free redundant pclusters if no cached compressed data is valid

   - Add sysfs entry to drop internal caches

   - Several bugfixes & cleanups"

* tag 'erofs-for-6.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs:
  erofs: handle NONHEAD !delta[1] lclusters gracefully
  erofs: clarify direct I/O support
  erofs: fix blksize < PAGE_SIZE for file-backed mounts
  erofs: get rid of `buf->kmap_type`
  erofs: fix file-backed mounts over FUSE
  erofs: simplify definition of the log functions
  erofs: add sysfs node to drop internal caches
  erofs: free pclusters if no cached folio is attached
  erofs: sunset `struct erofs_workgroup`
  erofs: move erofs_workgroup operations into zdata.c
  erofs: get rid of erofs_{find,insert}_workgroup
  erofs: add SEEK_{DATA,HOLE} support
-rw-r--r--  Documentation/ABI/testing/sysfs-fs-erofs |  11
-rw-r--r--  fs/erofs/data.c                          |  69
-rw-r--r--  fs/erofs/inode.c                         |  12
-rw-r--r--  fs/erofs/internal.h                      |  35
-rw-r--r--  fs/erofs/super.c                         |  35
-rw-r--r--  fs/erofs/sysfs.c                         |  17
-rw-r--r--  fs/erofs/zdata.c                         | 221
-rw-r--r--  fs/erofs/zmap.c                          |  17
-rw-r--r--  fs/erofs/zutil.c                         | 155
9 files changed, 276 insertions(+), 296 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-fs-erofs b/Documentation/ABI/testing/sysfs-fs-erofs
index 284224d1b56f..b134146d735b 100644
--- a/Documentation/ABI/testing/sysfs-fs-erofs
+++ b/Documentation/ABI/testing/sysfs-fs-erofs
@@ -16,3 +16,14 @@ Description: Control strategy of sync decompression:
readahead on atomic contexts only.
- 1 (force on): enable for readpage and readahead.
- 2 (force off): disable for all situations.
+
+What: /sys/fs/erofs/<disk>/drop_caches
+Date: November 2024
+Contact: "Guo Chunhai" <guochunhai@vivo.com>
+Description: Writing to this will drop compression-related caches,
+ currently used to drop in-memory pclusters and cached
+ compressed folios:
+
+ - 1 : invalidate cached compressed folios
+ - 2 : drop in-memory pclusters
+ - 3 : drop in-memory pclusters and cached compressed folios
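For illustration only (not part of this patch), a minimal userspace sketch of exercising the new node; "sdb" stands in for the <disk> identifier, and the node is only present on CONFIG_EROFS_FS_ZIP kernels with this series applied:

/* drop_caches_demo.c - illustrative sketch, not from the kernel tree */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* "sdb" is a placeholder for the <disk> identifier */
	int fd = open("/sys/fs/erofs/sdb/drop_caches", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* "3" drops both in-memory pclusters and cached compressed
	 * folios; "1" and "2" select each cache type individually,
	 * matching the bitmask handling in fs/erofs/sysfs.c below. */
	if (write(fd, "3", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}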
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 61debd799cf9..1c49f8962021 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -10,10 +10,10 @@
void erofs_unmap_metabuf(struct erofs_buf *buf)
{
- if (buf->kmap_type == EROFS_KMAP)
- kunmap_local(buf->base);
+ if (!buf->base)
+ return;
+ kunmap_local(buf->base);
buf->base = NULL;
- buf->kmap_type = EROFS_NO_KMAP;
}
void erofs_put_metabuf(struct erofs_buf *buf)
@@ -38,20 +38,13 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
}
if (!folio || !folio_contains(folio, index)) {
erofs_put_metabuf(buf);
- folio = read_mapping_folio(buf->mapping, index, NULL);
+ folio = read_mapping_folio(buf->mapping, index, buf->file);
if (IS_ERR(folio))
return folio;
}
buf->page = folio_file_page(folio, index);
-
- if (buf->kmap_type == EROFS_NO_KMAP) {
- if (type == EROFS_KMAP)
- buf->base = kmap_local_page(buf->page);
- buf->kmap_type = type;
- } else if (buf->kmap_type != type) {
- DBG_BUGON(1);
- return ERR_PTR(-EFAULT);
- }
+ if (!buf->base && type == EROFS_KMAP)
+ buf->base = kmap_local_page(buf->page);
if (type == EROFS_NO_KMAP)
return NULL;
return buf->base + (offset & ~PAGE_MASK);
@@ -61,9 +54,11 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- if (erofs_is_fileio_mode(sbi))
- buf->mapping = file_inode(sbi->fdev)->i_mapping;
- else if (erofs_is_fscache_mode(sb))
+ buf->file = NULL;
+ if (erofs_is_fileio_mode(sbi)) {
+ buf->file = sbi->fdev; /* some filesystems like FUSE need it */
+ buf->mapping = buf->file->f_mapping;
+ } else if (erofs_is_fscache_mode(sb))
buf->mapping = sbi->s_fscache->inode->i_mapping;
else
buf->mapping = sb->s_bdev->bd_mapping;
@@ -350,7 +345,6 @@ static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
struct erofs_buf buf = {
.page = kmap_to_page(ptr),
.base = ptr,
- .kmap_type = EROFS_KMAP,
};
DBG_BUGON(iomap->type != IOMAP_INLINE);
@@ -411,22 +405,9 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (IS_DAX(inode))
return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
#endif
- if (iocb->ki_flags & IOCB_DIRECT) {
- struct block_device *bdev = inode->i_sb->s_bdev;
- unsigned int blksize_mask;
-
- if (bdev)
- blksize_mask = bdev_logical_block_size(bdev) - 1;
- else
- blksize_mask = i_blocksize(inode) - 1;
-
- if ((iocb->ki_pos | iov_iter_count(to) |
- iov_iter_alignment(to)) & blksize_mask)
- return -EINVAL;
-
+ if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
NULL, 0, NULL, 0);
- }
return filemap_read(iocb, to, 0);
}
@@ -473,8 +454,32 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
#define erofs_file_mmap generic_file_readonly_mmap
#endif
+static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ const struct iomap_ops *ops = &erofs_iomap_ops;
+
+ if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+#ifdef CONFIG_EROFS_FS_ZIP
+ ops = &z_erofs_iomap_report_ops;
+#else
+ return generic_file_llseek(file, offset, whence);
+#endif
+
+ if (whence == SEEK_HOLE)
+ offset = iomap_seek_hole(inode, offset, ops);
+ else if (whence == SEEK_DATA)
+ offset = iomap_seek_data(inode, offset, ops);
+ else
+ return generic_file_llseek(file, offset, whence);
+
+ if (offset < 0)
+ return offset;
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+}
+
const struct file_operations erofs_file_fops = {
- .llseek = generic_file_llseek,
+ .llseek = erofs_file_llseek,
.read_iter = erofs_file_read_iter,
.mmap = erofs_file_mmap,
.get_unmapped_area = thp_get_unmapped_area,
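A usage sketch (not part of this patch) of the SEEK_{DATA,HOLE} support wired up above, which now works on both compressed and uncompressed erofs files; the mount path is a placeholder:

/* seek_demo.c - walk the data extents of an erofs file */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/erofs/file", O_RDONLY); /* placeholder path */
	off_t data, hole = 0;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* lseek() fails with ENXIO once no further data exists */
	while ((data = lseek(fd, hole, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data: [%lld, %lld)\n", (long long)data,
		       (long long)hole);
	}
	close(fd);
	return 0;
}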
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index db29190656eb..d4b89407822a 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -318,6 +318,7 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags)
{
struct inode *const inode = d_inode(path->dentry);
+ struct block_device *bdev = inode->i_sb->s_bdev;
bool compressed =
erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout);
@@ -330,15 +331,14 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
/*
* Return the DIO alignment restrictions if requested.
*
- * In EROFS, STATX_DIOALIGN is not supported in ondemand mode and
- * compressed files, so in these cases we report no DIO support.
+ * In EROFS, STATX_DIOALIGN is only supported in bdev-based mode
+ * and uncompressed inodes, otherwise we report no DIO support.
*/
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
stat->result_mask |= STATX_DIOALIGN;
- if (!erofs_is_fscache_mode(inode->i_sb) && !compressed) {
- stat->dio_mem_align =
- bdev_logical_block_size(inode->i_sb->s_bdev);
- stat->dio_offset_align = stat->dio_mem_align;
+ if (bdev && !compressed) {
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
}
}
generic_fillattr(idmap, request_mask, inode, stat);
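For illustration (not from the patch), the newly reported alignments can be queried from userspace with statx(2); the path below is a placeholder, and STATX_DIOALIGN needs Linux 6.1+ plus a statx-aware libc:

/* dioalign_demo.c - report DIO alignment for an erofs file */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	/* only uncompressed files on bdev-based mounts report
	 * alignments after this change */
	if (statx(AT_FDCWD, "/mnt/erofs/file", 0, STATX_DIOALIGN, &stx)) {
		perror("statx");
		return 1;
	}
	if (stx.stx_mask & STATX_DIOALIGN)
		printf("mem align: %u, offset align: %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	else
		printf("no DIO support reported\n");
	return 0;
}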
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4efd578d7c62..1c847c30a918 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -20,18 +20,12 @@
#include <linux/iomap.h>
#include "erofs_fs.h"
-/* redefine pr_fmt "erofs: " */
-#undef pr_fmt
-#define pr_fmt(fmt) "erofs: " fmt
-
-__printf(3, 4) void _erofs_err(struct super_block *sb,
- const char *function, const char *fmt, ...);
+__printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...);
#define erofs_err(sb, fmt, ...) \
- _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__)
-__printf(3, 4) void _erofs_info(struct super_block *sb,
- const char *function, const char *fmt, ...);
+ _erofs_printk(sb, KERN_ERR fmt "\n", ##__VA_ARGS__)
#define erofs_info(sb, fmt, ...) \
- _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__)
+ _erofs_printk(sb, KERN_INFO fmt "\n", ##__VA_ARGS__)
+
#ifdef CONFIG_EROFS_FS_DEBUG
#define DBG_BUGON BUG_ON
#else
@@ -208,12 +202,6 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-/* basic unit of the workstation of a super_block */
-struct erofs_workgroup {
- pgoff_t index;
- struct lockref lockref;
-};
-
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
EROFS_KMAP, /* use kmap_local_page() to map the buffer */
@@ -221,9 +209,9 @@ enum erofs_kmap_type {
struct erofs_buf {
struct address_space *mapping;
+ struct file *file;
struct page *page;
void *base;
- enum erofs_kmap_type kmap_type;
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
@@ -456,20 +444,17 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
-void erofs_workgroup_put(struct erofs_workgroup *grp);
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index);
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp);
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+
+extern atomic_long_t erofs_global_shrink_cnt;
void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
int __init erofs_init_shrinker(void);
void erofs_exit_shrinker(void);
int __init z_erofs_init_subsystem(void);
void z_erofs_exit_subsystem(void);
-int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
- struct erofs_workgroup *egrp);
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
+ unsigned long nr_shrink);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags);
void *z_erofs_get_gbuf(unsigned int requiredpages);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index bed3dbe5b7cb..c235a8e4315e 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -18,37 +18,22 @@
static struct kmem_cache *erofs_inode_cachep __read_mostly;
-void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
+void _erofs_printk(struct super_block *sb, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
+ int level;
va_start(args, fmt);
- vaf.fmt = fmt;
+ level = printk_get_level(fmt);
+ vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
-
if (sb)
- pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
+ printk("%c%cerofs (device %s): %pV",
+ KERN_SOH_ASCII, level, sb->s_id, &vaf);
else
- pr_err("%s: %pV", func, &vaf);
- va_end(args);
-}
-
-void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- if (sb)
- pr_info("(device %s): %pV", sb->s_id, &vaf);
- else
- pr_info("%pV", &vaf);
+ printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf);
va_end(args);
}
@@ -631,7 +616,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
errorfc(fc, "unsupported blksize for fscache mode");
return -EINVAL;
}
- if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
+
+ if (erofs_is_fileio_mode(sbi)) {
+ sb->s_blocksize = 1 << sbi->blkszbits;
+ sb->s_blocksize_bits = sbi->blkszbits;
+ } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
errorfc(fc, "failed to set erofs blksize");
return -EINVAL;
}
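The consolidated _erofs_printk() leans on the standard printk convention: KERN_ERR and KERN_INFO expand to a KERN_SOH byte ('\001') plus a level digit, which printk_get_level()/printk_skip_level() extract and skip. A userspace mimic of that convention, for illustration only (these helpers imitate, and are not, the kernel ones):

/* levelfmt_demo.c - userspace mimic of the printk level prefix */
#include <stdio.h>

#define SOH "\001"            /* KERN_SOH */
#define MY_KERN_ERR  SOH "3"  /* same level digits the kernel uses */
#define MY_KERN_INFO SOH "6"

static char get_level(const char *fmt)
{
	/* cf. the kernel's printk_get_level(): level char or 0 */
	return (fmt[0] == '\001' && fmt[1]) ? fmt[1] : 0;
}

static const char *skip_level(const char *fmt)
{
	/* cf. the kernel's printk_skip_level(): step past the prefix */
	return get_level(fmt) ? fmt + 2 : fmt;
}

int main(void)
{
	const char *fmt = MY_KERN_ERR "failed to set erofs blksize\n";

	printf("level=%c text=%s", get_level(fmt), skip_level(fmt));
	return 0;
}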
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index 63cffd0fd261..19d586273b70 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -10,6 +10,7 @@
enum {
attr_feature,
+ attr_drop_caches,
attr_pointer_ui,
attr_pointer_bool,
};
@@ -57,11 +58,13 @@ static struct erofs_attr erofs_attr_##_name = { \
#ifdef CONFIG_EROFS_FS_ZIP
EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
+EROFS_ATTR_FUNC(drop_caches, 0200);
#endif
static struct attribute *erofs_attrs[] = {
#ifdef CONFIG_EROFS_FS_ZIP
ATTR_LIST(sync_decompress),
+ ATTR_LIST(drop_caches),
#endif
NULL,
};
@@ -163,6 +166,20 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
return -EINVAL;
*(bool *)ptr = !!t;
return len;
+#ifdef CONFIG_EROFS_FS_ZIP
+ case attr_drop_caches:
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t < 1 || t > 3)
+ return -EINVAL;
+
+ if (t & 2)
+ z_erofs_shrink_scan(sbi, ~0UL);
+ if (t & 1)
+ invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1);
+ return len;
+#endif
}
return 0;
}
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index a569ff9dfd04..01f147505487 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -44,12 +44,15 @@ __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
* A: Field should be accessed / updated in atomic for parallelized code.
*/
struct z_erofs_pcluster {
- struct erofs_workgroup obj;
struct mutex lock;
+ struct lockref lockref;
/* A: point to next chained pcluster or TAILs */
z_erofs_next_pcluster_t next;
+ /* I: start block address of this pcluster */
+ erofs_off_t index;
+
/* L: the maximum decompression size of this round */
unsigned int length;
@@ -108,7 +111,7 @@ struct z_erofs_decompressqueue {
static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
- return !pcl->obj.index;
+ return !pcl->index;
}
static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
@@ -116,7 +119,6 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
}
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
{
return fo->mapping == MNGD_MAPPING(sbi);
@@ -548,7 +550,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
- page = find_get_page(mc, pcl->obj.index + i);
+ page = find_get_page(mc, pcl->index + i);
if (!page) {
/* I/O is needed, not possible to decompress directly */
standalone = false;
@@ -564,13 +566,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
continue;
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
}
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
if (!pcl->compressed_bvecs[i].page) {
pcl->compressed_bvecs[i].page = page ? page : newpage;
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
continue;
}
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
if (page)
put_page(page);
@@ -587,11 +589,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
}
/* (erofs_shrinker) disconnect cached encoded data with pclusters */
-int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
+static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
struct folio *folio;
int i;
@@ -626,8 +626,8 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
return true;
ret = false;
- spin_lock(&pcl->obj.lockref.lock);
- if (pcl->obj.lockref.count <= 0) {
+ spin_lock(&pcl->lockref.lock);
+ if (pcl->lockref.count <= 0) {
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
for (; bvec < end; ++bvec) {
if (bvec->page && page_folio(bvec->page) == folio) {
@@ -638,7 +638,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
}
}
}
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
return ret;
}
@@ -689,15 +689,15 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
if (exclusive) {
/* give priority for inplaceio to use file pages first */
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
while (fe->icur > 0) {
if (pcl->compressed_bvecs[--fe->icur].page)
continue;
pcl->compressed_bvecs[fe->icur] = *bvec;
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
return 0;
}
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
@@ -710,13 +710,30 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
return ret;
}
+static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
+{
+ if (lockref_get_not_zero(&pcl->lockref))
+ return true;
+
+ spin_lock(&pcl->lockref.lock);
+ if (__lockref_is_dead(&pcl->lockref)) {
+ spin_unlock(&pcl->lockref.lock);
+ return false;
+ }
+
+ if (!pcl->lockref.count++)
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ spin_unlock(&pcl->lockref.lock);
+ return true;
+}
+
static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
bool ztailpacking = map->m_flags & EROFS_MAP_META;
- struct z_erofs_pcluster *pcl;
- struct erofs_workgroup *grp;
+ struct z_erofs_pcluster *pcl, *pre;
int err;
if (!(map->m_flags & EROFS_MAP_ENCODED) ||
@@ -730,8 +747,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- spin_lock_init(&pcl->obj.lockref.lock);
- pcl->obj.lockref.count = 1; /* one ref for this request */
+ spin_lock_init(&pcl->lockref.lock);
+ pcl->lockref.count = 1; /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
@@ -749,19 +766,26 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
DBG_BUGON(!mutex_trylock(&pcl->lock));
if (ztailpacking) {
- pcl->obj.index = 0; /* which indicates ztailpacking */
+ pcl->index = 0; /* which indicates ztailpacking */
} else {
- pcl->obj.index = erofs_blknr(sb, map->m_pa);
-
- grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
- if (IS_ERR(grp)) {
- err = PTR_ERR(grp);
- goto err_out;
+ pcl->index = erofs_blknr(sb, map->m_pa);
+ while (1) {
+ xa_lock(&sbi->managed_pslots);
+ pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
+ NULL, pcl, GFP_KERNEL);
+ if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
+ xa_unlock(&sbi->managed_pslots);
+ break;
+ }
+ /* try to legitimize the current in-tree one */
+ xa_unlock(&sbi->managed_pslots);
+ cond_resched();
}
-
- if (grp != &pcl->obj) {
- fe->pcl = container_of(grp,
- struct z_erofs_pcluster, obj);
+ if (xa_is_err(pre)) {
+ err = xa_err(pre);
+ goto err_out;
+ } else if (pre) {
+ fe->pcl = pre;
err = -EEXIST;
goto err_out;
}
@@ -781,7 +805,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
- struct erofs_workgroup *grp = NULL;
+ struct z_erofs_pcluster *pcl = NULL;
int ret;
DBG_BUGON(fe->pcl);
@@ -789,14 +813,23 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
if (!(map->m_flags & EROFS_MAP_META)) {
- grp = erofs_find_workgroup(sb, blknr);
+ while (1) {
+ rcu_read_lock();
+ pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
+ if (!pcl || z_erofs_get_pcluster(pcl)) {
+ DBG_BUGON(pcl && blknr != pcl->index);
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ }
} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- if (grp) {
- fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ if (pcl) {
+ fe->pcl = pcl;
ret = -EEXIST;
} else {
ret = z_erofs_register_pcluster(fe);
@@ -851,12 +884,87 @@ static void z_erofs_rcu_callback(struct rcu_head *head)
struct z_erofs_pcluster, rcu));
}
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
+ if (pcl->lockref.count)
+ return false;
- call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ /*
+ * Note that all cached folios should be detached before being deleted
+ * from the XArray. Otherwise some folios could still be attached to the
+ * orphan old pcluster when the new one is available in the tree.
+ */
+ if (erofs_try_to_free_all_cached_folios(sbi, pcl))
+ return false;
+
+ /*
+ * It's impossible to fail after the pcluster is frozen, but in order
+ * to avoid some race conditions, add a DBG_BUGON to observe this.
+ */
+ DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl);
+
+ lockref_mark_dead(&pcl->lockref);
+ return true;
+}
+
+static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
+{
+ bool free;
+
+ spin_lock(&pcl->lockref.lock);
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ spin_unlock(&pcl->lockref.lock);
+ if (free) {
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ }
+ return free;
+}
+
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
+ unsigned long nr_shrink)
+{
+ struct z_erofs_pcluster *pcl;
+ unsigned int freed = 0;
+ unsigned long index;
+
+ xa_lock(&sbi->managed_pslots);
+ xa_for_each(&sbi->managed_pslots, index, pcl) {
+ /* try to shrink each valid pcluster */
+ if (!erofs_try_to_release_pcluster(sbi, pcl))
+ continue;
+ xa_unlock(&sbi->managed_pslots);
+
+ ++freed;
+ if (!--nr_shrink)
+ return freed;
+ xa_lock(&sbi->managed_pslots);
+ }
+ xa_unlock(&sbi->managed_pslots);
+ return freed;
+}
+
+static void z_erofs_put_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl, bool try_free)
+{
+ bool free = false;
+
+ if (lockref_put_or_lock(&pcl->lockref))
+ return;
+
+ DBG_BUGON(__lockref_is_dead(&pcl->lockref));
+ if (!--pcl->lockref.count) {
+ if (try_free && xa_trylock(&sbi->managed_pslots)) {
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ xa_unlock(&sbi->managed_pslots);
+ }
+ atomic_long_add(!free, &erofs_global_shrink_cnt);
+ }
+ spin_unlock(&pcl->lockref.lock);
+ if (free)
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
@@ -877,7 +985,7 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
* any longer if the pcluster isn't hosted by ourselves.
*/
if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
- erofs_workgroup_put(&pcl->obj);
+ z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false);
fe->pcl = NULL;
}
@@ -1179,6 +1287,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
int i, j, jtop, err2;
struct page *page;
bool overlapped;
+ bool try_free = true;
mutex_lock(&pcl->lock);
be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
@@ -1236,9 +1345,12 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
/* managed folios are still left in compressed_bvecs[] */
for (i = 0; i < pclusterpages; ++i) {
page = be->compressed_pages[i];
- if (!page ||
- erofs_folio_is_managed(sbi, page_folio(page)))
+ if (!page)
+ continue;
+ if (erofs_folio_is_managed(sbi, page_folio(page))) {
+ try_free = false;
continue;
+ }
(void)z_erofs_put_shortlivedpage(be->pagepool, page);
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
}
@@ -1284,6 +1396,11 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
/* pcluster lock MUST be taken before the following line */
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
mutex_unlock(&pcl->lock);
+
+ if (z_erofs_is_inline_pcluster(pcl))
+ z_erofs_free_pcluster(pcl);
+ else
+ z_erofs_put_pcluster(sbi, pcl, try_free);
return err;
}
@@ -1306,10 +1423,6 @@ static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
owned = READ_ONCE(be.pcl->next);
err = z_erofs_decompress_pcluster(&be, err) ?: err;
- if (z_erofs_is_inline_pcluster(be.pcl))
- z_erofs_free_pcluster(be.pcl);
- else
- erofs_workgroup_put(&be.pcl->obj);
}
return err;
}
@@ -1391,9 +1504,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
zbv = pcl->compressed_bvecs[nr];
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
if (!zbv.page)
goto out_allocfolio;
@@ -1455,23 +1568,23 @@ repeat:
folio_put(folio);
out_allocfolio:
page = __erofs_allocpage(&f->pagepool, gfp, true);
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
if (page)
erofs_pagepool_add(&f->pagepool, page);
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
cond_resched();
goto repeat;
}
pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
bvec->bv_page = page;
if (!page)
return;
folio = page_folio(page);
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
- filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) {
+ filemap_add_folio(mc, folio, pcl->index + nr, gfp)) {
/* turn into a temporary shortlived folio (1 ref) */
folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
return;
@@ -1603,7 +1716,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
/* no device id here, thus it will always succeed */
mdev = (struct erofs_map_dev) {
- .m_pa = erofs_pos(sb, pcl->obj.index),
+ .m_pa = erofs_pos(sb, pcl->index),
};
(void)erofs_map_dev(sb, &mdev);
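z_erofs_get_pcluster() and z_erofs_put_pcluster() above follow the usual lockref discipline: a get that refuses already-dead objects and a put whose final reference may tear the object down. A much-simplified userspace analogue, for illustration only (a pthread mutex stands in for the lockref spinlock; no lockless fast path, no RCU):

/* refdemo.c - simplified analogue of the pcluster lifetime rules */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct ref {
	pthread_mutex_t lock;
	int count;               /* < 0 means dead, as lockref_mark_dead() */
};

static bool ref_get(struct ref *r)
{
	bool ok;

	pthread_mutex_lock(&r->lock);
	ok = r->count >= 0;      /* never resurrect a dead object */
	if (ok)
		r->count++;
	pthread_mutex_unlock(&r->lock);
	return ok;
}

static bool ref_put(struct ref *r)
{
	bool freed;

	pthread_mutex_lock(&r->lock);
	freed = (--r->count == 0);
	if (freed)
		r->count = -128; /* mark dead; later lookups must fail */
	pthread_mutex_unlock(&r->lock);
	return freed;            /* caller frees (the kernel defers to RCU) */
}

int main(void)
{
	struct ref r = { PTHREAD_MUTEX_INITIALIZER, 1 };

	printf("got: %d\n", ref_get(&r));   /* count 1 -> 2 */
	printf("freed: %d\n", ref_put(&r)); /* 2 -> 1, not freed */
	printf("freed: %d\n", ref_put(&r)); /* 1 -> 0, freed */
	return 0;
}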
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index a076cca1f547..4535f2f0a014 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -219,7 +219,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
unsigned int amortizedshift;
erofs_off_t pos;
- if (lcn >= totalidx)
+ if (lcn >= totalidx || vi->z_logical_clusterbits > 14)
return -EINVAL;
m->lcn = lcn;
@@ -390,7 +390,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
int err;
- do {
+ while (1) {
/* handle the last EOF pcluster (no next HEAD lcluster) */
if ((lcn << lclusterbits) >= inode->i_size) {
map->m_llen = inode->i_size - map->m_la;
@@ -402,14 +402,16 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return err;
if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
- DBG_BUGON(!m->delta[1] &&
- m->clusterofs != 1 << lclusterbits);
+ /* work around invalid d1 generated by pre-1.0 mkfs */
+ if (unlikely(!m->delta[1])) {
+ m->delta[1] = 1;
+ DBG_BUGON(1);
+ }
} else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 ||
m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
- /* go on until the next HEAD lcluster */
if (lcn != headlcn)
- break;
+ break; /* ends at the next HEAD lcluster */
m->delta[1] = 1;
} else {
erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
@@ -418,8 +420,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return -EOPNOTSUPP;
}
lcn += m->delta[1];
- } while (m->delta[1]);
-
+ }
map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
return 0;
}
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
index 37afe2024840..75704f58ecfa 100644
--- a/fs/erofs/zutil.c
+++ b/fs/erofs/zutil.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2024 Alibaba Cloud
*/
#include "internal.h"
@@ -19,13 +20,12 @@ static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages,
module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444);
-static atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
-/* protected by 'erofs_sb_list_lock' */
-static unsigned int shrinker_run_no;
+atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
-/* protects the mounted 'erofs_sb_list' */
+/* protects `shrinker_run_no` and the mounted `erofs_sb_list` */
static DEFINE_SPINLOCK(erofs_sb_list_lock);
static LIST_HEAD(erofs_sb_list);
+static unsigned int shrinker_run_no;
static struct shrinker *erofs_shrinker_info;
static unsigned int z_erofs_gbuf_id(void)
@@ -214,145 +214,6 @@ void erofs_release_pages(struct page **pagepool)
}
}
-static bool erofs_workgroup_get(struct erofs_workgroup *grp)
-{
- if (lockref_get_not_zero(&grp->lockref))
- return true;
-
- spin_lock(&grp->lockref.lock);
- if (__lockref_is_dead(&grp->lockref)) {
- spin_unlock(&grp->lockref.lock);
- return false;
- }
-
- if (!grp->lockref.count++)
- atomic_long_dec(&erofs_global_shrink_cnt);
- spin_unlock(&grp->lockref.lock);
- return true;
-}
-
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index)
-{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_workgroup *grp;
-
-repeat:
- rcu_read_lock();
- grp = xa_load(&sbi->managed_pslots, index);
- if (grp) {
- if (!erofs_workgroup_get(grp)) {
- /* prefer to relax rcu read side */
- rcu_read_unlock();
- goto repeat;
- }
-
- DBG_BUGON(index != grp->index);
- }
- rcu_read_unlock();
- return grp;
-}
-
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- struct erofs_workgroup *pre;
-
- DBG_BUGON(grp->lockref.count < 1);
-repeat:
- xa_lock(&sbi->managed_pslots);
- pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
- NULL, grp, GFP_KERNEL);
- if (pre) {
- if (xa_is_err(pre)) {
- pre = ERR_PTR(xa_err(pre));
- } else if (!erofs_workgroup_get(pre)) {
- /* try to legitimize the current in-tree one */
- xa_unlock(&sbi->managed_pslots);
- cond_resched();
- goto repeat;
- }
- grp = pre;
- }
- xa_unlock(&sbi->managed_pslots);
- return grp;
-}
-
-static void __erofs_workgroup_free(struct erofs_workgroup *grp)
-{
- atomic_long_dec(&erofs_global_shrink_cnt);
- erofs_workgroup_free_rcu(grp);
-}
-
-void erofs_workgroup_put(struct erofs_workgroup *grp)
-{
- if (lockref_put_or_lock(&grp->lockref))
- return;
-
- DBG_BUGON(__lockref_is_dead(&grp->lockref));
- if (grp->lockref.count == 1)
- atomic_long_inc(&erofs_global_shrink_cnt);
- --grp->lockref.count;
- spin_unlock(&grp->lockref.lock);
-}
-
-static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
-{
- int free = false;
-
- spin_lock(&grp->lockref.lock);
- if (grp->lockref.count)
- goto out;
-
- /*
- * Note that all cached pages should be detached before deleted from
- * the XArray. Otherwise some cached pages could be still attached to
- * the orphan old workgroup when the new one is available in the tree.
- */
- if (erofs_try_to_free_all_cached_folios(sbi, grp))
- goto out;
-
- /*
- * It's impossible to fail after the workgroup is freezed,
- * however in order to avoid some race conditions, add a
- * DBG_BUGON to observe this in advance.
- */
- DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
-
- lockref_mark_dead(&grp->lockref);
- free = true;
-out:
- spin_unlock(&grp->lockref.lock);
- if (free)
- __erofs_workgroup_free(grp);
- return free;
-}
-
-static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
- unsigned long nr_shrink)
-{
- struct erofs_workgroup *grp;
- unsigned int freed = 0;
- unsigned long index;
-
- xa_lock(&sbi->managed_pslots);
- xa_for_each(&sbi->managed_pslots, index, grp) {
- /* try to shrink each valid workgroup */
- if (!erofs_try_to_release_workgroup(sbi, grp))
- continue;
- xa_unlock(&sbi->managed_pslots);
-
- ++freed;
- if (!--nr_shrink)
- return freed;
- xa_lock(&sbi->managed_pslots);
- }
- xa_unlock(&sbi->managed_pslots);
- return freed;
-}
-
void erofs_shrinker_register(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -369,8 +230,8 @@ void erofs_shrinker_unregister(struct super_block *sb)
struct erofs_sb_info *const sbi = EROFS_SB(sb);
mutex_lock(&sbi->umount_mutex);
- /* clean up all remaining workgroups in memory */
- erofs_shrink_workstation(sbi, ~0UL);
+ /* clean up all remaining pclusters in memory */
+ z_erofs_shrink_scan(sbi, ~0UL);
spin_lock(&erofs_sb_list_lock);
list_del(&sbi->list);
@@ -418,9 +279,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
spin_unlock(&erofs_sb_list_lock);
sbi->shrinker_run_no = run_no;
-
- freed += erofs_shrink_workstation(sbi, nr - freed);
-
+ freed += z_erofs_shrink_scan(sbi, nr - freed);
spin_lock(&erofs_sb_list_lock);
/* Get the next list element before we move this one */
p = p->next;