commit 90a19b744de3a4fb51aee2edd8f2b9a4b14c9878
tree 97dc6486202ac774a0904f4ce36d152a6674a45a
parent fcc79e1714e8c2b8e216dc3149812edd37884eef
parent 0bc8061ffc733a0a246b8689b2d32a3e9204f43c
author Linus Torvalds <torvalds@linux-foundation.org> 2024-11-21 09:17:33 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2024-11-21 09:17:33 -0800
Merge tag 'erofs-for-6.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs
Pull erofs updates from Gao Xiang:
"There is no outstanding feature for this cycle. The most useful
changes are SEEK_{DATA,HOLE} support and some decompression
micro-optimization. Other than those, there are some bugfixes and
cleanups as usual:
- Add SEEK_{DATA,HOLE} support
- Free redundant pclusters if no cached compressed data is valid
- Add sysfs entry to drop internal caches
- Several bugfixes & cleanups"
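To make the headline feature concrete, here is a minimal userspace sketch
(illustrative only, not part of this pull; the mount point and file name
are placeholders) that walks the data extents which the new
SEEK_{DATA,HOLE} support exposes on EROFS files:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical EROFS mount; any regular file on it works */
	int fd = open("/mnt/erofs/file", O_RDONLY);
	off_t data, hole, pos = 0;

	if (fd < 0)
		return 1;
	/* lseek() fails with ENXIO once no data remains past 'pos' */
	while ((data = lseek(fd, pos, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);	/* end of this extent */
		if (hole < 0)
			break;
		printf("data extent: [%lld, %lld)\n",
		       (long long)data, (long long)hole);
		pos = hole;
	}
	close(fd);
	return 0;
}

For uncompressed inodes this is served by erofs_iomap_ops; for compressed
inodes the new llseek handler in the diff switches to
z_erofs_iomap_report_ops.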
* tag 'erofs-for-6.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs:
erofs: handle NONHEAD !delta[1] lclusters gracefully
erofs: clarify direct I/O support
erofs: fix blksize < PAGE_SIZE for file-backed mounts
erofs: get rid of `buf->kmap_type`
erofs: fix file-backed mounts over FUSE
erofs: simplify definition of the log functions
erofs: add sysfs node to drop internal caches
erofs: free pclusters if no cached folio is attached
erofs: sunset `struct erofs_workgroup`
erofs: move erofs_workgroup operations into zdata.c
erofs: get rid of erofs_{find,insert}_workgroup
erofs: add SEEK_{DATA,HOLE} support
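The "add sysfs node to drop internal caches" entry above can be exercised
with a plain write to the new attribute; a rough sketch, where "sda" stands
in for the actual <disk> directory name under /sys/fs/erofs (the accepted
values come from the ABI document in the diff below):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* placeholder device directory name */
	int fd = open("/sys/fs/erofs/sda/drop_caches", O_WRONLY);

	if (fd < 0)
		return 1;
	/* "1": drop cached compressed folios, "2": drop pclusters, "3": both */
	if (write(fd, "3", 1) != 1) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}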
-rw-r--r--   Documentation/ABI/testing/sysfs-fs-erofs |  11
-rw-r--r--   fs/erofs/data.c                          |  69
-rw-r--r--   fs/erofs/inode.c                         |  12
-rw-r--r--   fs/erofs/internal.h                      |  35
-rw-r--r--   fs/erofs/super.c                         |  35
-rw-r--r--   fs/erofs/sysfs.c                         |  17
-rw-r--r--   fs/erofs/zdata.c                         | 221
-rw-r--r--   fs/erofs/zmap.c                          |  17
-rw-r--r--   fs/erofs/zutil.c                         | 155
9 files changed, 276 insertions(+), 296 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-fs-erofs b/Documentation/ABI/testing/sysfs-fs-erofs
index 284224d1b56f..b134146d735b 100644
--- a/Documentation/ABI/testing/sysfs-fs-erofs
+++ b/Documentation/ABI/testing/sysfs-fs-erofs
@@ -16,3 +16,14 @@ Description:	Control strategy of sync decompression:
 				     readahead on atomic contexts only.
 		- 1 (force on): enable for readpage and readahead.
 		- 2 (force off): disable for all situations.
+
+What:		/sys/fs/erofs/<disk>/drop_caches
+Date:		November 2024
+Contact:	"Guo Chunhai" <guochunhai@vivo.com>
+Description:	Writing to this will drop compression-related caches,
+		currently used to drop in-memory pclusters and cached
+		compressed folios:
+
+		- 1 : invalidate cached compressed folios
+		- 2 : drop in-memory pclusters
+		- 3 : drop in-memory pclusters and cached compressed folios
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 61debd799cf9..1c49f8962021 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -10,10 +10,10 @@
 
 void erofs_unmap_metabuf(struct erofs_buf *buf)
 {
-	if (buf->kmap_type == EROFS_KMAP)
-		kunmap_local(buf->base);
+	if (!buf->base)
+		return;
+	kunmap_local(buf->base);
 	buf->base = NULL;
-	buf->kmap_type = EROFS_NO_KMAP;
 }
 
 void erofs_put_metabuf(struct erofs_buf *buf)
@@ -38,20 +38,13 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
 	}
 	if (!folio || !folio_contains(folio, index)) {
 		erofs_put_metabuf(buf);
-		folio = read_mapping_folio(buf->mapping, index, NULL);
+		folio = read_mapping_folio(buf->mapping, index, buf->file);
 		if (IS_ERR(folio))
 			return folio;
 	}
 	buf->page = folio_file_page(folio, index);
-
-	if (buf->kmap_type == EROFS_NO_KMAP) {
-		if (type == EROFS_KMAP)
-			buf->base = kmap_local_page(buf->page);
-		buf->kmap_type = type;
-	} else if (buf->kmap_type != type) {
-		DBG_BUGON(1);
-		return ERR_PTR(-EFAULT);
-	}
+	if (!buf->base && type == EROFS_KMAP)
+		buf->base = kmap_local_page(buf->page);
 	if (type == EROFS_NO_KMAP)
 		return NULL;
 	return buf->base + (offset & ~PAGE_MASK);
@@ -61,9 +54,11 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
 
-	if (erofs_is_fileio_mode(sbi))
-		buf->mapping = file_inode(sbi->fdev)->i_mapping;
-	else if (erofs_is_fscache_mode(sb))
+	buf->file = NULL;
+	if (erofs_is_fileio_mode(sbi)) {
+		buf->file = sbi->fdev;		/* some fs like FUSE needs it */
+		buf->mapping = buf->file->f_mapping;
+	} else if (erofs_is_fscache_mode(sb))
 		buf->mapping = sbi->s_fscache->inode->i_mapping;
 	else
 		buf->mapping = sb->s_bdev->bd_mapping;
@@ -350,7 +345,6 @@ static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 		struct erofs_buf buf = {
 			.page = kmap_to_page(ptr),
 			.base = ptr,
-			.kmap_type = EROFS_KMAP,
 		};
 
 		DBG_BUGON(iomap->type != IOMAP_INLINE);
@@ -411,22 +405,9 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (IS_DAX(inode))
 		return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
 #endif
-	if (iocb->ki_flags & IOCB_DIRECT) {
-		struct block_device *bdev = inode->i_sb->s_bdev;
-		unsigned int blksize_mask;
-
-		if (bdev)
-			blksize_mask = bdev_logical_block_size(bdev) - 1;
-		else
-			blksize_mask = i_blocksize(inode) - 1;
-
-		if ((iocb->ki_pos | iov_iter_count(to) |
-		     iov_iter_alignment(to)) & blksize_mask)
-			return -EINVAL;
-
+	if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev)
 		return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
 				    NULL, 0, NULL, 0);
-	}
 	return filemap_read(iocb, to, 0);
 }
 
@@ -473,8 +454,32 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
 #define erofs_file_mmap	generic_file_readonly_mmap
 #endif
 
+static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	const struct iomap_ops *ops = &erofs_iomap_ops;
+
+	if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+#ifdef CONFIG_EROFS_FS_ZIP
+		ops = &z_erofs_iomap_report_ops;
+#else
+		return generic_file_llseek(file, offset, whence);
+#endif
+
+	if (whence == SEEK_HOLE)
+		offset = iomap_seek_hole(inode, offset, ops);
+	else if (whence == SEEK_DATA)
+		offset = iomap_seek_data(inode, offset, ops);
+	else
+		return generic_file_llseek(file, offset, whence);
+
+	if (offset < 0)
+		return offset;
+	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+}
+
 const struct file_operations erofs_file_fops = {
-	.llseek		= generic_file_llseek,
+	.llseek		= erofs_file_llseek,
 	.read_iter	= erofs_file_read_iter,
 	.mmap		= erofs_file_mmap,
 	.get_unmapped_area = thp_get_unmapped_area,
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index db29190656eb..d4b89407822a 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -318,6 +318,7 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
 		  unsigned int query_flags)
 {
 	struct inode *const inode = d_inode(path->dentry);
+	struct block_device *bdev = inode->i_sb->s_bdev;
 	bool compressed =
 		erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout);
 
@@ -330,15 +331,14 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
 	/*
 	 * Return the DIO alignment restrictions if requested.
 	 *
-	 * In EROFS, STATX_DIOALIGN is not supported in ondemand mode and
-	 * compressed files, so in these cases we report no DIO support.
+	 * In EROFS, STATX_DIOALIGN is only supported in bdev-based mode
+	 * and uncompressed inodes, otherwise we report no DIO support.
 	 */
 	if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
 		stat->result_mask |= STATX_DIOALIGN;
-		if (!erofs_is_fscache_mode(inode->i_sb) && !compressed) {
-			stat->dio_mem_align =
-				bdev_logical_block_size(inode->i_sb->s_bdev);
-			stat->dio_offset_align = stat->dio_mem_align;
+		if (bdev && !compressed) {
+			stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+			stat->dio_offset_align = bdev_logical_block_size(bdev);
 		}
 	}
 	generic_fillattr(idmap, request_mask, inode, stat);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4efd578d7c62..1c847c30a918 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -20,18 +20,12 @@
 #include <linux/iomap.h>
 #include "erofs_fs.h"
 
-/* redefine pr_fmt "erofs: " */
-#undef pr_fmt
-#define pr_fmt(fmt) "erofs: " fmt
-
-__printf(3, 4) void _erofs_err(struct super_block *sb,
-			       const char *function, const char *fmt, ...);
+__printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...);
 #define erofs_err(sb, fmt, ...)	\
-	_erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__)
-__printf(3, 4) void _erofs_info(struct super_block *sb,
-				const char *function, const char *fmt, ...);
+	_erofs_printk(sb, KERN_ERR fmt "\n", ##__VA_ARGS__)
 #define erofs_info(sb, fmt, ...)	\
-	_erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__)
+	_erofs_printk(sb, KERN_INFO fmt "\n", ##__VA_ARGS__)
+
 #ifdef CONFIG_EROFS_FS_DEBUG
 #define DBG_BUGON		BUG_ON
 #else
@@ -208,12 +202,6 @@ enum {
 	EROFS_ZIP_CACHE_READAROUND
 };
 
-/* basic unit of the workstation of a super_block */
-struct erofs_workgroup {
-	pgoff_t index;
-	struct lockref lockref;
-};
-
 enum erofs_kmap_type {
 	EROFS_NO_KMAP,		/* don't map the buffer */
 	EROFS_KMAP,		/* use kmap_local_page() to map the buffer */
@@ -221,9 +209,9 @@ enum erofs_kmap_type {
 
 struct erofs_buf {
 	struct address_space *mapping;
+	struct file *file;
 	struct page *page;
 	void *base;
-	enum erofs_kmap_type kmap_type;
};
 #define __EROFS_BUF_INITIALIZER	((struct erofs_buf){ .page = NULL })
 
@@ -456,20 +444,17 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
 void erofs_release_pages(struct page **pagepool);
 
 #ifdef CONFIG_EROFS_FS_ZIP
-void erofs_workgroup_put(struct erofs_workgroup *grp);
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
-					     pgoff_t index);
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
-					       struct erofs_workgroup *grp);
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+#define MNGD_MAPPING(sbi)	((sbi)->managed_cache->i_mapping)
+
+extern atomic_long_t erofs_global_shrink_cnt;
 void erofs_shrinker_register(struct super_block *sb);
 void erofs_shrinker_unregister(struct super_block *sb);
 int __init erofs_init_shrinker(void);
 void erofs_exit_shrinker(void);
 int __init z_erofs_init_subsystem(void);
 void z_erofs_exit_subsystem(void);
-int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
-					struct erofs_workgroup *egrp);
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
+				  unsigned long nr_shrink);
 int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
 			    int flags);
 void *z_erofs_get_gbuf(unsigned int requiredpages);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index bed3dbe5b7cb..c235a8e4315e 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -18,37 +18,22 @@
 
 static struct kmem_cache *erofs_inode_cachep __read_mostly;
 
-void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
+void _erofs_printk(struct super_block *sb, const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
+	int level;
 
 	va_start(args, fmt);
 
-	vaf.fmt = fmt;
+	level = printk_get_level(fmt);
+	vaf.fmt = printk_skip_level(fmt);
 	vaf.va = &args;
-
 	if (sb)
-		pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
+		printk("%c%cerofs (device %s): %pV",
+				KERN_SOH_ASCII, level, sb->s_id, &vaf);
 	else
-		pr_err("%s: %pV", func, &vaf);
-	va_end(args);
-}
-
-void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	if (sb)
-		pr_info("(device %s): %pV", sb->s_id, &vaf);
-	else
-		pr_info("%pV", &vaf);
+		printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf);
 	va_end(args);
 }
 
@@ -631,7 +616,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 			errorfc(fc, "unsupported blksize for fscache mode");
 			return -EINVAL;
 		}
-		if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
+
+		if (erofs_is_fileio_mode(sbi)) {
+			sb->s_blocksize = 1 << sbi->blkszbits;
+			sb->s_blocksize_bits = sbi->blkszbits;
+		} else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
 			errorfc(fc, "failed to set erofs blksize");
 			return -EINVAL;
 		}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index 63cffd0fd261..19d586273b70 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -10,6 +10,7 @@
 
 enum {
 	attr_feature,
+	attr_drop_caches,
 	attr_pointer_ui,
 	attr_pointer_bool,
 };
@@ -57,11 +58,13 @@ static struct erofs_attr erofs_attr_##_name = {	\
 
 #ifdef CONFIG_EROFS_FS_ZIP
 EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
+EROFS_ATTR_FUNC(drop_caches, 0200);
 #endif
 
 static struct attribute *erofs_attrs[] = {
 #ifdef CONFIG_EROFS_FS_ZIP
 	ATTR_LIST(sync_decompress),
+	ATTR_LIST(drop_caches),
 #endif
 	NULL,
 };
@@ -163,6 +166,20 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
 			return -EINVAL;
 		*(bool *)ptr = !!t;
 		return len;
+#ifdef CONFIG_EROFS_FS_ZIP
+	case attr_drop_caches:
+		ret = kstrtoul(skip_spaces(buf), 0, &t);
+		if (ret)
+			return ret;
+		if (t < 1 || t > 3)
+			return -EINVAL;
+
+		if (t & 2)
+			z_erofs_shrink_scan(sbi, ~0UL);
+		if (t & 1)
+			invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1);
+		return len;
+#endif
 	}
 	return 0;
 }
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index a569ff9dfd04..01f147505487 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -44,12 +44,15 @@ __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
  * A: Field should be accessed / updated in atomic for parallelized code.
  */
 struct z_erofs_pcluster {
-	struct erofs_workgroup obj;
 	struct mutex lock;
+	struct lockref lockref;
 
 	/* A: point to next chained pcluster or TAILs */
 	z_erofs_next_pcluster_t next;
 
+	/* I: start block address of this pcluster */
+	erofs_off_t index;
+
 	/* L: the maximum decompression size of this round */
 	unsigned int length;
 
@@ -108,7 +111,7 @@ struct z_erofs_decompressqueue {
 
 static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
 {
-	return !pcl->obj.index;
+	return !pcl->index;
 }
 
 static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
@@ -116,7 +119,6 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
 	return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
 }
 
-#define MNGD_MAPPING(sbi)	((sbi)->managed_cache->i_mapping)
 static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
 {
 	return fo->mapping == MNGD_MAPPING(sbi);
@@ -548,7 +550,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
 		if (READ_ONCE(pcl->compressed_bvecs[i].page))
 			continue;
 
-		page = find_get_page(mc, pcl->obj.index + i);
+		page = find_get_page(mc, pcl->index + i);
 		if (!page) {
 			/* I/O is needed, no possible to decompress directly */
 			standalone = false;
@@ -564,13 +566,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
 				continue;
 			set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
 		}
-		spin_lock(&pcl->obj.lockref.lock);
+		spin_lock(&pcl->lockref.lock);
 		if (!pcl->compressed_bvecs[i].page) {
 			pcl->compressed_bvecs[i].page = page ? page : newpage;
-			spin_unlock(&pcl->obj.lockref.lock);
+			spin_unlock(&pcl->lockref.lock);
 			continue;
 		}
-		spin_unlock(&pcl->obj.lockref.lock);
+		spin_unlock(&pcl->lockref.lock);
 
 		if (page)
 			put_page(page);
@@ -587,11 +589,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
 }
 
 /* (erofs_shrinker) disconnect cached encoded data with pclusters */
-int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
-					struct erofs_workgroup *grp)
+static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+					       struct z_erofs_pcluster *pcl)
 {
-	struct z_erofs_pcluster *const pcl =
-		container_of(grp, struct z_erofs_pcluster, obj);
 	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
 	struct folio *folio;
 	int i;
@@ -626,8 +626,8 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
 		return true;
 
 	ret = false;
-	spin_lock(&pcl->obj.lockref.lock);
-	if (pcl->obj.lockref.count <= 0) {
+	spin_lock(&pcl->lockref.lock);
+	if (pcl->lockref.count <= 0) {
 		DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
 		for (; bvec < end; ++bvec) {
 			if (bvec->page && page_folio(bvec->page) == folio) {
@@ -638,7 +638,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
 			}
 		}
 	}
-	spin_unlock(&pcl->obj.lockref.lock);
+	spin_unlock(&pcl->lockref.lock);
 	return ret;
 }
 
@@ -689,15 +689,15 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
 
 	if (exclusive) {
 		/* give priority for inplaceio to use file pages first */
-		spin_lock(&pcl->obj.lockref.lock);
+		spin_lock(&pcl->lockref.lock);
 		while (fe->icur > 0) {
 			if (pcl->compressed_bvecs[--fe->icur].page)
 				continue;
 			pcl->compressed_bvecs[fe->icur] = *bvec;
-			spin_unlock(&pcl->obj.lockref.lock);
+			spin_unlock(&pcl->lockref.lock);
 			return 0;
 		}
-		spin_unlock(&pcl->obj.lockref.lock);
+		spin_unlock(&pcl->lockref.lock);
 
 		/* otherwise, check if it can be used as a bvpage */
 		if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
@@ -710,13 +710,30 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
 	return ret;
 }
 
+static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
+{
+	if (lockref_get_not_zero(&pcl->lockref))
+		return true;
+
+	spin_lock(&pcl->lockref.lock);
+	if (__lockref_is_dead(&pcl->lockref)) {
+		spin_unlock(&pcl->lockref.lock);
+		return false;
+	}
+
+	if (!pcl->lockref.count++)
+		atomic_long_dec(&erofs_global_shrink_cnt);
+	spin_unlock(&pcl->lockref.lock);
+	return true;
+}
+
 static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
 {
 	struct erofs_map_blocks *map = &fe->map;
 	struct super_block *sb = fe->inode->i_sb;
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
 	bool ztailpacking = map->m_flags & EROFS_MAP_META;
-	struct z_erofs_pcluster *pcl;
-	struct erofs_workgroup *grp;
+	struct z_erofs_pcluster *pcl, *pre;
 	int err;
 
 	if (!(map->m_flags & EROFS_MAP_ENCODED) ||
@@ -730,8 +747,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
 	if (IS_ERR(pcl))
 		return PTR_ERR(pcl);
 
-	spin_lock_init(&pcl->obj.lockref.lock);
-	pcl->obj.lockref.count = 1;	/* one ref for this request */
+	spin_lock_init(&pcl->lockref.lock);
+	pcl->lockref.count = 1;		/* one ref for this request */
 	pcl->algorithmformat = map->m_algorithmformat;
 	pcl->length = 0;
 	pcl->partial = true;
@@ -749,19 +766,26 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
 	DBG_BUGON(!mutex_trylock(&pcl->lock));
 
 	if (ztailpacking) {
-		pcl->obj.index = 0;	/* which indicates ztailpacking */
+		pcl->index = 0;		/* which indicates ztailpacking */
 	} else {
-		pcl->obj.index = erofs_blknr(sb, map->m_pa);
-
-		grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
-		if (IS_ERR(grp)) {
-			err = PTR_ERR(grp);
-			goto err_out;
+		pcl->index = erofs_blknr(sb, map->m_pa);
+		while (1) {
+			xa_lock(&sbi->managed_pslots);
+			pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
+					   NULL, pcl, GFP_KERNEL);
+			if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
+				xa_unlock(&sbi->managed_pslots);
+				break;
+			}
+			/* try to legitimize the current in-tree one */
+			xa_unlock(&sbi->managed_pslots);
+			cond_resched();
 		}
-
-		if (grp != &pcl->obj) {
-			fe->pcl = container_of(grp,
-					struct z_erofs_pcluster, obj);
+		if (xa_is_err(pre)) {
+			err = xa_err(pre);
+			goto err_out;
+		} else if (pre) {
+			fe->pcl = pre;
 			err = -EEXIST;
 			goto err_out;
 		}
@@ -781,7 +805,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
 	struct erofs_map_blocks *map = &fe->map;
 	struct super_block *sb = fe->inode->i_sb;
 	erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
-	struct erofs_workgroup *grp = NULL;
+	struct z_erofs_pcluster *pcl = NULL;
 	int ret;
 
 	DBG_BUGON(fe->pcl);
@@ -789,14 +813,23 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
 	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
 
 	if (!(map->m_flags & EROFS_MAP_META)) {
-		grp = erofs_find_workgroup(sb, blknr);
+		while (1) {
+			rcu_read_lock();
+			pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
+			if (!pcl || z_erofs_get_pcluster(pcl)) {
+				DBG_BUGON(pcl && blknr != pcl->index);
+				rcu_read_unlock();
+				break;
+			}
+			rcu_read_unlock();
+		}
 	} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
 		DBG_BUGON(1);
 		return -EFSCORRUPTED;
 	}
 
-	if (grp) {
-		fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+	if (pcl) {
+		fe->pcl = pcl;
 		ret = -EEXIST;
 	} else {
 		ret = z_erofs_register_pcluster(fe);
@@ -851,12 +884,87 @@ static void z_erofs_rcu_callback(struct rcu_head *head)
 			struct z_erofs_pcluster, rcu));
 }
 
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+					    struct z_erofs_pcluster *pcl)
 {
-	struct z_erofs_pcluster *const pcl =
-		container_of(grp, struct z_erofs_pcluster, obj);
+	if (pcl->lockref.count)
+		return false;
 
-	call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+	/*
+	 * Note that all cached folios should be detached before deleted from
+	 * the XArray. Otherwise some folios could be still attached to the
+	 * orphan old pcluster when the new one is available in the tree.
+	 */
+	if (erofs_try_to_free_all_cached_folios(sbi, pcl))
+		return false;
+
+	/*
+	 * It's impossible to fail after the pcluster is freezed, but in order
+	 * to avoid some race conditions, add a DBG_BUGON to observe this.
+	 */
+	DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl);
+
+	lockref_mark_dead(&pcl->lockref);
+	return true;
+}
+
+static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+					  struct z_erofs_pcluster *pcl)
+{
+	bool free;
+
+	spin_lock(&pcl->lockref.lock);
+	free = __erofs_try_to_release_pcluster(sbi, pcl);
+	spin_unlock(&pcl->lockref.lock);
+	if (free) {
+		atomic_long_dec(&erofs_global_shrink_cnt);
+		call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+	}
+	return free;
+}
+
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
+				  unsigned long nr_shrink)
+{
+	struct z_erofs_pcluster *pcl;
+	unsigned int freed = 0;
+	unsigned long index;
+
+	xa_lock(&sbi->managed_pslots);
+	xa_for_each(&sbi->managed_pslots, index, pcl) {
+		/* try to shrink each valid pcluster */
+		if (!erofs_try_to_release_pcluster(sbi, pcl))
+			continue;
+		xa_unlock(&sbi->managed_pslots);
+
+		++freed;
+		if (!--nr_shrink)
+			return freed;
+		xa_lock(&sbi->managed_pslots);
+	}
+	xa_unlock(&sbi->managed_pslots);
+	return freed;
+}
+
+static void z_erofs_put_pcluster(struct erofs_sb_info *sbi,
+		struct z_erofs_pcluster *pcl, bool try_free)
+{
+	bool free = false;
+
+	if (lockref_put_or_lock(&pcl->lockref))
+		return;
+
+	DBG_BUGON(__lockref_is_dead(&pcl->lockref));
+	if (!--pcl->lockref.count) {
+		if (try_free && xa_trylock(&sbi->managed_pslots)) {
+			free = __erofs_try_to_release_pcluster(sbi, pcl);
+			xa_unlock(&sbi->managed_pslots);
+		}
+		atomic_long_add(!free, &erofs_global_shrink_cnt);
+	}
+	spin_unlock(&pcl->lockref.lock);
+	if (free)
+		call_rcu(&pcl->rcu, z_erofs_rcu_callback);
 }
 
 static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
@@ -877,7 +985,7 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
 	 * any longer if the pcluster isn't hosted by ourselves.
 	 */
 	if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
-		erofs_workgroup_put(&pcl->obj);
+		z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false);
 
 	fe->pcl = NULL;
 }
@@ -1179,6 +1287,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 	int i, j, jtop, err2;
 	struct page *page;
 	bool overlapped;
+	bool try_free = true;
 
 	mutex_lock(&pcl->lock);
 	be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
@@ -1236,9 +1345,12 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 		/* managed folios are still left in compressed_bvecs[] */
 		for (i = 0; i < pclusterpages; ++i) {
 			page = be->compressed_pages[i];
-			if (!page ||
-			    erofs_folio_is_managed(sbi, page_folio(page)))
+			if (!page)
+				continue;
+			if (erofs_folio_is_managed(sbi, page_folio(page))) {
+				try_free = false;
 				continue;
+			}
 			(void)z_erofs_put_shortlivedpage(be->pagepool, page);
 			WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
 		}
@@ -1284,6 +1396,11 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 	/* pcluster lock MUST be taken before the following line */
 	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
 	mutex_unlock(&pcl->lock);
+
+	if (z_erofs_is_inline_pcluster(pcl))
+		z_erofs_free_pcluster(pcl);
+	else
+		z_erofs_put_pcluster(sbi, pcl, try_free);
 	return err;
 }
 
@@ -1306,10 +1423,6 @@ static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
 
 		owned = READ_ONCE(be.pcl->next);
 		err = z_erofs_decompress_pcluster(&be, err) ?: err;
-		if (z_erofs_is_inline_pcluster(be.pcl))
-			z_erofs_free_pcluster(be.pcl);
-		else
-			erofs_workgroup_put(&be.pcl->obj);
 	}
 	return err;
 }
@@ -1391,9 +1504,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
 	bvec->bv_offset = 0;
 	bvec->bv_len = PAGE_SIZE;
 repeat:
-	spin_lock(&pcl->obj.lockref.lock);
+	spin_lock(&pcl->lockref.lock);
 	zbv = pcl->compressed_bvecs[nr];
-	spin_unlock(&pcl->obj.lockref.lock);
+	spin_unlock(&pcl->lockref.lock);
 	if (!zbv.page)
 		goto out_allocfolio;
 
@@ -1455,23 +1568,23 @@ repeat:
 	folio_put(folio);
 out_allocfolio:
 	page = __erofs_allocpage(&f->pagepool, gfp, true);
-	spin_lock(&pcl->obj.lockref.lock);
+	spin_lock(&pcl->lockref.lock);
 	if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
 		if (page)
 			erofs_pagepool_add(&f->pagepool, page);
-		spin_unlock(&pcl->obj.lockref.lock);
+		spin_unlock(&pcl->lockref.lock);
 		cond_resched();
 		goto repeat;
 	}
 	pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
-	spin_unlock(&pcl->obj.lockref.lock);
+	spin_unlock(&pcl->lockref.lock);
 	bvec->bv_page = page;
 	if (!page)
 		return;
 	folio = page_folio(page);
 out_tocache:
 	if (!tocache || bs != PAGE_SIZE ||
-	    filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) {
+	    filemap_add_folio(mc, folio, pcl->index + nr, gfp)) {
 		/* turn into a temporary shortlived folio (1 ref) */
 		folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
 		return;
@@ -1603,7 +1716,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
 
 			/* no device id here, thus it will always succeed */
 			mdev = (struct erofs_map_dev) {
-				.m_pa = erofs_pos(sb, pcl->obj.index),
+				.m_pa = erofs_pos(sb, pcl->index),
 			};
 			(void)erofs_map_dev(sb, &mdev);
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index a076cca1f547..4535f2f0a014 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -219,7 +219,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
 	unsigned int amortizedshift;
 	erofs_off_t pos;
 
-	if (lcn >= totalidx)
+	if (lcn >= totalidx || vi->z_logical_clusterbits > 14)
 		return -EINVAL;
 
 	m->lcn = lcn;
@@ -390,7 +390,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
 	u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
 	int err;
 
-	do {
+	while (1) {
 		/* handle the last EOF pcluster (no next HEAD lcluster) */
 		if ((lcn << lclusterbits) >= inode->i_size) {
 			map->m_llen = inode->i_size - map->m_la;
@@ -402,14 +402,16 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
 			return err;
 
 		if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
-			DBG_BUGON(!m->delta[1] &&
-				  m->clusterofs != 1 << lclusterbits);
+			/* work around invalid d1 generated by pre-1.0 mkfs */
+			if (unlikely(!m->delta[1])) {
+				m->delta[1] = 1;
+				DBG_BUGON(1);
+			}
 		} else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
 			   m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 ||
 			   m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
-			/* go on until the next HEAD lcluster */
 			if (lcn != headlcn)
-				break;
+				break;	/* ends at the next HEAD lcluster */
 			m->delta[1] = 1;
 		} else {
 			erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
@@ -418,8 +420,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
 			return -EOPNOTSUPP;
 		}
 		lcn += m->delta[1];
-	} while (m->delta[1]);
-
+	}
 	map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
 	return 0;
 }
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
index 37afe2024840..75704f58ecfa 100644
--- a/fs/erofs/zutil.c
+++ b/fs/erofs/zutil.c
@@ -2,6 +2,7 @@
 /*
  * Copyright (C) 2018 HUAWEI, Inc.
  *             https://www.huawei.com/
+ * Copyright (C) 2024 Alibaba Cloud
  */
 #include "internal.h"
 
@@ -19,13 +20,12 @@ static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages,
 module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
 module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444);
 
-static atomic_long_t erofs_global_shrink_cnt;	/* for all mounted instances */
-/* protected by 'erofs_sb_list_lock' */
-static unsigned int shrinker_run_no;
+atomic_long_t erofs_global_shrink_cnt;	/* for all mounted instances */
 
-/* protects the mounted 'erofs_sb_list' */
+/* protects `erofs_sb_list_lock` and the mounted `erofs_sb_list` */
 static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
+static unsigned int shrinker_run_no;
 static struct shrinker *erofs_shrinker_info;
 
 static unsigned int z_erofs_gbuf_id(void)
@@ -214,145 +214,6 @@ void erofs_release_pages(struct page **pagepool)
 	}
 }
 
-static bool erofs_workgroup_get(struct erofs_workgroup *grp)
-{
-	if (lockref_get_not_zero(&grp->lockref))
-		return true;
-
-	spin_lock(&grp->lockref.lock);
-	if (__lockref_is_dead(&grp->lockref)) {
-		spin_unlock(&grp->lockref.lock);
-		return false;
-	}
-
-	if (!grp->lockref.count++)
-		atomic_long_dec(&erofs_global_shrink_cnt);
-	spin_unlock(&grp->lockref.lock);
-	return true;
-}
-
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
-					     pgoff_t index)
-{
-	struct erofs_sb_info *sbi = EROFS_SB(sb);
-	struct erofs_workgroup *grp;
-
-repeat:
-	rcu_read_lock();
-	grp = xa_load(&sbi->managed_pslots, index);
-	if (grp) {
-		if (!erofs_workgroup_get(grp)) {
-			/* prefer to relax rcu read side */
-			rcu_read_unlock();
-			goto repeat;
-		}
-
-		DBG_BUGON(index != grp->index);
-	}
-	rcu_read_unlock();
-	return grp;
-}
-
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
-					       struct erofs_workgroup *grp)
-{
-	struct erofs_sb_info *const sbi = EROFS_SB(sb);
-	struct erofs_workgroup *pre;
-
-	DBG_BUGON(grp->lockref.count < 1);
-repeat:
-	xa_lock(&sbi->managed_pslots);
-	pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
-			   NULL, grp, GFP_KERNEL);
-	if (pre) {
-		if (xa_is_err(pre)) {
-			pre = ERR_PTR(xa_err(pre));
-		} else if (!erofs_workgroup_get(pre)) {
-			/* try to legitimize the current in-tree one */
-			xa_unlock(&sbi->managed_pslots);
-			cond_resched();
-			goto repeat;
-		}
-		grp = pre;
-	}
-	xa_unlock(&sbi->managed_pslots);
-	return grp;
-}
-
-static void __erofs_workgroup_free(struct erofs_workgroup *grp)
-{
-	atomic_long_dec(&erofs_global_shrink_cnt);
-	erofs_workgroup_free_rcu(grp);
-}
-
-void erofs_workgroup_put(struct erofs_workgroup *grp)
-{
-	if (lockref_put_or_lock(&grp->lockref))
-		return;
-
-	DBG_BUGON(__lockref_is_dead(&grp->lockref));
-	if (grp->lockref.count == 1)
-		atomic_long_inc(&erofs_global_shrink_cnt);
-	--grp->lockref.count;
-	spin_unlock(&grp->lockref.lock);
-}
-
-static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
-					   struct erofs_workgroup *grp)
-{
-	int free = false;
-
-	spin_lock(&grp->lockref.lock);
-	if (grp->lockref.count)
-		goto out;
-
-	/*
-	 * Note that all cached pages should be detached before deleted from
-	 * the XArray. Otherwise some cached pages could be still attached to
-	 * the orphan old workgroup when the new one is available in the tree.
-	 */
-	if (erofs_try_to_free_all_cached_folios(sbi, grp))
-		goto out;
-
-	/*
-	 * It's impossible to fail after the workgroup is freezed,
-	 * however in order to avoid some race conditions, add a
-	 * DBG_BUGON to observe this in advance.
-	 */
-	DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
-
-	lockref_mark_dead(&grp->lockref);
-	free = true;
-out:
-	spin_unlock(&grp->lockref.lock);
-	if (free)
-		__erofs_workgroup_free(grp);
-	return free;
-}
-
-static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
-					      unsigned long nr_shrink)
-{
-	struct erofs_workgroup *grp;
-	unsigned int freed = 0;
-	unsigned long index;
-
-	xa_lock(&sbi->managed_pslots);
-	xa_for_each(&sbi->managed_pslots, index, grp) {
-		/* try to shrink each valid workgroup */
-		if (!erofs_try_to_release_workgroup(sbi, grp))
-			continue;
-		xa_unlock(&sbi->managed_pslots);
-
-		++freed;
-		if (!--nr_shrink)
-			return freed;
-		xa_lock(&sbi->managed_pslots);
-	}
-	xa_unlock(&sbi->managed_pslots);
-	return freed;
-}
-
 void erofs_shrinker_register(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -369,8 +230,8 @@ void erofs_shrinker_unregister(struct super_block *sb)
 	struct erofs_sb_info *const sbi = EROFS_SB(sb);
 
 	mutex_lock(&sbi->umount_mutex);
-	/* clean up all remaining workgroups in memory */
-	erofs_shrink_workstation(sbi, ~0UL);
+	/* clean up all remaining pclusters in memory */
+	z_erofs_shrink_scan(sbi, ~0UL);
 
 	spin_lock(&erofs_sb_list_lock);
 	list_del(&sbi->list);
@@ -418,9 +279,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
 		spin_unlock(&erofs_sb_list_lock);
 		sbi->shrinker_run_no = run_no;
-
-		freed += erofs_shrink_workstation(sbi, nr - freed);
-
+		freed += z_erofs_shrink_scan(sbi, nr - freed);
 		spin_lock(&erofs_sb_list_lock);
 		/* Get the next list element before we move this one */
 		p = p->next;
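For completeness, a hedged sketch (the path is a placeholder; it needs a
statx(2)-capable libc and uapi headers from kernel 6.1+ for STATX_DIOALIGN)
of how an application would observe the revised DIO alignment reporting
from erofs_getattr() above, which is now filled only for uncompressed
files on bdev-backed mounts:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	/* hypothetical uncompressed file on a bdev-backed EROFS mount */
	if (statx(AT_FDCWD, "/mnt/erofs/file", 0, STATX_DIOALIGN, &stx))
		return 1;
	if (stx.stx_mask & STATX_DIOALIGN)
		/* mem align comes from bdev_dma_alignment(), offset align
		 * from the logical block size, per the inode.c hunk above */
		printf("dio_mem_align=%u dio_offset_align=%u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	return 0;
}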