summaryrefslogtreecommitdiff
path: root/fs/erofs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/erofs')
-rw-r--r--fs/erofs/Kconfig37
-rw-r--r--fs/erofs/Makefile6
-rw-r--r--fs/erofs/compress.h85
-rw-r--r--fs/erofs/data.c221
-rw-r--r--fs/erofs/decompressor.c160
-rw-r--r--fs/erofs/decompressor_deflate.c207
-rw-r--r--fs/erofs/decompressor_lzma.c169
-rw-r--r--fs/erofs/decompressor_zstd.c225
-rw-r--r--fs/erofs/dir.c37
-rw-r--r--fs/erofs/erofs_fs.h23
-rw-r--r--fs/erofs/fileio.c195
-rw-r--r--fs/erofs/fscache.c315
-rw-r--r--fs/erofs/inode.c181
-rw-r--r--fs/erofs/internal.h145
-rw-r--r--fs/erofs/namei.c6
-rw-r--r--fs/erofs/pcpubuf.c148
-rw-r--r--fs/erofs/super.c451
-rw-r--r--fs/erofs/sysfs.c47
-rw-r--r--fs/erofs/utils.c287
-rw-r--r--fs/erofs/xattr.c41
-rw-r--r--fs/erofs/zdata.c1057
-rw-r--r--fs/erofs/zmap.c254
-rw-r--r--fs/erofs/zutil.c317
23 files changed, 2441 insertions, 2173 deletions
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index fffd3919343e..6ea60661fa55 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -74,6 +74,23 @@ config EROFS_FS_SECURITY
If you are not using a security module, say N.
+config EROFS_FS_BACKED_BY_FILE
+ bool "File-backed EROFS filesystem support"
+ depends on EROFS_FS
+ default y
+ help
+ This allows EROFS to use filesystem image files directly, without
+ the intercession of loopback block devices or likewise. It is
+ particularly useful for container images with numerous blobs and
+ other sandboxes, where loop devices behave intricately. It can also
+ be used to simplify error-prone lifetime management of unnecessary
+ virtual block devices.
+
+ Note that this feature, along with ongoing fanotify pre-content
+ hooks, will eventually replace "EROFS over fscache."
+
+ If you don't want to enable this feature, say N.
+
config EROFS_FS_ZIP
bool "EROFS Data Compression Support"
depends on EROFS_FS
@@ -112,8 +129,23 @@ config EROFS_FS_ZIP_DEFLATE
If unsure, say N.
+config EROFS_FS_ZIP_ZSTD
+ bool "EROFS Zstandard compressed data support"
+ depends on EROFS_FS_ZIP
+ select ZSTD_DECOMPRESS
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing Zstandard compressed data. It gives better compression
+ ratios than the default LZ4 format, while it costs more CPU
+ overhead.
+
+ Zstandard support is an experimental feature for now and so most
+ file systems will be readable without selecting this option.
+
+ If unsure, say N.
+
config EROFS_FS_ONDEMAND
- bool "EROFS fscache-based on-demand read support"
+ bool "EROFS fscache-based on-demand read support (deprecated)"
depends on EROFS_FS
select NETFS_SUPPORT
select FSCACHE
@@ -123,6 +155,9 @@ config EROFS_FS_ONDEMAND
This permits EROFS to use fscache-backed data blobs with on-demand
read support.
+ It is now deprecated and scheduled to be removed from the kernel
+ after fanotify pre-content hooks are landed.
+
If unsure, say N.
config EROFS_FS_PCPU_KTHREAD
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 994d0b9deddf..4331d53c7109 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,9 +1,11 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o
+erofs-objs := super.o inode.o data.o namei.o dir.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
+erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
+erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 7cc5841577b2..65ff39401020 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -24,45 +24,23 @@ struct z_erofs_decompressor {
void *data, int size);
int (*decompress)(struct z_erofs_decompress_req *rq,
struct page **pagepool);
+ int (*init)(void);
+ void (*exit)(void);
char *name;
};
-/* some special page->private (unsigned long, see below) */
#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2)
-#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2)
+#define Z_EROFS_PREALLOCATED_FOLIO ((void *)(-2UL << 2))
/*
- * For all pages in a pcluster, page->private should be one of
- * Type Last 2bits page->private
- * short-lived page 00 Z_EROFS_SHORTLIVED_PAGE
- * preallocated page (tryalloc) 00 Z_EROFS_PREALLOCATED_PAGE
- * cached/managed page 00 pointer to z_erofs_pcluster
- * online page (file-backed, 01/10/11 sub-index << 2 | count
- * some pages can be used for inplace I/O)
- *
- * page->mapping should be one of
- * Type page->mapping
- * short-lived page NULL
- * preallocated page NULL
- * cached/managed page non-NULL or NULL (invalidated/truncated page)
- * online page non-NULL
- *
- * For all managed pages, PG_private should be set with 1 extra refcount,
- * which is used for page reclaim / migration.
- */
-
-/*
- * short-lived pages are pages directly from buddy system with specific
- * page->private (no need to set PagePrivate since these are non-LRU /
- * non-movable pages and bypass reclaim / migration code).
+ * Currently, short-lived pages are pages directly from buddy system
+ * with specific page->private (Z_EROFS_SHORTLIVED_PAGE).
+ * In the future world of Memdescs, it should be type 0 (Misc) memory
+ * which type can be checked with a new helper.
*/
static inline bool z_erofs_is_shortlived_page(struct page *page)
{
- if (page->private != Z_EROFS_SHORTLIVED_PAGE)
- return false;
-
- DBG_BUGON(page->mapping);
- return true;
+ return page->private == Z_EROFS_SHORTLIVED_PAGE;
}
static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
@@ -70,35 +48,32 @@ static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
{
if (!z_erofs_is_shortlived_page(page))
return false;
-
- /* short-lived pages should not be used by others at the same time */
- if (page_ref_count(page) > 1) {
- put_page(page);
- } else {
- /* follow the pcluster rule above. */
- erofs_pagepool_add(pagepool, page);
- }
+ erofs_pagepool_add(pagepool, page);
return true;
}
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
- struct page *page)
-{
- return page->mapping == MNGD_MAPPING(sbi);
-}
+extern const struct z_erofs_decompressor z_erofs_lzma_decomp;
+extern const struct z_erofs_decompressor z_erofs_deflate_decomp;
+extern const struct z_erofs_decompressor z_erofs_zstd_decomp;
+extern const struct z_erofs_decompressor *z_erofs_decomp[];
+
+struct z_erofs_stream_dctx {
+ struct z_erofs_decompress_req *rq;
+ unsigned int inpages, outpages; /* # of {en,de}coded pages */
+ int no, ni; /* the current {en,de}coded page # */
+ unsigned int avail_out; /* remaining bytes in the decoded buffer */
+ unsigned int inbuf_pos, inbuf_sz;
+ /* current status of the encoded buffer */
+ u8 *kin, *kout; /* buffer mapped pointers */
+ void *bounce; /* bounce buffer for inplace I/Os */
+ bool bounced; /* is the bounce buffer used now? */
+};
+
+int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
+ void **src, struct page **pgpl);
int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
unsigned int padbufsize);
-extern const struct z_erofs_decompressor erofs_decompressors[];
-
-/* prototypes for specific algorithms */
-int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb, void *data, int size);
-int z_erofs_load_deflate_config(struct super_block *sb,
- struct erofs_super_block *dsb, void *data, int size);
-int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool);
-int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool);
+int __init z_erofs_init_decompressor(void);
+void z_erofs_exit_decompressor(void);
#endif
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 3d9721b3faa8..0cd6b5c4df98 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -10,10 +10,10 @@
void erofs_unmap_metabuf(struct erofs_buf *buf)
{
- if (buf->kmap_type == EROFS_KMAP)
- kunmap_local(buf->base);
+ if (!buf->base)
+ return;
+ kunmap_local(buf->base);
buf->base = NULL;
- buf->kmap_type = EROFS_NO_KMAP;
}
void erofs_put_metabuf(struct erofs_buf *buf)
@@ -21,45 +21,30 @@ void erofs_put_metabuf(struct erofs_buf *buf)
if (!buf->page)
return;
erofs_unmap_metabuf(buf);
- put_page(buf->page);
+ folio_put(page_folio(buf->page));
buf->page = NULL;
}
-/*
- * Derive the block size from inode->i_blkbits to make compatible with
- * anonymous inode in fscache mode.
- */
-void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
enum erofs_kmap_type type)
{
- struct inode *inode = buf->inode;
- erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits;
pgoff_t index = offset >> PAGE_SHIFT;
- struct page *page = buf->page;
- struct folio *folio;
- unsigned int nofs_flag;
+ struct folio *folio = NULL;
- if (!page || page->index != index) {
+ if (buf->page) {
+ folio = page_folio(buf->page);
+ if (folio_file_page(folio, index) != buf->page)
+ erofs_unmap_metabuf(buf);
+ }
+ if (!folio || !folio_contains(folio, index)) {
erofs_put_metabuf(buf);
-
- nofs_flag = memalloc_nofs_save();
- folio = read_cache_folio(inode->i_mapping, index, NULL, NULL);
- memalloc_nofs_restore(nofs_flag);
+ folio = read_mapping_folio(buf->mapping, index, buf->file);
if (IS_ERR(folio))
return folio;
-
- /* should already be PageUptodate, no need to lock page */
- page = folio_file_page(folio, index);
- buf->page = page;
- }
- if (buf->kmap_type == EROFS_NO_KMAP) {
- if (type == EROFS_KMAP)
- buf->base = kmap_local_page(page);
- buf->kmap_type = type;
- } else if (buf->kmap_type != type) {
- DBG_BUGON(1);
- return ERR_PTR(-EFAULT);
}
+ buf->page = folio_file_page(folio, index);
+ if (!buf->base && type == EROFS_KMAP)
+ buf->base = kmap_local_page(buf->page);
if (type == EROFS_NO_KMAP)
return NULL;
return buf->base + (offset & ~PAGE_MASK);
@@ -67,54 +52,50 @@ void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
{
- if (erofs_is_fscache_mode(sb))
- buf->inode = EROFS_SB(sb)->s_fscache->inode;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ buf->file = NULL;
+ if (erofs_is_fileio_mode(sbi)) {
+ buf->file = sbi->dif0.file; /* some fs like FUSE needs it */
+ buf->mapping = buf->file->f_mapping;
+ } else if (erofs_is_fscache_mode(sb))
+ buf->mapping = sbi->dif0.fscache->inode->i_mapping;
else
- buf->inode = sb->s_bdev->bd_inode;
+ buf->mapping = sb->s_bdev->bd_mapping;
}
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_blk_t blkaddr, enum erofs_kmap_type type)
+ erofs_off_t offset, enum erofs_kmap_type type)
{
erofs_init_metabuf(buf, sb);
- return erofs_bread(buf, blkaddr, type);
+ return erofs_bread(buf, offset, type);
}
static int erofs_map_blocks_flatmode(struct inode *inode,
struct erofs_map_blocks *map)
{
- erofs_blk_t nblocks, lastblk;
- u64 offset = map->m_la;
struct erofs_inode *vi = EROFS_I(inode);
struct super_block *sb = inode->i_sb;
bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
+ erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking;
- nblocks = erofs_iblks(inode);
- lastblk = nblocks - tailendpacking;
-
- /* there is no hole in flatmode */
- map->m_flags = EROFS_MAP_MAPPED;
- if (offset < erofs_pos(sb, lastblk)) {
+ map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */
+ if (map->m_la < erofs_pos(sb, lastblk)) {
map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la;
- map->m_plen = erofs_pos(sb, lastblk) - offset;
- } else if (tailendpacking) {
+ map->m_plen = erofs_pos(sb, lastblk) - map->m_la;
+ } else {
+ DBG_BUGON(!tailendpacking);
map->m_pa = erofs_iloc(inode) + vi->inode_isize +
- vi->xattr_isize + erofs_blkoff(sb, offset);
- map->m_plen = inode->i_size - offset;
+ vi->xattr_isize + erofs_blkoff(sb, map->m_la);
+ map->m_plen = inode->i_size - map->m_la;
/* inline data should be located in the same meta block */
if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
- erofs_err(sb, "inline data cross block boundary @ nid %llu",
- vi->nid);
+ erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
DBG_BUGON(1);
return -EFSCORRUPTED;
}
map->m_flags |= EROFS_MAP_META;
- } else {
- erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx",
- vi->nid, inode->i_size, map->m_la);
- DBG_BUGON(1);
- return -EIO;
}
return 0;
}
@@ -136,7 +117,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
if (map->m_la >= inode->i_size) {
/* leave out-of-bound access unmapped */
map->m_flags = 0;
- map->m_plen = 0;
+ map->m_plen = map->m_llen;
goto out;
}
@@ -154,7 +135,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
- kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP);
+ kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
if (IS_ERR(kaddr)) {
err = PTR_ERR(kaddr);
goto out;
@@ -165,7 +146,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
/* handle block map */
if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
- __le32 *blkaddr = kaddr + erofs_blkoff(sb, pos);
+ __le32 *blkaddr = kaddr;
if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
map->m_flags = 0;
@@ -176,7 +157,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
goto out_unlock;
}
/* parse chunk indexes */
- idx = kaddr + erofs_blkoff(sb, pos);
+ idx = kaddr;
switch (le32_to_cpu(idx->blkaddr)) {
case EROFS_NULL_ADDR:
map->m_flags = 0;
@@ -197,17 +178,25 @@ out:
return err;
}
+static void erofs_fill_from_devinfo(struct erofs_map_dev *map,
+ struct super_block *sb, struct erofs_device_info *dif)
+{
+ map->m_sb = sb;
+ map->m_dif = dif;
+ map->m_bdev = NULL;
+ if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode))
+ map->m_bdev = file_bdev(dif->file);
+}
+
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
{
struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
struct erofs_device_info *dif;
+ erofs_off_t startoff, length;
int id;
- map->m_bdev = sb->s_bdev;
- map->m_daxdev = EROFS_SB(sb)->dax_dev;
- map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
- map->m_fscache = EROFS_SB(sb)->s_fscache;
-
+ erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0);
+ map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */
if (map->m_deviceid) {
down_read(&devs->rwsem);
dif = idr_find(&devs->tree, map->m_deviceid - 1);
@@ -220,29 +209,20 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
up_read(&devs->rwsem);
return 0;
}
- map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL;
- map->m_daxdev = dif->dax_dev;
- map->m_dax_part_off = dif->dax_part_off;
- map->m_fscache = dif->fscache;
+ erofs_fill_from_devinfo(map, sb, dif);
up_read(&devs->rwsem);
} else if (devs->extra_devices && !devs->flatdev) {
down_read(&devs->rwsem);
idr_for_each_entry(&devs->tree, dif, id) {
- erofs_off_t startoff, length;
-
if (!dif->mapped_blkaddr)
continue;
+
startoff = erofs_pos(sb, dif->mapped_blkaddr);
length = erofs_pos(sb, dif->blocks);
-
if (map->m_pa >= startoff &&
map->m_pa < startoff + length) {
map->m_pa -= startoff;
- map->m_bdev = dif->bdev_handle ?
- dif->bdev_handle->bdev : NULL;
- map->m_daxdev = dif->dax_dev;
- map->m_dax_part_off = dif->dax_part_off;
- map->m_fscache = dif->fscache;
+ erofs_fill_from_devinfo(map, sb, dif);
break;
}
}
@@ -251,6 +231,42 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
return 0;
}
+/*
+ * bit 30: I/O error occurred on this folio
+ * bit 0 - 29: remaining parts to complete this folio
+ */
+#define EROFS_ONLINEFOLIO_EIO (1 << 30)
+
+void erofs_onlinefolio_init(struct folio *folio)
+{
+ union {
+ atomic_t o;
+ void *v;
+ } u = { .o = ATOMIC_INIT(1) };
+
+ folio->private = u.v; /* valid only if file-backed folio is locked */
+}
+
+void erofs_onlinefolio_split(struct folio *folio)
+{
+ atomic_inc((atomic_t *)&folio->private);
+}
+
+void erofs_onlinefolio_end(struct folio *folio, int err)
+{
+ int orig, v;
+
+ do {
+ orig = atomic_read((atomic_t *)&folio->private);
+ v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0);
+ } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
+
+ if (v & ~EROFS_ONLINEFOLIO_EIO)
+ return;
+ folio->private = 0;
+ folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO));
+}
+
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
@@ -276,7 +292,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->offset = map.m_la;
if (flags & IOMAP_DAX)
- iomap->dax_dev = mdev.m_daxdev;
+ iomap->dax_dev = mdev.m_dif->dax_dev;
else
iomap->bdev = mdev.m_bdev;
iomap->length = map.m_llen;
@@ -296,17 +312,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
iomap->type = IOMAP_INLINE;
- ptr = erofs_read_metabuf(&buf, sb,
- erofs_blknr(sb, mdev.m_pa), EROFS_KMAP);
+ ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
- iomap->inline_data = ptr + erofs_blkoff(sb, mdev.m_pa);
+ iomap->inline_data = ptr;
iomap->private = buf.base;
} else {
iomap->type = IOMAP_MAPPED;
iomap->addr = mdev.m_pa;
if (flags & IOMAP_DAX)
- iomap->addr += mdev.m_dax_part_off;
+ iomap->addr += mdev.m_dif->dax_part_off;
}
return 0;
}
@@ -320,7 +335,6 @@ static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
struct erofs_buf buf = {
.page = kmap_to_page(ptr),
.base = ptr,
- .kmap_type = EROFS_KMAP,
};
DBG_BUGON(iomap->type != IOMAP_INLINE);
@@ -381,27 +395,14 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (IS_DAX(inode))
return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
#endif
- if (iocb->ki_flags & IOCB_DIRECT) {
- struct block_device *bdev = inode->i_sb->s_bdev;
- unsigned int blksize_mask;
-
- if (bdev)
- blksize_mask = bdev_logical_block_size(bdev) - 1;
- else
- blksize_mask = i_blocksize(inode) - 1;
-
- if ((iocb->ki_pos | iov_iter_count(to) |
- iov_iter_alignment(to)) & blksize_mask)
- return -EINVAL;
-
+ if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
NULL, 0, NULL, 0);
- }
return filemap_read(iocb, to, 0);
}
/* for uncompressed (aligned) files and raw access for other files */
-const struct address_space_operations erofs_raw_access_aops = {
+const struct address_space_operations erofs_aops = {
.read_folio = erofs_read_folio,
.readahead = erofs_readahead,
.bmap = erofs_bmap,
@@ -443,8 +444,32 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
#define erofs_file_mmap generic_file_readonly_mmap
#endif
+static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ const struct iomap_ops *ops = &erofs_iomap_ops;
+
+ if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+#ifdef CONFIG_EROFS_FS_ZIP
+ ops = &z_erofs_iomap_report_ops;
+#else
+ return generic_file_llseek(file, offset, whence);
+#endif
+
+ if (whence == SEEK_HOLE)
+ offset = iomap_seek_hole(inode, offset, ops);
+ else if (whence == SEEK_DATA)
+ offset = iomap_seek_data(inode, offset, ops);
+ else
+ return generic_file_llseek(file, offset, whence);
+
+ if (offset < 0)
+ return offset;
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+}
+
const struct file_operations erofs_file_fops = {
- .llseek = generic_file_llseek,
+ .llseek = erofs_file_llseek,
.read_iter = erofs_file_read_iter,
.mmap = erofs_file_mmap,
.get_unmapped_area = thp_get_unmapped_area,
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 2ec9b2bb628d..2b123b070a42 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -2,18 +2,12 @@
/*
* Copyright (C) 2019 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2024 Alibaba Cloud
*/
#include "compress.h"
#include <linux/lz4.h>
-#ifndef LZ4_DISTANCE_MAX /* history window size */
-#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */
-#endif
-
#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
-#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN
-#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
-#endif
struct z_erofs_lz4_decompress_ctx {
struct z_erofs_decompress_req *rq;
@@ -54,7 +48,7 @@ static int z_erofs_load_lz4_config(struct super_block *sb,
sbi->lz4.max_distance_pages = distance ?
DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
LZ4_MAX_DISTANCE_PAGES;
- return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
+ return z_erofs_gbuf_growsize(sbi->lz4.max_pclusterblks);
}
/*
@@ -109,9 +103,8 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
if (top) {
victim = availables[--top];
- get_page(victim);
} else {
- victim = erofs_allocpage(pagepool, rq->gfp);
+ victim = __erofs_allocpage(pagepool, rq->gfp, true);
if (!victim)
return -ENOMEM;
set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
@@ -159,7 +152,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
docopy:
/* Or copy compressed data which can be overlapped to per-CPU buffer */
in = rq->in;
- src = erofs_get_pcpubuf(ctx->inpages);
+ src = z_erofs_get_gbuf(ctx->inpages);
if (!src) {
DBG_BUGON(1);
kunmap_local(inpage);
@@ -260,7 +253,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
} else if (maptype == 1) {
vm_unmap_ram(src, ctx->inpages);
} else if (maptype == 2) {
- erofs_put_pcpubuf(src);
+ z_erofs_put_gbuf(src);
} else if (maptype != 3) {
DBG_BUGON(1);
return -EFAULT;
@@ -371,33 +364,113 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
return 0;
}
-const struct z_erofs_decompressor erofs_decompressors[] = {
- [Z_EROFS_COMPRESSION_SHIFTED] = {
+int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
+ void **src, struct page **pgpl)
+{
+ struct z_erofs_decompress_req *rq = dctx->rq;
+ struct super_block *sb = rq->sb;
+ struct page **pgo, *tmppage;
+ unsigned int j;
+
+ if (!dctx->avail_out) {
+ if (++dctx->no >= dctx->outpages || !rq->outputsize) {
+ erofs_err(sb, "insufficient space for decompressed data");
+ return -EFSCORRUPTED;
+ }
+
+ if (dctx->kout)
+ kunmap_local(dctx->kout);
+ dctx->avail_out = min(rq->outputsize, PAGE_SIZE - rq->pageofs_out);
+ rq->outputsize -= dctx->avail_out;
+ pgo = &rq->out[dctx->no];
+ if (!*pgo && rq->fillgaps) { /* deduped */
+ *pgo = erofs_allocpage(pgpl, rq->gfp);
+ if (!*pgo) {
+ dctx->kout = NULL;
+ return -ENOMEM;
+ }
+ set_page_private(*pgo, Z_EROFS_SHORTLIVED_PAGE);
+ }
+ if (*pgo) {
+ dctx->kout = kmap_local_page(*pgo);
+ *dst = dctx->kout + rq->pageofs_out;
+ } else {
+ *dst = dctx->kout = NULL;
+ }
+ rq->pageofs_out = 0;
+ }
+
+ if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) {
+ if (++dctx->ni >= dctx->inpages) {
+ erofs_err(sb, "invalid compressed data");
+ return -EFSCORRUPTED;
+ }
+ if (dctx->kout) /* unlike kmap(), take care of the orders */
+ kunmap_local(dctx->kout);
+ kunmap_local(dctx->kin);
+
+ dctx->inbuf_sz = min_t(u32, rq->inputsize, PAGE_SIZE);
+ rq->inputsize -= dctx->inbuf_sz;
+ dctx->kin = kmap_local_page(rq->in[dctx->ni]);
+ *src = dctx->kin;
+ dctx->bounced = false;
+ if (dctx->kout) {
+ j = (u8 *)*dst - dctx->kout;
+ dctx->kout = kmap_local_page(rq->out[dctx->no]);
+ *dst = dctx->kout + j;
+ }
+ dctx->inbuf_pos = 0;
+ }
+
+ /*
+ * Handle overlapping: Use the given bounce buffer if the input data is
+ * under processing; Or utilize short-lived pages from the on-stack page
+ * pool, where pages are shared among the same request. Note that only
+ * a few inplace I/O pages need to be doubled.
+ */
+ if (!dctx->bounced && rq->out[dctx->no] == rq->in[dctx->ni]) {
+ memcpy(dctx->bounce, *src, dctx->inbuf_sz);
+ *src = dctx->bounce;
+ dctx->bounced = true;
+ }
+
+ for (j = dctx->ni + 1; j < dctx->inpages; ++j) {
+ if (rq->out[dctx->no] != rq->in[j])
+ continue;
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage)
+ return -ENOMEM;
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
+ }
+ return 0;
+}
+
+const struct z_erofs_decompressor *z_erofs_decomp[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = &(const struct z_erofs_decompressor) {
.decompress = z_erofs_transform_plain,
.name = "shifted"
},
- [Z_EROFS_COMPRESSION_INTERLACED] = {
+ [Z_EROFS_COMPRESSION_INTERLACED] = &(const struct z_erofs_decompressor) {
.decompress = z_erofs_transform_plain,
.name = "interlaced"
},
- [Z_EROFS_COMPRESSION_LZ4] = {
+ [Z_EROFS_COMPRESSION_LZ4] = &(const struct z_erofs_decompressor) {
.config = z_erofs_load_lz4_config,
.decompress = z_erofs_lz4_decompress,
+ .init = z_erofs_gbuf_init,
+ .exit = z_erofs_gbuf_exit,
.name = "lz4"
},
#ifdef CONFIG_EROFS_FS_ZIP_LZMA
- [Z_EROFS_COMPRESSION_LZMA] = {
- .config = z_erofs_load_lzma_config,
- .decompress = z_erofs_lzma_decompress,
- .name = "lzma"
- },
+ [Z_EROFS_COMPRESSION_LZMA] = &z_erofs_lzma_decomp,
#endif
#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
- [Z_EROFS_COMPRESSION_DEFLATE] = {
- .config = z_erofs_load_deflate_config,
- .decompress = z_erofs_deflate_decompress,
- .name = "deflate"
- },
+ [Z_EROFS_COMPRESSION_DEFLATE] = &z_erofs_deflate_decomp,
+#endif
+#ifdef CONFIG_EROFS_FS_ZIP_ZSTD
+ [Z_EROFS_COMPRESSION_ZSTD] = &z_erofs_zstd_decomp,
#endif
};
@@ -425,6 +498,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
offset = EROFS_SUPER_OFFSET + sbi->sb_size;
alg = 0;
for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
+ const struct z_erofs_decompressor *dec = z_erofs_decomp[alg];
void *data;
if (!(algs & 1))
@@ -436,16 +510,13 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
break;
}
- if (alg >= ARRAY_SIZE(erofs_decompressors) ||
- !erofs_decompressors[alg].config) {
+ if (alg < Z_EROFS_COMPRESSION_MAX && dec && dec->config) {
+ ret = dec->config(sb, dsb, data, size);
+ } else {
erofs_err(sb, "algorithm %d isn't enabled on this kernel",
alg);
ret = -EOPNOTSUPP;
- } else {
- ret = erofs_decompressors[alg].config(sb,
- dsb, data, size);
}
-
kfree(data);
if (ret)
break;
@@ -453,3 +524,28 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
erofs_put_metabuf(&buf);
return ret;
}
+
+int __init z_erofs_init_decompressor(void)
+{
+ int i, err;
+
+ for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) {
+ err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0;
+ if (err) {
+ while (i--)
+ if (z_erofs_decomp[i])
+ z_erofs_decomp[i]->exit();
+ return err;
+ }
+ }
+ return 0;
+}
+
+void z_erofs_exit_decompressor(void)
+{
+ int i;
+
+ for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i)
+ if (z_erofs_decomp[i])
+ z_erofs_decomp[i]->exit();
+}
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index b98872058abe..5070d2fcc737 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -15,7 +15,7 @@ static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq);
module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444);
-void z_erofs_deflate_exit(void)
+static void z_erofs_deflate_exit(void)
{
/* there should be no running fs instance */
while (z_erofs_deflate_avail_strms) {
@@ -41,44 +41,20 @@ void z_erofs_deflate_exit(void)
}
}
-int __init z_erofs_deflate_init(void)
+static int __init z_erofs_deflate_init(void)
{
/* by default, use # of possible CPUs instead */
if (!z_erofs_deflate_nstrms)
z_erofs_deflate_nstrms = num_possible_cpus();
-
- for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms;
- ++z_erofs_deflate_avail_strms) {
- struct z_erofs_deflate *strm;
-
- strm = kzalloc(sizeof(*strm), GFP_KERNEL);
- if (!strm)
- goto out_failed;
-
- /* XXX: in-kernel zlib cannot shrink windowbits currently */
- strm->z.workspace = vmalloc(zlib_inflate_workspacesize());
- if (!strm->z.workspace) {
- kfree(strm);
- goto out_failed;
- }
-
- spin_lock(&z_erofs_deflate_lock);
- strm->next = z_erofs_deflate_head;
- z_erofs_deflate_head = strm;
- spin_unlock(&z_erofs_deflate_lock);
- }
return 0;
-
-out_failed:
- erofs_err(NULL, "failed to allocate zlib workspace");
- z_erofs_deflate_exit();
- return -ENOMEM;
}
-int z_erofs_load_deflate_config(struct super_block *sb,
+static int z_erofs_load_deflate_config(struct super_block *sb,
struct erofs_super_block *dsb, void *data, int size)
{
struct z_erofs_deflate_cfgs *dfl = data;
+ static DEFINE_MUTEX(deflate_resize_mutex);
+ static bool inited;
if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) {
erofs_err(sb, "invalid deflate cfgs, size=%u", size);
@@ -89,32 +65,58 @@ int z_erofs_load_deflate_config(struct super_block *sb,
erofs_err(sb, "unsupported windowbits %u", dfl->windowbits);
return -EOPNOTSUPP;
}
+ mutex_lock(&deflate_resize_mutex);
+ if (!inited) {
+ for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms;
+ ++z_erofs_deflate_avail_strms) {
+ struct z_erofs_deflate *strm;
+
+ strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+ if (!strm)
+ goto failed;
+ /* XXX: in-kernel zlib cannot customize windowbits */
+ strm->z.workspace = vmalloc(zlib_inflate_workspacesize());
+ if (!strm->z.workspace) {
+ kfree(strm);
+ goto failed;
+ }
+ spin_lock(&z_erofs_deflate_lock);
+ strm->next = z_erofs_deflate_head;
+ z_erofs_deflate_head = strm;
+ spin_unlock(&z_erofs_deflate_lock);
+ }
+ inited = true;
+ }
+ mutex_unlock(&deflate_resize_mutex);
erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!");
return 0;
+failed:
+ mutex_unlock(&deflate_resize_mutex);
+ z_erofs_deflate_exit();
+ return -ENOMEM;
}
-int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pgpl)
+static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int nrpages_in =
- PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
struct super_block *sb = rq->sb;
- unsigned int insz, outsz, pofs;
+ struct z_erofs_stream_dctx dctx = {
+ .rq = rq,
+ .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
+ .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
+ >> PAGE_SHIFT,
+ .no = -1, .ni = 0,
+ };
struct z_erofs_deflate *strm;
- u8 *kin, *kout = NULL;
- bool bounced = false;
- int no = -1, ni = 0, j = 0, zerr, err;
+ int zerr, err;
/* 1. get the exact DEFLATE compressed size */
- kin = kmap_local_page(*rq->in);
- err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
- min_t(unsigned int, rq->inputsize,
- sb->s_blocksize - rq->pageofs_in));
+ dctx.kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
if (err) {
- kunmap_local(kin);
+ kunmap_local(dctx.kin);
return err;
}
@@ -131,105 +133,35 @@ again:
spin_unlock(&z_erofs_deflate_lock);
/* 3. multi-call decompress */
- insz = rq->inputsize;
- outsz = rq->outputsize;
zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS);
if (zerr != Z_OK) {
err = -EIO;
goto failed_zinit;
}
- pofs = rq->pageofs_out;
- strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in);
- insz -= strm->z.avail_in;
- strm->z.next_in = kin + rq->pageofs_in;
+ rq->fillgaps = true; /* DEFLATE doesn't support NULL output buffer */
+ strm->z.avail_in = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= strm->z.avail_in;
+ strm->z.next_in = dctx.kin + rq->pageofs_in;
strm->z.avail_out = 0;
+ dctx.bounce = strm->bounce;
while (1) {
- if (!strm->z.avail_out) {
- if (++no >= nrpages_out || !outsz) {
- erofs_err(sb, "insufficient space for decompressed data");
- err = -EFSCORRUPTED;
- break;
- }
-
- if (kout)
- kunmap_local(kout);
- strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
- outsz -= strm->z.avail_out;
- if (!rq->out[no]) {
- rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
- if (!rq->out[no]) {
- kout = NULL;
- err = -ENOMEM;
- break;
- }
- set_page_private(rq->out[no],
- Z_EROFS_SHORTLIVED_PAGE);
- }
- kout = kmap_local_page(rq->out[no]);
- strm->z.next_out = kout + pofs;
- pofs = 0;
- }
-
- if (!strm->z.avail_in && insz) {
- if (++ni >= nrpages_in) {
- erofs_err(sb, "invalid compressed data");
- err = -EFSCORRUPTED;
- break;
- }
-
- if (kout) { /* unlike kmap(), take care of the orders */
- j = strm->z.next_out - kout;
- kunmap_local(kout);
- }
- kunmap_local(kin);
- strm->z.avail_in = min_t(u32, insz, PAGE_SIZE);
- insz -= strm->z.avail_in;
- kin = kmap_local_page(rq->in[ni]);
- strm->z.next_in = kin;
- bounced = false;
- if (kout) {
- kout = kmap_local_page(rq->out[no]);
- strm->z.next_out = kout + j;
- }
- }
-
- /*
- * Handle overlapping: Use bounced buffer if the compressed
- * data is under processing; Or use short-lived pages from the
- * on-stack pagepool where pages share among the same request
- * and not _all_ inplace I/O pages are needed to be doubled.
- */
- if (!bounced && rq->out[no] == rq->in[ni]) {
- memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in);
- strm->z.next_in = strm->bounce;
- bounced = true;
- }
-
- for (j = ni + 1; j < nrpages_in; ++j) {
- struct page *tmppage;
-
- if (rq->out[no] != rq->in[j])
- continue;
-
- DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
- rq->in[j]));
- tmppage = erofs_allocpage(pgpl, rq->gfp);
- if (!tmppage) {
- err = -ENOMEM;
- goto failed;
- }
- set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
- copy_highpage(tmppage, rq->in[j]);
- rq->in[j] = tmppage;
- }
+ dctx.avail_out = strm->z.avail_out;
+ dctx.inbuf_sz = strm->z.avail_in;
+ err = z_erofs_stream_switch_bufs(&dctx,
+ (void **)&strm->z.next_out,
+ (void **)&strm->z.next_in, pgpl);
+ if (err)
+ break;
+ strm->z.avail_out = dctx.avail_out;
+ strm->z.avail_in = dctx.inbuf_sz;
zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH);
- if (zerr != Z_OK || !(outsz + strm->z.avail_out)) {
+ if (zerr != Z_OK || !(rq->outputsize + strm->z.avail_out)) {
if (zerr == Z_OK && rq->partial_decoding)
break;
- if (zerr == Z_STREAM_END && !outsz)
+ if (zerr == Z_STREAM_END && !rq->outputsize)
break;
erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
zerr, rq->inputsize, rq->outputsize);
@@ -237,13 +169,12 @@ again:
break;
}
}
-failed:
if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
err = -EIO;
- if (kout)
- kunmap_local(kout);
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
failed_zinit:
- kunmap_local(kin);
+ kunmap_local(dctx.kin);
/* 4. push back DEFLATE stream context to the global list */
spin_lock(&z_erofs_deflate_lock);
strm->next = z_erofs_deflate_head;
@@ -252,3 +183,11 @@ failed_zinit:
wake_up(&z_erofs_deflate_wq);
return err;
}
+
+const struct z_erofs_decompressor z_erofs_deflate_decomp = {
+ .config = z_erofs_load_deflate_config,
+ .decompress = z_erofs_deflate_decompress,
+ .init = z_erofs_deflate_init,
+ .exit = z_erofs_deflate_exit,
+ .name = "deflate",
+};
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 6ca357d83cfa..40666815046f 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -5,7 +5,6 @@
struct z_erofs_lzma {
struct z_erofs_lzma *next;
struct xz_dec_microlzma *state;
- struct xz_buf buf;
u8 bounce[PAGE_SIZE];
};
@@ -18,7 +17,7 @@ static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
-void z_erofs_lzma_exit(void)
+static void z_erofs_lzma_exit(void)
{
/* there should be no running fs instance */
while (z_erofs_lzma_avail_strms) {
@@ -46,7 +45,7 @@ void z_erofs_lzma_exit(void)
}
}
-int __init z_erofs_lzma_init(void)
+static int __init z_erofs_lzma_init(void)
{
unsigned int i;
@@ -70,7 +69,7 @@ int __init z_erofs_lzma_init(void)
return 0;
}
-int z_erofs_load_lzma_config(struct super_block *sb,
+static int z_erofs_load_lzma_config(struct super_block *sb,
struct erofs_super_block *dsb, void *data, int size)
{
static DEFINE_MUTEX(lzma_resize_mutex);
@@ -147,26 +146,28 @@ again:
return err;
}
-int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pgpl)
+static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int nrpages_in =
- PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
- unsigned int inlen, outlen, pageofs;
+ struct super_block *sb = rq->sb;
+ struct z_erofs_stream_dctx dctx = {
+ .rq = rq,
+ .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
+ .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
+ >> PAGE_SHIFT,
+ .no = -1, .ni = 0,
+ };
+ struct xz_buf buf = {};
struct z_erofs_lzma *strm;
- u8 *kin;
- bool bounced = false;
- int no, ni, j, err = 0;
+ enum xz_ret xz_err;
+ int err;
/* 1. get the exact LZMA compressed size */
- kin = kmap(*rq->in);
- err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
- min_t(unsigned int, rq->inputsize,
- rq->sb->s_blocksize - rq->pageofs_in));
+ dctx.kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
if (err) {
- kunmap(*rq->in);
+ kunmap_local(dctx.kin);
return err;
}
@@ -183,111 +184,45 @@ again:
spin_unlock(&z_erofs_lzma_lock);
/* 3. multi-call decompress */
- inlen = rq->inputsize;
- outlen = rq->outputsize;
- xz_dec_microlzma_reset(strm->state, inlen, outlen,
+ xz_dec_microlzma_reset(strm->state, rq->inputsize, rq->outputsize,
!rq->partial_decoding);
- pageofs = rq->pageofs_out;
- strm->buf.in = kin + rq->pageofs_in;
- strm->buf.in_pos = 0;
- strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in);
- inlen -= strm->buf.in_size;
- strm->buf.out = NULL;
- strm->buf.out_pos = 0;
- strm->buf.out_size = 0;
-
- for (ni = 0, no = -1;;) {
- enum xz_ret xz_err;
-
- if (strm->buf.out_pos == strm->buf.out_size) {
- if (strm->buf.out) {
- kunmap(rq->out[no]);
- strm->buf.out = NULL;
- }
-
- if (++no >= nrpages_out || !outlen) {
- erofs_err(rq->sb, "decompressed buf out of bound");
- err = -EFSCORRUPTED;
- break;
- }
- strm->buf.out_pos = 0;
- strm->buf.out_size = min_t(u32, outlen,
- PAGE_SIZE - pageofs);
- outlen -= strm->buf.out_size;
- if (!rq->out[no] && rq->fillgaps) { /* deduped */
- rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
- if (!rq->out[no]) {
- err = -ENOMEM;
- break;
- }
- set_page_private(rq->out[no],
- Z_EROFS_SHORTLIVED_PAGE);
- }
- if (rq->out[no])
- strm->buf.out = kmap(rq->out[no]) + pageofs;
- pageofs = 0;
- } else if (strm->buf.in_pos == strm->buf.in_size) {
- kunmap(rq->in[ni]);
-
- if (++ni >= nrpages_in || !inlen) {
- erofs_err(rq->sb, "compressed buf out of bound");
- err = -EFSCORRUPTED;
- break;
- }
- strm->buf.in_pos = 0;
- strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
- inlen -= strm->buf.in_size;
- kin = kmap(rq->in[ni]);
- strm->buf.in = kin;
- bounced = false;
- }
+ buf.in_size = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= buf.in_size;
+ buf.in = dctx.kin + rq->pageofs_in;
+ dctx.bounce = strm->bounce;
+ do {
+ dctx.avail_out = buf.out_size - buf.out_pos;
+ dctx.inbuf_sz = buf.in_size;
+ dctx.inbuf_pos = buf.in_pos;
+ err = z_erofs_stream_switch_bufs(&dctx, (void **)&buf.out,
+ (void **)&buf.in, pgpl);
+ if (err)
+ break;
- /*
- * Handle overlapping: Use bounced buffer if the compressed
- * data is under processing; Otherwise, Use short-lived pages
- * from the on-stack pagepool where pages share with the same
- * request.
- */
- if (!bounced && rq->out[no] == rq->in[ni]) {
- memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
- strm->buf.in = strm->bounce;
- bounced = true;
+ if (buf.out_size == buf.out_pos) {
+ buf.out_size = dctx.avail_out;
+ buf.out_pos = 0;
}
- for (j = ni + 1; j < nrpages_in; ++j) {
- struct page *tmppage;
-
- if (rq->out[no] != rq->in[j])
- continue;
+ buf.in_size = dctx.inbuf_sz;
+ buf.in_pos = dctx.inbuf_pos;
- DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
- rq->in[j]));
- tmppage = erofs_allocpage(pgpl, rq->gfp);
- if (!tmppage) {
- err = -ENOMEM;
- goto failed;
- }
- set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
- copy_highpage(tmppage, rq->in[j]);
- rq->in[j] = tmppage;
- }
- xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
- DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
- DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+ xz_err = xz_dec_microlzma_run(strm->state, &buf);
+ DBG_BUGON(buf.out_pos > buf.out_size);
+ DBG_BUGON(buf.in_pos > buf.in_size);
if (xz_err != XZ_OK) {
- if (xz_err == XZ_STREAM_END && !outlen)
+ if (xz_err == XZ_STREAM_END && !rq->outputsize)
break;
- erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
+ erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
xz_err, rq->inputsize, rq->outputsize);
err = -EFSCORRUPTED;
break;
}
- }
-failed:
- if (no < nrpages_out && strm->buf.out)
- kunmap(rq->out[no]);
- if (ni < nrpages_in)
- kunmap(rq->in[ni]);
+ } while (1);
+
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
+ kunmap_local(dctx.kin);
/* 4. push back LZMA stream context to the global list */
spin_lock(&z_erofs_lzma_lock);
strm->next = z_erofs_lzma_head;
@@ -296,3 +231,11 @@ failed:
wake_up(&z_erofs_lzma_wq);
return err;
}
+
+const struct z_erofs_decompressor z_erofs_lzma_decomp = {
+ .config = z_erofs_load_lzma_config,
+ .decompress = z_erofs_lzma_decompress,
+ .init = z_erofs_lzma_init,
+ .exit = z_erofs_lzma_exit,
+ .name = "lzma"
+};
diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c
new file mode 100644
index 000000000000..7e177304967e
--- /dev/null
+++ b/fs/erofs/decompressor_zstd.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/zstd.h>
+#include "compress.h"
+
+struct z_erofs_zstd {
+ struct z_erofs_zstd *next;
+ u8 bounce[PAGE_SIZE];
+ void *wksp;
+ unsigned int wkspsz;
+};
+
+static DEFINE_SPINLOCK(z_erofs_zstd_lock);
+static unsigned int z_erofs_zstd_max_dictsize;
+static unsigned int z_erofs_zstd_nstrms, z_erofs_zstd_avail_strms;
+static struct z_erofs_zstd *z_erofs_zstd_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_zstd_wq);
+
+module_param_named(zstd_streams, z_erofs_zstd_nstrms, uint, 0444);
+
+static struct z_erofs_zstd *z_erofs_isolate_strms(bool all)
+{
+ struct z_erofs_zstd *strm;
+
+again:
+ spin_lock(&z_erofs_zstd_lock);
+ strm = z_erofs_zstd_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_zstd_lock);
+ wait_event(z_erofs_zstd_wq, READ_ONCE(z_erofs_zstd_head));
+ goto again;
+ }
+ z_erofs_zstd_head = all ? NULL : strm->next;
+ spin_unlock(&z_erofs_zstd_lock);
+ return strm;
+}
+
+static void z_erofs_zstd_exit(void)
+{
+ while (z_erofs_zstd_avail_strms) {
+ struct z_erofs_zstd *strm, *n;
+
+ for (strm = z_erofs_isolate_strms(true); strm; strm = n) {
+ n = strm->next;
+
+ kvfree(strm->wksp);
+ kfree(strm);
+ --z_erofs_zstd_avail_strms;
+ }
+ }
+}
+
+static int __init z_erofs_zstd_init(void)
+{
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_zstd_nstrms)
+ z_erofs_zstd_nstrms = num_possible_cpus();
+
+ for (; z_erofs_zstd_avail_strms < z_erofs_zstd_nstrms;
+ ++z_erofs_zstd_avail_strms) {
+ struct z_erofs_zstd *strm;
+
+ strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+ if (!strm) {
+ z_erofs_zstd_exit();
+ return -ENOMEM;
+ }
+ spin_lock(&z_erofs_zstd_lock);
+ strm->next = z_erofs_zstd_head;
+ z_erofs_zstd_head = strm;
+ spin_unlock(&z_erofs_zstd_lock);
+ }
+ return 0;
+}
+
+static int z_erofs_load_zstd_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size)
+{
+ static DEFINE_MUTEX(zstd_resize_mutex);
+ struct z_erofs_zstd_cfgs *zstd = data;
+ unsigned int dict_size, wkspsz;
+ struct z_erofs_zstd *strm, *head = NULL;
+ void *wksp;
+
+ if (!zstd || size < sizeof(struct z_erofs_zstd_cfgs) || zstd->format) {
+ erofs_err(sb, "unsupported zstd format, size=%u", size);
+ return -EINVAL;
+ }
+
+ if (zstd->windowlog > ilog2(Z_EROFS_ZSTD_MAX_DICT_SIZE) - 10) {
+ erofs_err(sb, "unsupported zstd window log %u", zstd->windowlog);
+ return -EINVAL;
+ }
+ dict_size = 1U << (zstd->windowlog + 10);
+
+ /* in case 2 z_erofs_load_zstd_config() race to avoid deadlock */
+ mutex_lock(&zstd_resize_mutex);
+ if (z_erofs_zstd_max_dictsize >= dict_size) {
+ mutex_unlock(&zstd_resize_mutex);
+ return 0;
+ }
+
+ /* 1. collect/isolate all streams for the following check */
+ while (z_erofs_zstd_avail_strms) {
+ struct z_erofs_zstd *n;
+
+ for (strm = z_erofs_isolate_strms(true); strm; strm = n) {
+ n = strm->next;
+ strm->next = head;
+ head = strm;
+ --z_erofs_zstd_avail_strms;
+ }
+ }
+
+ /* 2. walk each isolated stream and grow max dict_size if needed */
+ wkspsz = zstd_dstream_workspace_bound(dict_size);
+ for (strm = head; strm; strm = strm->next) {
+ wksp = kvmalloc(wkspsz, GFP_KERNEL);
+ if (!wksp)
+ break;
+ kvfree(strm->wksp);
+ strm->wksp = wksp;
+ strm->wkspsz = wkspsz;
+ }
+
+ /* 3. push back all to the global list and update max dict_size */
+ spin_lock(&z_erofs_zstd_lock);
+ DBG_BUGON(z_erofs_zstd_head);
+ z_erofs_zstd_head = head;
+ spin_unlock(&z_erofs_zstd_lock);
+ z_erofs_zstd_avail_strms = z_erofs_zstd_nstrms;
+ wake_up_all(&z_erofs_zstd_wq);
+ if (!strm)
+ z_erofs_zstd_max_dictsize = dict_size;
+ mutex_unlock(&zstd_resize_mutex);
+ return strm ? -ENOMEM : 0;
+}
+
+static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+ struct super_block *sb = rq->sb;
+ struct z_erofs_stream_dctx dctx = {
+ .rq = rq,
+ .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
+ .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
+ >> PAGE_SHIFT,
+ .no = -1, .ni = 0,
+ };
+ zstd_in_buffer in_buf = { NULL, 0, 0 };
+ zstd_out_buffer out_buf = { NULL, 0, 0 };
+ struct z_erofs_zstd *strm;
+ zstd_dstream *stream;
+ int zerr, err;
+
+ /* 1. get the exact compressed size */
+ dctx.kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
+ if (err) {
+ kunmap_local(dctx.kin);
+ return err;
+ }
+
+ /* 2. get an available ZSTD context */
+ strm = z_erofs_isolate_strms(false);
+
+ /* 3. multi-call decompress */
+ stream = zstd_init_dstream(z_erofs_zstd_max_dictsize, strm->wksp, strm->wkspsz);
+ if (!stream) {
+ err = -EIO;
+ goto failed_zinit;
+ }
+
+ rq->fillgaps = true; /* ZSTD doesn't support NULL output buffer */
+ in_buf.size = min_t(u32, rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= in_buf.size;
+ in_buf.src = dctx.kin + rq->pageofs_in;
+ dctx.bounce = strm->bounce;
+
+ do {
+ dctx.avail_out = out_buf.size - out_buf.pos;
+ dctx.inbuf_sz = in_buf.size;
+ dctx.inbuf_pos = in_buf.pos;
+ err = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst,
+ (void **)&in_buf.src, pgpl);
+ if (err)
+ break;
+
+ if (out_buf.size == out_buf.pos) {
+ out_buf.size = dctx.avail_out;
+ out_buf.pos = 0;
+ }
+ in_buf.size = dctx.inbuf_sz;
+ in_buf.pos = dctx.inbuf_pos;
+
+ zerr = zstd_decompress_stream(stream, &out_buf, &in_buf);
+ if (zstd_is_error(zerr) || (!zerr && rq->outputsize)) {
+ erofs_err(sb, "failed to decompress in[%u] out[%u]: %s",
+ rq->inputsize, rq->outputsize,
+ zerr ? zstd_get_error_name(zerr) : "unexpected end of stream");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ } while (rq->outputsize || out_buf.pos < out_buf.size);
+
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
+failed_zinit:
+ kunmap_local(dctx.kin);
+ /* 4. push back ZSTD stream context to the global list */
+ spin_lock(&z_erofs_zstd_lock);
+ strm->next = z_erofs_zstd_head;
+ z_erofs_zstd_head = strm;
+ spin_unlock(&z_erofs_zstd_lock);
+ wake_up(&z_erofs_zstd_wq);
+ return err;
+}
+
+const struct z_erofs_decompressor z_erofs_zstd_decomp = {
+ .config = z_erofs_load_zstd_config,
+ .decompress = z_erofs_zstd_decompress,
+ .init = z_erofs_zstd_init,
+ .exit = z_erofs_zstd_exit,
+ .name = "zstd",
+};
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index b80abec0531a..c3b90abdee37 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -8,19 +8,15 @@
static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
void *dentry_blk, struct erofs_dirent *de,
- unsigned int nameoff, unsigned int maxsize)
+ unsigned int nameoff0, unsigned int maxsize)
{
- const struct erofs_dirent *end = dentry_blk + nameoff;
+ const struct erofs_dirent *end = dentry_blk + nameoff0;
while (de < end) {
- const char *de_name;
+ unsigned char d_type = fs_ftype_to_dtype(de->file_type);
+ unsigned int nameoff = le16_to_cpu(de->nameoff);
+ const char *de_name = (char *)dentry_blk + nameoff;
unsigned int de_namelen;
- unsigned char d_type;
-
- d_type = fs_ftype_to_dtype(de->file_type);
-
- nameoff = le16_to_cpu(de->nameoff);
- de_name = (char *)dentry_blk + nameoff;
/* the last dirent in the block? */
if (de + 1 >= end)
@@ -52,21 +48,20 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct super_block *sb = dir->i_sb;
unsigned long bsz = sb->s_blocksize;
- const size_t dirsize = i_size_read(dir);
- unsigned int i = erofs_blknr(sb, ctx->pos);
unsigned int ofs = erofs_blkoff(sb, ctx->pos);
int err = 0;
bool initial = true;
- buf.inode = dir;
- while (ctx->pos < dirsize) {
+ buf.mapping = dir->i_mapping;
+ while (ctx->pos < dir->i_size) {
+ erofs_off_t dbstart = ctx->pos - ofs;
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- de = erofs_bread(&buf, i, EROFS_KMAP);
+ de = erofs_bread(&buf, dbstart, EROFS_KMAP);
if (IS_ERR(de)) {
erofs_err(sb, "fail to readdir of logical block %u of nid %llu",
- i, EROFS_I(dir)->nid);
+ erofs_blknr(sb, dbstart), EROFS_I(dir)->nid);
err = PTR_ERR(de);
break;
}
@@ -79,25 +74,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
break;
}
- maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz);
-
+ maxsize = min_t(unsigned int, dir->i_size - dbstart, bsz);
/* search dirents at the arbitrary position */
if (initial) {
initial = false;
-
ofs = roundup(ofs, sizeof(struct erofs_dirent));
- ctx->pos = erofs_pos(sb, i) + ofs;
- if (ofs >= nameoff)
- goto skip_this;
+ ctx->pos = dbstart + ofs;
}
err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs,
nameoff, maxsize);
if (err)
break;
-skip_this:
- ctx->pos = erofs_pos(sb, i) + maxsize;
- ++i;
+ ctx->pos = dbstart + maxsize;
ofs = 0;
}
erofs_put_metabuf(&buf);
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index a03ec70ba6f2..199395ed1c1f 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -9,6 +9,7 @@
#ifndef __EROFS_FS_H
#define __EROFS_FS_H
+/* to allow for x86 boot sectors and other oddities. */
#define EROFS_SUPER_OFFSET 1024
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
@@ -54,7 +55,7 @@ struct erofs_deviceslot {
/* erofs on-disk super block (currently 128 bytes) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
- __le32 checksum; /* crc32c(super_block) */
+ __le32 checksum; /* crc32c to avoid unexpected on-disk overlap */
__le32 feature_compat;
__u8 blkszbits; /* filesystem block size in bit shift */
__u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */
@@ -288,14 +289,18 @@ struct erofs_dirent {
#define EROFS_NAME_LEN 255
-/* maximum supported size of a physical compression cluster */
+/* maximum supported encoded size of a physical compressed cluster */
#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024)
+/* maximum supported decoded size of a physical compressed cluster */
+#define Z_EROFS_PCLUSTER_MAX_DSIZE (12 * 1024 * 1024)
+
/* available compression algorithm types (for h_algorithmtype) */
enum {
Z_EROFS_COMPRESSION_LZ4 = 0,
Z_EROFS_COMPRESSION_LZMA = 1,
Z_EROFS_COMPRESSION_DEFLATE = 2,
+ Z_EROFS_COMPRESSION_ZSTD = 3,
Z_EROFS_COMPRESSION_MAX
};
#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
@@ -322,6 +327,15 @@ struct z_erofs_deflate_cfgs {
u8 reserved[5];
} __packed;
+/* 6 bytes (+ length field = 8 bytes) */
+struct z_erofs_zstd_cfgs {
+ u8 format;
+ u8 windowlog; /* windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN(10) */
+ u8 reserved[4];
+} __packed;
+
+#define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE
+
/*
* bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
* e.g. for 4k logical cluster size, 4B if compacted 2B is off;
@@ -396,8 +410,7 @@ enum {
Z_EROFS_LCLUSTER_TYPE_MAX
};
-#define Z_EROFS_LI_LCLUSTER_TYPE_BITS 2
-#define Z_EROFS_LI_LCLUSTER_TYPE_BIT 0
+#define Z_EROFS_LI_LCLUSTER_TYPE_MASK (Z_EROFS_LCLUSTER_TYPE_MAX - 1)
/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */
#define Z_EROFS_LI_PARTIAL_REF (1 << 15)
@@ -451,8 +464,6 @@ static inline void erofs_check_ondisk_layout_definitions(void)
sizeof(struct z_erofs_lcluster_index));
BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
- BUILD_BUG_ON(BIT(Z_EROFS_LI_LCLUSTER_TYPE_BITS) <
- Z_EROFS_LCLUSTER_TYPE_MAX - 1);
/* exclude old compiler versions like gcc 7.5.0 */
BUILD_BUG_ON(__builtin_constant_p(fmh) ?
fmh != cpu_to_le64(1ULL << 63) : 0);
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
new file mode 100644
index 000000000000..0ffd1c63beeb
--- /dev/null
+++ b/fs/erofs/fileio.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#include "internal.h"
+#include <trace/events/erofs.h>
+
+struct erofs_fileio_rq {
+ struct bio_vec bvecs[16];
+ struct bio bio;
+ struct kiocb iocb;
+ struct super_block *sb;
+};
+
+struct erofs_fileio {
+ struct erofs_map_blocks map;
+ struct erofs_map_dev dev;
+ struct erofs_fileio_rq *rq;
+};
+
+static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
+{
+ struct erofs_fileio_rq *rq =
+ container_of(iocb, struct erofs_fileio_rq, iocb);
+ struct folio_iter fi;
+
+ if (ret > 0) {
+ if (ret != rq->bio.bi_iter.bi_size) {
+ bio_advance(&rq->bio, ret);
+ zero_fill_bio(&rq->bio);
+ }
+ ret = 0;
+ }
+ if (rq->bio.bi_end_io) {
+ rq->bio.bi_end_io(&rq->bio);
+ } else {
+ bio_for_each_folio_all(fi, &rq->bio) {
+ DBG_BUGON(folio_test_uptodate(fi.folio));
+ erofs_onlinefolio_end(fi.folio, ret);
+ }
+ }
+ bio_uninit(&rq->bio);
+ kfree(rq);
+}
+
+static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
+{
+ struct iov_iter iter;
+ int ret;
+
+ if (!rq)
+ return;
+ rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ rq->iocb.ki_ioprio = get_current_ioprio();
+ rq->iocb.ki_complete = erofs_fileio_ki_complete;
+ if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) &&
+ rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT)
+ rq->iocb.ki_flags = IOCB_DIRECT;
+ iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt,
+ rq->bio.bi_iter.bi_size);
+ ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
+ if (ret != -EIOCBQUEUED)
+ erofs_fileio_ki_complete(&rq->iocb, ret);
+}
+
+static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev)
+{
+ struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq),
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ bio_init(&rq->bio, NULL, rq->bvecs, ARRAY_SIZE(rq->bvecs), REQ_OP_READ);
+ rq->iocb.ki_filp = mdev->m_dif->file;
+ rq->sb = mdev->m_sb;
+ return rq;
+}
+
+struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev)
+{
+ return &erofs_fileio_rq_alloc(mdev)->bio;
+}
+
+void erofs_fileio_submit_bio(struct bio *bio)
+{
+ return erofs_fileio_rq_submit(container_of(bio, struct erofs_fileio_rq,
+ bio));
+}
+
+static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
+{
+ struct inode *inode = folio_inode(folio);
+ struct erofs_map_blocks *map = &io->map;
+ unsigned int cur = 0, end = folio_size(folio), len, attached = 0;
+ loff_t pos = folio_pos(folio), ofs;
+ struct iov_iter iter;
+ struct bio_vec bv;
+ int err = 0;
+
+ erofs_onlinefolio_init(folio);
+ while (cur < end) {
+ if (!in_range(pos + cur, map->m_la, map->m_llen)) {
+ map->m_la = pos + cur;
+ map->m_llen = end - cur;
+ err = erofs_map_blocks(inode, map);
+ if (err)
+ break;
+ }
+
+ ofs = folio_pos(folio) + cur - map->m_la;
+ len = min_t(loff_t, map->m_llen - ofs, end - cur);
+ if (map->m_flags & EROFS_MAP_META) {
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ void *src;
+
+ src = erofs_read_metabuf(&buf, inode->i_sb,
+ map->m_pa + ofs, EROFS_KMAP);
+ if (IS_ERR(src)) {
+ err = PTR_ERR(src);
+ break;
+ }
+ bvec_set_folio(&bv, folio, len, cur);
+ iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len);
+ if (copy_to_iter(src, len, &iter) != len) {
+ erofs_put_metabuf(&buf);
+ err = -EIO;
+ break;
+ }
+ erofs_put_metabuf(&buf);
+ } else if (!(map->m_flags & EROFS_MAP_MAPPED)) {
+ folio_zero_segment(folio, cur, cur + len);
+ attached = 0;
+ } else {
+ if (io->rq && (map->m_pa + ofs != io->dev.m_pa ||
+ map->m_deviceid != io->dev.m_deviceid)) {
+io_retry:
+ erofs_fileio_rq_submit(io->rq);
+ io->rq = NULL;
+ }
+
+ if (!io->rq) {
+ io->dev = (struct erofs_map_dev) {
+ .m_pa = io->map.m_pa + ofs,
+ .m_deviceid = io->map.m_deviceid,
+ };
+ err = erofs_map_dev(inode->i_sb, &io->dev);
+ if (err)
+ break;
+ io->rq = erofs_fileio_rq_alloc(&io->dev);
+ io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9;
+ attached = 0;
+ }
+ if (!attached++)
+ erofs_onlinefolio_split(folio);
+ if (!bio_add_folio(&io->rq->bio, folio, len, cur))
+ goto io_retry;
+ io->dev.m_pa += len;
+ }
+ cur += len;
+ }
+ erofs_onlinefolio_end(folio, err);
+ return err;
+}
+
+static int erofs_fileio_read_folio(struct file *file, struct folio *folio)
+{
+ struct erofs_fileio io = {};
+ int err;
+
+ trace_erofs_read_folio(folio, true);
+ err = erofs_fileio_scan_folio(&io, folio);
+ erofs_fileio_rq_submit(io.rq);
+ return err;
+}
+
+static void erofs_fileio_readahead(struct readahead_control *rac)
+{
+ struct inode *inode = rac->mapping->host;
+ struct erofs_fileio io = {};
+ struct folio *folio;
+ int err;
+
+ trace_erofs_readpages(inode, readahead_index(rac),
+ readahead_count(rac), true);
+ while ((folio = readahead_folio(rac))) {
+ err = erofs_fileio_scan_folio(&io, folio);
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
+ folio->index, EROFS_I(inode)->nid);
+ }
+ erofs_fileio_rq_submit(io.rq);
+}
+
+const struct address_space_operations erofs_fileio_aops = {
+ .read_folio = erofs_fileio_read_folio,
+ .readahead = erofs_fileio_readahead,
+};
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 89a7c2453aae..ce3d8737df85 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -3,6 +3,7 @@
* Copyright (C) 2022, Alibaba Cloud
* Copyright (C) 2022, Bytedance Inc. All rights reserved.
*/
+#include <linux/pseudo_fs.h>
#include <linux/fscache.h>
#include "internal.h"
@@ -12,9 +13,27 @@ static LIST_HEAD(erofs_domain_list);
static LIST_HEAD(erofs_domain_cookies_list);
static struct vfsmount *erofs_pseudo_mnt;
-struct erofs_fscache_request {
- struct erofs_fscache_request *primary;
- struct netfs_cache_resources cache_resources;
+static int erofs_anon_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type erofs_anon_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "pseudo_erofs",
+ .init_fs_context = erofs_anon_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+struct erofs_fscache_io {
+ struct netfs_cache_resources cres;
+ struct iov_iter iter;
+ netfs_io_terminated_t end_io;
+ void *private;
+ refcount_t ref;
+};
+
+struct erofs_fscache_rq {
struct address_space *mapping; /* The mapping being accessed */
loff_t start; /* Start position */
size_t len; /* Length of the request */
@@ -23,44 +42,17 @@ struct erofs_fscache_request {
refcount_t ref;
};
-static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping,
- loff_t start, size_t len)
+static bool erofs_fscache_io_put(struct erofs_fscache_io *io)
{
- struct erofs_fscache_request *req;
-
- req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL);
- if (!req)
- return ERR_PTR(-ENOMEM);
-
- req->mapping = mapping;
- req->start = start;
- req->len = len;
- refcount_set(&req->ref, 1);
-
- return req;
+ if (!refcount_dec_and_test(&io->ref))
+ return false;
+ if (io->cres.ops)
+ io->cres.ops->end_operation(&io->cres);
+ kfree(io);
+ return true;
}
-static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
- size_t len)
-{
- struct erofs_fscache_request *req;
-
- /* use primary request for the first submission */
- if (!primary->submitted) {
- refcount_inc(&primary->ref);
- return primary;
- }
-
- req = erofs_fscache_req_alloc(primary->mapping,
- primary->start + primary->submitted, len);
- if (!IS_ERR(req)) {
- req->primary = primary;
- refcount_inc(&primary->ref);
- }
- return req;
-}
-
-static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
+static void erofs_fscache_req_complete(struct erofs_fscache_rq *req)
{
struct folio *folio;
bool failed = req->error;
@@ -80,120 +72,196 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
rcu_read_unlock();
}
-static void erofs_fscache_req_put(struct erofs_fscache_request *req)
+static void erofs_fscache_req_put(struct erofs_fscache_rq *req)
{
- if (refcount_dec_and_test(&req->ref)) {
- if (req->cache_resources.ops)
- req->cache_resources.ops->end_operation(&req->cache_resources);
- if (!req->primary)
- erofs_fscache_req_complete(req);
- else
- erofs_fscache_req_put(req->primary);
- kfree(req);
- }
+ if (!refcount_dec_and_test(&req->ref))
+ return;
+ erofs_fscache_req_complete(req);
+ kfree(req);
}
-static void erofs_fscache_subreq_complete(void *priv,
+static struct erofs_fscache_rq *erofs_fscache_req_alloc(struct address_space *mapping,
+ loff_t start, size_t len)
+{
+ struct erofs_fscache_rq *req = kzalloc(sizeof(*req), GFP_KERNEL);
+
+ if (!req)
+ return NULL;
+ req->mapping = mapping;
+ req->start = start;
+ req->len = len;
+ refcount_set(&req->ref, 1);
+ return req;
+}
+
+static void erofs_fscache_req_io_put(struct erofs_fscache_io *io)
+{
+ struct erofs_fscache_rq *req = io->private;
+
+ if (erofs_fscache_io_put(io))
+ erofs_fscache_req_put(req);
+}
+
+static void erofs_fscache_req_end_io(void *priv,
ssize_t transferred_or_error, bool was_async)
{
- struct erofs_fscache_request *req = priv;
+ struct erofs_fscache_io *io = priv;
+ struct erofs_fscache_rq *req = io->private;
- if (IS_ERR_VALUE(transferred_or_error)) {
- if (req->primary)
- req->primary->error = transferred_or_error;
- else
- req->error = transferred_or_error;
- }
- erofs_fscache_req_put(req);
+ if (IS_ERR_VALUE(transferred_or_error))
+ req->error = transferred_or_error;
+ erofs_fscache_req_io_put(io);
+}
+
+static struct erofs_fscache_io *erofs_fscache_req_io_alloc(struct erofs_fscache_rq *req)
+{
+ struct erofs_fscache_io *io = kzalloc(sizeof(*io), GFP_KERNEL);
+
+ if (!io)
+ return NULL;
+ io->end_io = erofs_fscache_req_end_io;
+ io->private = req;
+ refcount_inc(&req->ref);
+ refcount_set(&io->ref, 1);
+ return io;
}
/*
- * Read data from fscache (cookie, pstart, len), and fill the read data into
- * page cache described by (req->mapping, lstart, len). @pstart describeis the
- * start physical address in the cache file.
+ * Read data from fscache described by cookie at pstart physical address
+ * offset, and fill the read data into buffer described by io->iter.
*/
-static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
- struct erofs_fscache_request *req, loff_t pstart, size_t len)
+static int erofs_fscache_read_io_async(struct fscache_cookie *cookie,
+ loff_t pstart, struct erofs_fscache_io *io)
{
enum netfs_io_source source;
- struct super_block *sb = req->mapping->host->i_sb;
- struct netfs_cache_resources *cres = &req->cache_resources;
- struct iov_iter iter;
- loff_t lstart = req->start + req->submitted;
- size_t done = 0;
+ struct netfs_cache_resources *cres = &io->cres;
+ struct iov_iter *iter = &io->iter;
int ret;
- DBG_BUGON(len > req->len - req->submitted);
-
ret = fscache_begin_read_operation(cres, cookie);
if (ret)
return ret;
- while (done < len) {
- loff_t sstart = pstart + done;
- size_t slen = len - done;
+ while (iov_iter_count(iter)) {
+ size_t orig_count = iov_iter_count(iter), len = orig_count;
unsigned long flags = 1 << NETFS_SREQ_ONDEMAND;
source = cres->ops->prepare_ondemand_read(cres,
- sstart, &slen, LLONG_MAX, &flags, 0);
- if (WARN_ON(slen == 0))
+ pstart, &len, LLONG_MAX, &flags, 0);
+ if (WARN_ON(len == 0))
source = NETFS_INVALID_READ;
if (source != NETFS_READ_FROM_CACHE) {
- erofs_err(sb, "failed to fscache prepare_read (source %d)", source);
+ erofs_err(NULL, "prepare_ondemand_read failed (source %d)", source);
return -EIO;
}
- refcount_inc(&req->ref);
- iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages,
- lstart + done, slen);
-
- ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL,
- erofs_fscache_subreq_complete, req);
+ iov_iter_truncate(iter, len);
+ refcount_inc(&io->ref);
+ ret = fscache_read(cres, pstart, iter, NETFS_READ_HOLE_FAIL,
+ io->end_io, io);
if (ret == -EIOCBQUEUED)
ret = 0;
if (ret) {
- erofs_err(sb, "failed to fscache_read (ret %d)", ret);
+ erofs_err(NULL, "fscache_read failed (ret %d)", ret);
return ret;
}
+ if (WARN_ON(iov_iter_count(iter)))
+ return -EIO;
- done += slen;
+ iov_iter_reexpand(iter, orig_count - len);
+ pstart += len;
}
- DBG_BUGON(done != len);
return 0;
}
-static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+struct erofs_fscache_bio {
+ struct erofs_fscache_io io;
+ struct bio bio; /* w/o bdev to share bio_add_page/endio() */
+ struct bio_vec bvecs[BIO_MAX_VECS];
+};
+
+static void erofs_fscache_bio_endio(void *priv,
+ ssize_t transferred_or_error, bool was_async)
+{
+ struct erofs_fscache_bio *io = priv;
+
+ if (IS_ERR_VALUE(transferred_or_error))
+ io->bio.bi_status = errno_to_blk_status(transferred_or_error);
+ io->bio.bi_end_io(&io->bio);
+ BUILD_BUG_ON(offsetof(struct erofs_fscache_bio, io) != 0);
+ erofs_fscache_io_put(&io->io);
+}
+
+struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev)
{
+ struct erofs_fscache_bio *io;
+
+ io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL);
+ bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ);
+ io->io.private = mdev->m_dif->fscache->cookie;
+ io->io.end_io = erofs_fscache_bio_endio;
+ refcount_set(&io->io.ref, 1);
+ return &io->bio;
+}
+
+void erofs_fscache_submit_bio(struct bio *bio)
+{
+ struct erofs_fscache_bio *io = container_of(bio,
+ struct erofs_fscache_bio, bio);
int ret;
+
+ iov_iter_bvec(&io->io.iter, ITER_DEST, io->bvecs, bio->bi_vcnt,
+ bio->bi_iter.bi_size);
+ ret = erofs_fscache_read_io_async(io->io.private,
+ bio->bi_iter.bi_sector << 9, &io->io);
+ erofs_fscache_io_put(&io->io);
+ if (!ret)
+ return;
+ bio->bi_status = errno_to_blk_status(ret);
+ bio->bi_end_io(bio);
+}
+
+static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+{
struct erofs_fscache *ctx = folio->mapping->host->i_private;
- struct erofs_fscache_request *req;
+ int ret = -ENOMEM;
+ struct erofs_fscache_rq *req;
+ struct erofs_fscache_io *io;
req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
- if (IS_ERR(req)) {
+ if (!req) {
folio_unlock(folio);
- return PTR_ERR(req);
+ return ret;
}
- ret = erofs_fscache_read_folios_async(ctx->cookie, req,
- folio_pos(folio), folio_size(folio));
+ io = erofs_fscache_req_io_alloc(req);
+ if (!io) {
+ req->error = ret;
+ goto out;
+ }
+ iov_iter_xarray(&io->iter, ITER_DEST, &folio->mapping->i_pages,
+ folio_pos(folio), folio_size(folio));
+
+ ret = erofs_fscache_read_io_async(ctx->cookie, folio_pos(folio), io);
if (ret)
req->error = ret;
+ erofs_fscache_req_io_put(io);
+out:
erofs_fscache_req_put(req);
return ret;
}
-static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
+static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
{
- struct address_space *mapping = primary->mapping;
+ struct address_space *mapping = req->mapping;
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
- struct erofs_fscache_request *req;
+ struct erofs_fscache_io *io;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
- struct iov_iter iter;
- loff_t pos = primary->start + primary->submitted;
+ loff_t pos = req->start + req->submitted;
size_t count;
int ret;
@@ -204,35 +272,32 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
if (map.m_flags & EROFS_MAP_META) {
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- erofs_blk_t blknr;
- size_t offset, size;
+ struct iov_iter iter;
+ size_t size = map.m_llen;
void *src;
- /* For tail packing layout, the offset may be non-zero. */
- offset = erofs_blkoff(sb, map.m_pa);
- blknr = erofs_blknr(sb, map.m_pa);
- size = map.m_llen;
-
- src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
+ src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP);
if (IS_ERR(src))
return PTR_ERR(src);
iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE);
- if (copy_to_iter(src + offset, size, &iter) != size) {
+ if (copy_to_iter(src, size, &iter) != size) {
erofs_put_metabuf(&buf);
return -EFAULT;
}
iov_iter_zero(PAGE_SIZE - size, &iter);
erofs_put_metabuf(&buf);
- primary->submitted += PAGE_SIZE;
+ req->submitted += PAGE_SIZE;
return 0;
}
- count = primary->len - primary->submitted;
+ count = req->len - req->submitted;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ struct iov_iter iter;
+
iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
iov_iter_zero(count, &iter);
- primary->submitted += count;
+ req->submitted += count;
return 0;
}
@@ -247,18 +312,19 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
if (ret)
return ret;
- req = erofs_fscache_req_chain(primary, count);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ io = erofs_fscache_req_io_alloc(req);
+ if (!io)
+ return -ENOMEM;
+ iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count);
+ ret = erofs_fscache_read_io_async(mdev.m_dif->fscache->cookie,
+ mdev.m_pa + (pos - map.m_la), io);
+ erofs_fscache_req_io_put(io);
- ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- req, mdev.m_pa + (pos - map.m_la), count);
- erofs_fscache_req_put(req);
- primary->submitted += count;
+ req->submitted += count;
return ret;
}
-static int erofs_fscache_data_read(struct erofs_fscache_request *req)
+static int erofs_fscache_data_read(struct erofs_fscache_rq *req)
{
int ret;
@@ -267,20 +333,19 @@ static int erofs_fscache_data_read(struct erofs_fscache_request *req)
if (ret)
req->error = ret;
} while (!ret && req->submitted < req->len);
-
return ret;
}
static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
{
- struct erofs_fscache_request *req;
+ struct erofs_fscache_rq *req;
int ret;
req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
- if (IS_ERR(req)) {
+ if (!req) {
folio_unlock(folio);
- return PTR_ERR(req);
+ return -ENOMEM;
}
ret = erofs_fscache_data_read(req);
@@ -290,14 +355,14 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
static void erofs_fscache_readahead(struct readahead_control *rac)
{
- struct erofs_fscache_request *req;
+ struct erofs_fscache_rq *req;
if (!readahead_count(rac))
return;
req = erofs_fscache_req_alloc(rac->mapping,
readahead_pos(rac), readahead_length(rac));
- if (IS_ERR(req))
+ if (!req)
return;
/* The request completion will drop refs on the folios. */
@@ -381,7 +446,7 @@ static int erofs_fscache_init_domain(struct super_block *sb)
goto out;
if (!erofs_pseudo_mnt) {
- struct vfsmount *mnt = kern_mount(&erofs_fs_type);
+ struct vfsmount *mnt = kern_mount(&erofs_anon_fs_type);
if (IS_ERR(mnt)) {
err = PTR_ERR(mnt);
goto out;
@@ -592,7 +657,7 @@ int erofs_fscache_register_fs(struct super_block *sb)
if (IS_ERR(fscache))
return PTR_ERR(fscache);
- sbi->s_fscache = fscache;
+ sbi->dif0.fscache = fscache;
return 0;
}
@@ -600,14 +665,14 @@ void erofs_fscache_unregister_fs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- erofs_fscache_unregister_cookie(sbi->s_fscache);
+ erofs_fscache_unregister_cookie(sbi->dif0.fscache);
if (sbi->domain)
erofs_fscache_domain_put(sbi->domain);
else
fscache_relinquish_volume(sbi->volume, NULL, false);
- sbi->s_fscache = NULL;
+ sbi->dif0.fscache = NULL;
sbi->volume = NULL;
sbi->domain = NULL;
}
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 36e638e8b53a..d4b89407822a 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -5,11 +5,26 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
-
#include <trace/events/erofs.h>
-static void *erofs_read_inode(struct erofs_buf *buf,
- struct inode *inode, unsigned int *ofs)
+static int erofs_fill_symlink(struct inode *inode, void *kaddr,
+ unsigned int m_pofs)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ loff_t off;
+
+ m_pofs += vi->xattr_isize;
+ /* check if it cannot be handled with fast symlink scheme */
+ if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
+ check_add_overflow(m_pofs, inode->i_size, &off) ||
+ off > i_blocksize(inode))
+ return 0;
+
+ inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL);
+ return inode->i_link ? 0 : -ENOMEM;
+}
+
+static int erofs_read_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -20,20 +35,21 @@ static void *erofs_read_inode(struct erofs_buf *buf,
struct erofs_inode_compact *dic;
struct erofs_inode_extended *die, *copied = NULL;
union erofs_inode_i_u iu;
- unsigned int ifmt;
- int err;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ unsigned int ifmt, ofs;
+ int err = 0;
blkaddr = erofs_blknr(sb, inode_loc);
- *ofs = erofs_blkoff(sb, inode_loc);
+ ofs = erofs_blkoff(sb, inode_loc);
- kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP);
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP);
if (IS_ERR(kaddr)) {
erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
vi->nid, PTR_ERR(kaddr));
- return kaddr;
+ return PTR_ERR(kaddr);
}
- dic = kaddr + *ofs;
+ dic = kaddr + ofs;
ifmt = le16_to_cpu(dic->i_format);
if (ifmt & ~EROFS_I_ALL) {
erofs_err(sb, "unsupported i_format %u of nid %llu",
@@ -54,11 +70,11 @@ static void *erofs_read_inode(struct erofs_buf *buf,
case EROFS_INODE_LAYOUT_EXTENDED:
vi->inode_isize = sizeof(struct erofs_inode_extended);
/* check if the extended inode acrosses block boundary */
- if (*ofs + vi->inode_isize <= sb->s_blocksize) {
- *ofs += vi->inode_isize;
+ if (ofs + vi->inode_isize <= sb->s_blocksize) {
+ ofs += vi->inode_isize;
die = (struct erofs_inode_extended *)dic;
} else {
- const unsigned int gotten = sb->s_blocksize - *ofs;
+ const unsigned int gotten = sb->s_blocksize - ofs;
copied = kmalloc(vi->inode_isize, GFP_KERNEL);
if (!copied) {
@@ -66,16 +82,16 @@ static void *erofs_read_inode(struct erofs_buf *buf,
goto err_out;
}
memcpy(copied, dic, gotten);
- kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1,
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1),
EROFS_KMAP);
if (IS_ERR(kaddr)) {
erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
vi->nid, PTR_ERR(kaddr));
kfree(copied);
- return kaddr;
+ return PTR_ERR(kaddr);
}
- *ofs = vi->inode_isize - gotten;
- memcpy((u8 *)copied + gotten, kaddr, *ofs);
+ ofs = vi->inode_isize - gotten;
+ memcpy((u8 *)copied + gotten, kaddr, ofs);
die = copied;
}
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
@@ -91,11 +107,10 @@ static void *erofs_read_inode(struct erofs_buf *buf,
inode->i_size = le64_to_cpu(die->i_size);
kfree(copied);
- copied = NULL;
break;
case EROFS_INODE_LAYOUT_COMPACT:
vi->inode_isize = sizeof(struct erofs_inode_compact);
- *ofs += vi->inode_isize;
+ ofs += vi->inode_isize;
vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
inode->i_mode = le16_to_cpu(dic->i_mode);
@@ -115,11 +130,21 @@ static void *erofs_read_inode(struct erofs_buf *buf,
goto err_out;
}
+ if (unlikely(inode->i_size < 0)) {
+ erofs_err(sb, "negative i_size @ nid %llu", vi->nid);
+ err = -EFSCORRUPTED;
+ goto err_out;
+ }
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
case S_IFDIR:
case S_IFLNK:
vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr);
+ if(S_ISLNK(inode->i_mode)) {
+ err = erofs_fill_symlink(inode, kaddr, ofs);
+ if (err)
+ goto err_out;
+ }
break;
case S_IFCHR:
case S_IFBLK:
@@ -165,65 +190,23 @@ static void *erofs_read_inode(struct erofs_buf *buf,
inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
else
inode->i_blocks = nblks << (sb->s_blocksize_bits - 9);
- return kaddr;
-
err_out:
- DBG_BUGON(1);
- kfree(copied);
- erofs_put_metabuf(buf);
- return ERR_PTR(err);
-}
-
-static int erofs_fill_symlink(struct inode *inode, void *kaddr,
- unsigned int m_pofs)
-{
- struct erofs_inode *vi = EROFS_I(inode);
- unsigned int bsz = i_blocksize(inode);
- char *lnk;
-
- /* if it cannot be handled with fast symlink scheme */
- if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
- inode->i_size >= bsz || inode->i_size < 0) {
- inode->i_op = &erofs_symlink_iops;
- return 0;
- }
-
- lnk = kmalloc(inode->i_size + 1, GFP_KERNEL);
- if (!lnk)
- return -ENOMEM;
-
- m_pofs += vi->xattr_isize;
- /* inline symlink data shouldn't cross block boundary */
- if (m_pofs + inode->i_size > bsz) {
- kfree(lnk);
- erofs_err(inode->i_sb,
- "inline data cross block boundary @ nid %llu",
- vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- memcpy(lnk, kaddr + m_pofs, inode->i_size);
- lnk[inode->i_size] = '\0';
-
- inode->i_link = lnk;
- inode->i_op = &erofs_fast_symlink_iops;
- return 0;
+ DBG_BUGON(err);
+ erofs_put_metabuf(&buf);
+ return err;
}
static int erofs_fill_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- void *kaddr;
- unsigned int ofs;
- int err = 0;
+ int err;
trace_erofs_fill_inode(inode);
/* read inode base data from disk */
- kaddr = erofs_read_inode(&buf, inode, &ofs);
- if (IS_ERR(kaddr))
- return PTR_ERR(kaddr);
+ err = erofs_read_inode(inode);
+ if (err)
+ return err;
/* setup the new inode */
switch (inode->i_mode & S_IFMT) {
@@ -240,9 +223,10 @@ static int erofs_fill_inode(struct inode *inode)
inode_nohighmem(inode);
break;
case S_IFLNK:
- err = erofs_fill_symlink(inode, kaddr, ofs);
- if (err)
- goto out_unlock;
+ if (inode->i_link)
+ inode->i_op = &erofs_fast_symlink_iops;
+ else
+ inode->i_op = &erofs_symlink_iops;
inode_nohighmem(inode);
break;
case S_IFCHR:
@@ -251,35 +235,33 @@ static int erofs_fill_inode(struct inode *inode)
case S_IFSOCK:
inode->i_op = &erofs_generic_iops;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
- goto out_unlock;
+ return 0;
default:
- err = -EFSCORRUPTED;
- goto out_unlock;
+ return -EFSCORRUPTED;
}
+ mapping_set_large_folios(inode->i_mapping);
if (erofs_inode_is_data_compressed(vi->datalayout)) {
#ifdef CONFIG_EROFS_FS_ZIP
- if (!erofs_is_fscache_mode(inode->i_sb)) {
- DO_ONCE_LITE_IF(inode->i_sb->s_blocksize != PAGE_SIZE,
- erofs_info, inode->i_sb,
- "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
- inode->i_mapping->a_ops = &z_erofs_aops;
- err = 0;
- goto out_unlock;
- }
-#endif
+ DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT,
+ erofs_info, inode->i_sb,
+ "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
+ inode->i_mapping->a_ops = &z_erofs_aops;
+#else
err = -EOPNOTSUPP;
- goto out_unlock;
- }
- inode->i_mapping->a_ops = &erofs_raw_access_aops;
- mapping_set_large_folios(inode->i_mapping);
+#endif
+ } else {
+ inode->i_mapping->a_ops = &erofs_aops;
#ifdef CONFIG_EROFS_FS_ONDEMAND
- if (erofs_is_fscache_mode(inode->i_sb))
- inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+ if (erofs_is_fscache_mode(inode->i_sb))
+ inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+#endif
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb)))
+ inode->i_mapping->a_ops = &erofs_fileio_aops;
#endif
+ }
-out_unlock:
- erofs_put_metabuf(&buf);
return err;
}
@@ -336,14 +318,29 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags)
{
struct inode *const inode = d_inode(path->dentry);
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ bool compressed =
+ erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout);
- if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+ if (compressed)
stat->attributes |= STATX_ATTR_COMPRESSED;
-
stat->attributes |= STATX_ATTR_IMMUTABLE;
stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE);
+ /*
+ * Return the DIO alignment restrictions if requested.
+ *
+ * In EROFS, STATX_DIOALIGN is only supported in bdev-based mode
+ * and uncompressed inodes, otherwise we report no DIO support.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ stat->result_mask |= STATX_DIOALIGN;
+ if (bdev && !compressed) {
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ }
+ }
generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index b0409badb017..686d835eb533 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -20,18 +20,12 @@
#include <linux/iomap.h>
#include "erofs_fs.h"
-/* redefine pr_fmt "erofs: " */
-#undef pr_fmt
-#define pr_fmt(fmt) "erofs: " fmt
-
-__printf(3, 4) void _erofs_err(struct super_block *sb,
- const char *function, const char *fmt, ...);
+__printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...);
#define erofs_err(sb, fmt, ...) \
- _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__)
-__printf(3, 4) void _erofs_info(struct super_block *sb,
- const char *function, const char *fmt, ...);
+ _erofs_printk(sb, KERN_ERR fmt "\n", ##__VA_ARGS__)
#define erofs_info(sb, fmt, ...) \
- _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__)
+ _erofs_printk(sb, KERN_INFO fmt "\n", ##__VA_ARGS__)
+
#ifdef CONFIG_EROFS_FS_DEBUG
#define DBG_BUGON BUG_ON
#else
@@ -49,7 +43,7 @@ typedef u32 erofs_blk_t;
struct erofs_device_info {
char *path;
struct erofs_fscache *fscache;
- struct bdev_handle *bdev_handle;
+ struct file *file;
struct dax_device *dax_dev;
u64 dax_part_off;
@@ -64,15 +58,12 @@ enum {
};
struct erofs_mount_opts {
-#ifdef CONFIG_EROFS_FS_ZIP
/* current strategy of how to use managed cache */
unsigned char cache_strategy;
/* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */
unsigned int sync_decompress;
-
/* threshold for decompression synchronously */
unsigned int max_sync_decompress_pages;
-#endif
unsigned int mount_opt;
};
@@ -84,13 +75,6 @@ struct erofs_dev_context {
bool flatdev;
};
-struct erofs_fs_context {
- struct erofs_mount_opts opt;
- struct erofs_dev_context *devs;
- char *fsid;
- char *domain_id;
-};
-
/* all filesystem-wide lz4 configurations */
struct erofs_sb_lz4_info {
/* # of pages needed for EROFS lz4 rolling decompression */
@@ -123,6 +107,7 @@ struct erofs_xattr_prefix_item {
};
struct erofs_sb_info {
+ struct erofs_device_info dif0;
struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
/* list for all registered superblocks, mainly for shrinker */
@@ -142,10 +127,7 @@ struct erofs_sb_info {
#endif /* CONFIG_EROFS_FS_ZIP */
struct inode *packed_inode;
struct erofs_dev_context *devs;
- struct dax_device *dax_dev;
- u64 dax_part_off;
u64 total_blocks;
- u32 primarydevice_blocks;
u32 meta_blkaddr;
#ifdef CONFIG_EROFS_FS_XATTR
@@ -181,7 +163,6 @@ struct erofs_sb_info {
/* fscache support */
struct fscache_volume *volume;
- struct erofs_fscache *s_fscache;
struct erofs_domain *domain;
char *fsid;
char *domain_id;
@@ -195,14 +176,21 @@ struct erofs_sb_info {
#define EROFS_MOUNT_POSIX_ACL 0x00000020
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
#define EROFS_MOUNT_DAX_NEVER 0x00000080
+#define EROFS_MOUNT_DIRECT_IO 0x00000100
#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
+static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi)
+{
+ return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file;
+}
+
static inline bool erofs_is_fscache_mode(struct super_block *sb)
{
- return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev;
+ return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) &&
+ !erofs_is_fileio_mode(EROFS_SB(sb)) && !sb->s_bdev;
}
enum {
@@ -211,26 +199,20 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-/* basic unit of the workstation of a super_block */
-struct erofs_workgroup {
- pgoff_t index;
- struct lockref lockref;
-};
-
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
EROFS_KMAP, /* use kmap_local_page() to map the buffer */
};
struct erofs_buf {
- struct inode *inode;
+ struct address_space *mapping;
+ struct file *file;
struct page *page;
void *base;
- enum erofs_kmap_type kmap_type;
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
-#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits)
+#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits))
#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1))
#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits)
#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits)
@@ -322,17 +304,13 @@ static inline unsigned int erofs_inode_datalayout(unsigned int ifmt)
return (ifmt >> EROFS_I_DATALAYOUT_BIT) & EROFS_I_DATALAYOUT_MASK;
}
-/*
- * Different from grab_cache_page_nowait(), reclaiming is never triggered
- * when allocating new pages.
- */
-static inline
-struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
- pgoff_t index)
+/* reclaiming is never triggered when allocating new folios. */
+static inline struct folio *erofs_grab_folio_nowait(struct address_space *as,
+ pgoff_t index)
{
- return pagecache_get_page(mapping, index,
+ return __filemap_get_folio(as, index,
FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
- readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
+ readahead_gfp_mask(as) & ~__GFP_RECLAIM);
}
/* Has a disk mapping */
@@ -376,19 +354,18 @@ enum {
};
struct erofs_map_dev {
- struct erofs_fscache *m_fscache;
+ struct super_block *m_sb;
+ struct erofs_device_info *m_dif;
struct block_device *m_bdev;
- struct dax_device *m_daxdev;
- u64 m_dax_part_off;
erofs_off_t m_pa;
unsigned int m_deviceid;
};
-extern struct file_system_type erofs_fs_type;
extern const struct super_operations erofs_sops;
-extern const struct address_space_operations erofs_raw_access_aops;
+extern const struct address_space_operations erofs_aops;
+extern const struct address_space_operations erofs_fileio_aops;
extern const struct address_space_operations z_erofs_aops;
extern const struct address_space_operations erofs_fscache_access_aops;
@@ -410,15 +387,18 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp);
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
-void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr,
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
enum erofs_kmap_type type);
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_blk_t blkaddr, enum erofs_kmap_type type);
+ erofs_off_t offset, enum erofs_kmap_type type);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map);
+void erofs_onlinefolio_init(struct folio *folio);
+void erofs_onlinefolio_split(struct folio *folio);
+void erofs_onlinefolio_end(struct folio *folio, int err);
struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
@@ -446,7 +426,11 @@ void erofs_unregister_sysfs(struct super_block *sb);
int __init erofs_init_sysfs(void);
void erofs_exit_sysfs(void);
-struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
+struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv);
+static inline struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
+{
+ return __erofs_allocpage(pagepool, gfp, false);
+}
static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
{
set_page_private(page, (unsigned long)*pagepool);
@@ -455,27 +439,24 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
-void erofs_workgroup_put(struct erofs_workgroup *grp);
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index);
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp);
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+
+extern atomic_long_t erofs_global_shrink_cnt;
void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
int __init erofs_init_shrinker(void);
void erofs_exit_shrinker(void);
-int __init z_erofs_init_zip_subsystem(void);
-void z_erofs_exit_zip_subsystem(void);
-int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
- struct erofs_workgroup *egrp);
+int __init z_erofs_init_subsystem(void);
+void z_erofs_exit_subsystem(void);
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
+ unsigned long nr_shrink);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags);
-void *erofs_get_pcpubuf(unsigned int requiredpages);
-void erofs_put_pcpubuf(void *ptr);
-int erofs_pcpubuf_growsize(unsigned int nrpages);
-void __init erofs_pcpubuf_init(void);
-void erofs_pcpubuf_exit(void);
+void *z_erofs_get_gbuf(unsigned int requiredpages);
+void z_erofs_put_gbuf(void *ptr);
+int z_erofs_gbuf_growsize(unsigned int nrpages);
+int __init z_erofs_gbuf_init(void);
+void z_erofs_gbuf_exit(void);
int erofs_init_managed_cache(struct super_block *sb);
int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
#else
@@ -483,28 +464,18 @@ static inline void erofs_shrinker_register(struct super_block *sb) {}
static inline void erofs_shrinker_unregister(struct super_block *sb) {}
static inline int erofs_init_shrinker(void) { return 0; }
static inline void erofs_exit_shrinker(void) {}
-static inline int z_erofs_init_zip_subsystem(void) { return 0; }
-static inline void z_erofs_exit_zip_subsystem(void) {}
-static inline void erofs_pcpubuf_init(void) {}
-static inline void erofs_pcpubuf_exit(void) {}
+static inline int z_erofs_init_subsystem(void) { return 0; }
+static inline void z_erofs_exit_subsystem(void) {}
static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
#endif /* !CONFIG_EROFS_FS_ZIP */
-#ifdef CONFIG_EROFS_FS_ZIP_LZMA
-int __init z_erofs_lzma_init(void);
-void z_erofs_lzma_exit(void);
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev);
+void erofs_fileio_submit_bio(struct bio *bio);
#else
-static inline int z_erofs_lzma_init(void) { return 0; }
-static inline int z_erofs_lzma_exit(void) { return 0; }
-#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
-
-#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
-int __init z_erofs_deflate_init(void);
-void z_erofs_deflate_exit(void);
-#else
-static inline int z_erofs_deflate_init(void) { return 0; }
-static inline int z_erofs_deflate_exit(void) { return 0; }
-#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */
+static inline struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) { return NULL; }
+static inline void erofs_fileio_submit_bio(struct bio *bio) {}
+#endif
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
@@ -513,6 +484,8 @@ void erofs_fscache_unregister_fs(struct super_block *sb);
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
char *name, unsigned int flags);
void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
+struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev);
+void erofs_fscache_submit_bio(struct bio *bio);
#else
static inline int erofs_fscache_register_fs(struct super_block *sb)
{
@@ -530,6 +503,8 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache)
{
}
+static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { return NULL; }
+static inline void erofs_fscache_submit_bio(struct bio *bio) {}
#endif
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index f0110a78acb2..c94d0c1608a8 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -99,8 +99,8 @@ static void *erofs_find_target_block(struct erofs_buf *target,
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_dirent *de;
- buf.inode = dir;
- de = erofs_bread(&buf, mid, EROFS_KMAP);
+ buf.mapping = dir->i_mapping;
+ de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP);
if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff, bsz);
const int ndirents = nameoff / sizeof(*de);
@@ -171,7 +171,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
qn.name = name->name;
qn.end = name->name + name->len;
- buf.inode = dir;
+ buf.mapping = dir->i_mapping;
ndirents = 0;
de = erofs_find_target_block(&buf, dir, &qn, &ndirents);
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
deleted file mode 100644
index c7a4b1d77069..000000000000
--- a/fs/erofs/pcpubuf.c
+++ /dev/null
@@ -1,148 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) Gao Xiang <xiang@kernel.org>
- *
- * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
- * per-CPU virtual memory (in pages) in advance to store such inplace I/O
- * data if inplace decompression is failed (due to unmet inplace margin for
- * example).
- */
-#include "internal.h"
-
-struct erofs_pcpubuf {
- raw_spinlock_t lock;
- void *ptr;
- struct page **pages;
- unsigned int nrpages;
-};
-
-static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
-
-void *erofs_get_pcpubuf(unsigned int requiredpages)
- __acquires(pcb->lock)
-{
- struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
-
- raw_spin_lock(&pcb->lock);
- /* check if the per-CPU buffer is too small */
- if (requiredpages > pcb->nrpages) {
- raw_spin_unlock(&pcb->lock);
- put_cpu_var(erofs_pcb);
- /* (for sparse checker) pretend pcb->lock is still taken */
- __acquire(pcb->lock);
- return NULL;
- }
- return pcb->ptr;
-}
-
-void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
-{
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
-
- DBG_BUGON(pcb->ptr != ptr);
- raw_spin_unlock(&pcb->lock);
- put_cpu_var(erofs_pcb);
-}
-
-/* the next step: support per-CPU page buffers hotplug */
-int erofs_pcpubuf_growsize(unsigned int nrpages)
-{
- static DEFINE_MUTEX(pcb_resize_mutex);
- static unsigned int pcb_nrpages;
- struct page *pagepool = NULL;
- int delta, cpu, ret, i;
-
- mutex_lock(&pcb_resize_mutex);
- delta = nrpages - pcb_nrpages;
- ret = 0;
- /* avoid shrinking pcpubuf, since no idea how many fses rely on */
- if (delta <= 0)
- goto out;
-
- for_each_possible_cpu(cpu) {
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
- struct page **pages, **oldpages;
- void *ptr, *old_ptr;
-
- pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- break;
- }
-
- for (i = 0; i < nrpages; ++i) {
- pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
- if (!pages[i]) {
- ret = -ENOMEM;
- oldpages = pages;
- goto free_pagearray;
- }
- }
- ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
- if (!ptr) {
- ret = -ENOMEM;
- oldpages = pages;
- goto free_pagearray;
- }
- raw_spin_lock(&pcb->lock);
- old_ptr = pcb->ptr;
- pcb->ptr = ptr;
- oldpages = pcb->pages;
- pcb->pages = pages;
- i = pcb->nrpages;
- pcb->nrpages = nrpages;
- raw_spin_unlock(&pcb->lock);
-
- if (!oldpages) {
- DBG_BUGON(old_ptr);
- continue;
- }
-
- if (old_ptr)
- vunmap(old_ptr);
-free_pagearray:
- while (i)
- erofs_pagepool_add(&pagepool, oldpages[--i]);
- kfree(oldpages);
- if (ret)
- break;
- }
- pcb_nrpages = nrpages;
- erofs_release_pages(&pagepool);
-out:
- mutex_unlock(&pcb_resize_mutex);
- return ret;
-}
-
-void __init erofs_pcpubuf_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
-
- raw_spin_lock_init(&pcb->lock);
- }
-}
-
-void erofs_pcpubuf_exit(void)
-{
- int cpu, i;
-
- for_each_possible_cpu(cpu) {
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
-
- if (pcb->ptr) {
- vunmap(pcb->ptr);
- pcb->ptr = NULL;
- }
- if (!pcb->pages)
- continue;
-
- for (i = 0; i < pcb->nrpages; ++i)
- if (pcb->pages[i])
- put_page(pcb->pages[i]);
- kfree(pcb->pages);
- pcb->pages = NULL;
- }
-}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 5f60f163bd56..827b62665649 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -10,6 +10,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/exportfs.h>
+#include <linux/backing-dev.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -17,65 +18,42 @@
static struct kmem_cache *erofs_inode_cachep __read_mostly;
-void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
+void _erofs_printk(struct super_block *sb, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
+ int level;
va_start(args, fmt);
- vaf.fmt = fmt;
+ level = printk_get_level(fmt);
+ vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
-
if (sb)
- pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
+ printk("%c%cerofs (device %s): %pV",
+ KERN_SOH_ASCII, level, sb->s_id, &vaf);
else
- pr_err("%s: %pV", func, &vaf);
- va_end(args);
-}
-
-void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- if (sb)
- pr_info("(device %s): %pV", sb->s_id, &vaf);
- else
- pr_info("%pV", &vaf);
+ printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf);
va_end(args);
}
static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata)
{
- size_t len = 1 << EROFS_SB(sb)->blkszbits;
- struct erofs_super_block *dsb;
- u32 expected_crc, crc;
+ struct erofs_super_block *dsb = sbdata + EROFS_SUPER_OFFSET;
+ u32 len = 1 << EROFS_SB(sb)->blkszbits, crc;
if (len > EROFS_SUPER_OFFSET)
len -= EROFS_SUPER_OFFSET;
+ len -= offsetof(struct erofs_super_block, checksum) +
+ sizeof(dsb->checksum);
- dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, len, GFP_KERNEL);
- if (!dsb)
- return -ENOMEM;
-
- expected_crc = le32_to_cpu(dsb->checksum);
- dsb->checksum = 0;
- /* to allow for x86 boot sectors and other oddities. */
- crc = crc32c(~0, dsb, len);
- kfree(dsb);
-
- if (crc != expected_crc) {
- erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected",
- crc, expected_crc);
- return -EBADMSG;
- }
- return 0;
+ /* skip .magic(pre-verified) and .checksum(0) fields */
+ crc = crc32c(0x5045B54A, (&dsb->checksum) + 1, len);
+ if (crc == le32_to_cpu(dsb->checksum))
+ return 0;
+ erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected",
+ crc, le32_to_cpu(dsb->checksum));
+ return -EBADMSG;
}
static void erofs_inode_init_once(void *ptr)
@@ -108,22 +86,6 @@ static void erofs_free_inode(struct inode *inode)
kmem_cache_free(erofs_inode_cachep, vi);
}
-static bool check_layout_compatibility(struct super_block *sb,
- struct erofs_super_block *dsb)
-{
- const unsigned int feature = le32_to_cpu(dsb->feature_incompat);
-
- EROFS_SB(sb)->feature_incompat = feature;
-
- /* check if current kernel meets all mandatory requirements */
- if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) {
- erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
- feature & ~EROFS_ALL_FEATURE_INCOMPAT);
- return false;
- }
- return true;
-}
-
/* read variable-sized metadata, offset will be aligned by 4-byte */
void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp)
@@ -132,11 +94,11 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
int len, i, cnt;
*offset = round_up(*offset, 4);
- ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, EROFS_KMAP);
if (IS_ERR(ptr))
return ptr;
- len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]);
+ len = le16_to_cpu(*(__le16 *)ptr);
if (!len)
len = U16_MAX + 1;
buffer = kmalloc(len, GFP_KERNEL);
@@ -148,12 +110,12 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
for (i = 0; i < len; i += cnt) {
cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset),
len - i);
- ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, EROFS_KMAP);
if (IS_ERR(ptr)) {
kfree(buffer);
return ptr;
}
- memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt);
+ memcpy(buffer + i, ptr, cnt);
*offset += cnt;
}
return buffer;
@@ -177,13 +139,11 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
struct erofs_deviceslot *dis;
- struct bdev_handle *bdev_handle;
- void *ptr;
+ struct file *file;
- ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
- dis = ptr + erofs_blkoff(sb, *pos);
+ dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP);
+ if (IS_ERR(dis))
+ return PTR_ERR(dis);
if (!sbi->devs->flatdev && !dif->path) {
if (!dis->tag[0]) {
@@ -201,13 +161,21 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
return PTR_ERR(fscache);
dif->fscache = fscache;
} else if (!sbi->devs->flatdev) {
- bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ,
- sb->s_type, NULL);
- if (IS_ERR(bdev_handle))
- return PTR_ERR(bdev_handle);
- dif->bdev_handle = bdev_handle;
- dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev,
- &dif->dax_part_off, NULL, NULL);
+ file = erofs_is_fileio_mode(sbi) ?
+ filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
+ bdev_file_open_by_path(dif->path,
+ BLK_OPEN_READ, sb->s_type, NULL);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ if (!erofs_is_fileio_mode(sbi)) {
+ dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
+ &dif->dax_part_off, NULL, NULL);
+ } else if (!S_ISREG(file_inode(file)->i_mode)) {
+ fput(file);
+ return -EINVAL;
+ }
+ dif->file = file;
}
dif->blocks = le32_to_cpu(dis->blocks);
@@ -227,7 +195,7 @@ static int erofs_scan_devices(struct super_block *sb,
struct erofs_device_info *dif;
int id, err = 0;
- sbi->total_blocks = sbi->primarydevice_blocks;
+ sbi->total_blocks = sbi->dif0.blocks;
if (!erofs_sb_has_device_table(sbi))
ondisk_extradevs = 0;
else
@@ -281,7 +249,7 @@ static int erofs_scan_devices(struct super_block *sb,
static int erofs_read_superblock(struct super_block *sb)
{
- struct erofs_sb_info *sbi;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_super_block *dsb;
void *data;
@@ -293,9 +261,7 @@ static int erofs_read_superblock(struct super_block *sb)
return PTR_ERR(data);
}
- sbi = EROFS_SB(sb);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
-
ret = -EINVAL;
if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) {
erofs_err(sb, "cannot find valid erofs superblock");
@@ -320,8 +286,12 @@ static int erofs_read_superblock(struct super_block *sb)
}
ret = -EINVAL;
- if (!check_layout_compatibility(sb, dsb))
+ sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat);
+ if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) {
+ erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
+ sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT);
goto out;
+ }
sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE;
if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) {
@@ -329,7 +299,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->sb_size);
goto out;
}
- sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
+ sbi->dif0.blocks = le32_to_cpu(dsb->blocks);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -345,7 +315,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->build_time = le64_to_cpu(dsb->build_time);
sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
- memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid));
+ super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
ret = strscpy(sbi->volume_name, dsb->volume_name,
sizeof(dsb->volume_name));
@@ -364,36 +334,30 @@ static int erofs_read_superblock(struct super_block *sb)
ret = erofs_scan_devices(sb, dsb);
if (erofs_is_fscache_mode(sb))
- erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
+ erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!");
out:
erofs_put_metabuf(&buf);
return ret;
}
-static void erofs_default_options(struct erofs_fs_context *ctx)
+static void erofs_default_options(struct erofs_sb_info *sbi)
{
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
- ctx->opt.max_sync_decompress_pages = 3;
- ctx->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO;
+ sbi->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+ sbi->opt.max_sync_decompress_pages = 3;
+ sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO;
#endif
#ifdef CONFIG_EROFS_FS_XATTR
- set_opt(&ctx->opt, XATTR_USER);
+ set_opt(&sbi->opt, XATTR_USER);
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- set_opt(&ctx->opt, POSIX_ACL);
+ set_opt(&sbi->opt, POSIX_ACL);
#endif
}
enum {
- Opt_user_xattr,
- Opt_acl,
- Opt_cache_strategy,
- Opt_dax,
- Opt_dax_enum,
- Opt_device,
- Opt_fsid,
- Opt_domain_id,
+ Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum,
+ Opt_device, Opt_fsid, Opt_domain_id, Opt_directio,
Opt_err
};
@@ -420,23 +384,23 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_string("device", Opt_device),
fsparam_string("fsid", Opt_fsid),
fsparam_string("domain_id", Opt_domain_id),
+ fsparam_flag_no("directio", Opt_directio),
{}
};
static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
{
#ifdef CONFIG_FS_DAX
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
switch (mode) {
case EROFS_MOUNT_DAX_ALWAYS:
- warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- set_opt(&ctx->opt, DAX_ALWAYS);
- clear_opt(&ctx->opt, DAX_NEVER);
+ set_opt(&sbi->opt, DAX_ALWAYS);
+ clear_opt(&sbi->opt, DAX_NEVER);
return true;
case EROFS_MOUNT_DAX_NEVER:
- set_opt(&ctx->opt, DAX_NEVER);
- clear_opt(&ctx->opt, DAX_ALWAYS);
+ set_opt(&sbi->opt, DAX_NEVER);
+ clear_opt(&sbi->opt, DAX_ALWAYS);
return true;
default:
DBG_BUGON(1);
@@ -451,7 +415,7 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
static int erofs_fc_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
struct fs_parse_result result;
struct erofs_device_info *dif;
int opt, ret;
@@ -464,9 +428,9 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_user_xattr:
#ifdef CONFIG_EROFS_FS_XATTR
if (result.boolean)
- set_opt(&ctx->opt, XATTR_USER);
+ set_opt(&sbi->opt, XATTR_USER);
else
- clear_opt(&ctx->opt, XATTR_USER);
+ clear_opt(&sbi->opt, XATTR_USER);
#else
errorfc(fc, "{,no}user_xattr options not supported");
#endif
@@ -474,16 +438,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_acl:
#ifdef CONFIG_EROFS_FS_POSIX_ACL
if (result.boolean)
- set_opt(&ctx->opt, POSIX_ACL);
+ set_opt(&sbi->opt, POSIX_ACL);
else
- clear_opt(&ctx->opt, POSIX_ACL);
+ clear_opt(&sbi->opt, POSIX_ACL);
#else
errorfc(fc, "{,no}acl options not supported");
#endif
break;
case Opt_cache_strategy:
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->opt.cache_strategy = result.uint_32;
+ sbi->opt.cache_strategy = result.uint_32;
#else
errorfc(fc, "compression not supported, cache_strategy ignored");
#endif
@@ -505,27 +469,27 @@ static int erofs_fc_parse_param(struct fs_context *fc,
kfree(dif);
return -ENOMEM;
}
- down_write(&ctx->devs->rwsem);
- ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
- up_write(&ctx->devs->rwsem);
+ down_write(&sbi->devs->rwsem);
+ ret = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL);
+ up_write(&sbi->devs->rwsem);
if (ret < 0) {
kfree(dif->path);
kfree(dif);
return ret;
}
- ++ctx->devs->extra_devices;
+ ++sbi->devs->extra_devices;
break;
#ifdef CONFIG_EROFS_FS_ONDEMAND
case Opt_fsid:
- kfree(ctx->fsid);
- ctx->fsid = kstrdup(param->string, GFP_KERNEL);
- if (!ctx->fsid)
+ kfree(sbi->fsid);
+ sbi->fsid = kstrdup(param->string, GFP_KERNEL);
+ if (!sbi->fsid)
return -ENOMEM;
break;
case Opt_domain_id:
- kfree(ctx->domain_id);
- ctx->domain_id = kstrdup(param->string, GFP_KERNEL);
- if (!ctx->domain_id)
+ kfree(sbi->domain_id);
+ sbi->domain_id = kstrdup(param->string, GFP_KERNEL);
+ if (!sbi->domain_id)
return -ENOMEM;
break;
#else
@@ -534,8 +498,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
break;
#endif
- default:
- return -ENOPARAM;
+ case Opt_directio:
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (result.boolean)
+ set_opt(&sbi->opt, DIRECT_IO);
+ else
+ clear_opt(&sbi->opt, DIRECT_IO);
+#else
+ errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
+#endif
+ break;
}
return 0;
}
@@ -579,18 +551,26 @@ static const struct export_operations erofs_export_ops = {
.get_parent = erofs_get_parent,
};
-static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc)
+static void erofs_set_sysfs_name(struct super_block *sb)
{
- static const struct tree_descr empty_descr = {""};
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
- return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr);
+ if (sbi->domain_id)
+ super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id,
+ sbi->fsid);
+ else if (sbi->fsid)
+ super_set_sysfs_name_generic(sb, "%s", sbi->fsid);
+ else if (erofs_is_fileio_mode(sbi))
+ super_set_sysfs_name_generic(sb, "%s",
+ bdi_dev_name(sb->s_bdi));
+ else
+ super_set_sysfs_name_id(sb);
}
static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
- struct erofs_sb_info *sbi;
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
int err;
sb->s_magic = EROFS_SUPER_MAGIC;
@@ -598,28 +578,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &erofs_sops;
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
- sb->s_fs_info = sbi;
- sbi->opt = ctx->opt;
- sbi->devs = ctx->devs;
- ctx->devs = NULL;
- sbi->fsid = ctx->fsid;
- ctx->fsid = NULL;
- sbi->domain_id = ctx->domain_id;
- ctx->domain_id = NULL;
-
sbi->blkszbits = PAGE_SHIFT;
- if (erofs_is_fscache_mode(sb)) {
+ if (!sb->s_bdev) {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
- err = erofs_fscache_register_fs(sb);
- if (err)
- return err;
-
+ if (erofs_is_fscache_mode(sb)) {
+ err = erofs_fscache_register_fs(sb);
+ if (err)
+ return err;
+ }
err = super_setup_bdi(sb);
if (err)
return err;
@@ -629,9 +597,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -EINVAL;
}
- sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
- &sbi->dax_part_off,
- NULL, NULL);
+ sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
+ &sbi->dif0.dax_part_off, NULL, NULL);
}
err = erofs_read_superblock(sb);
@@ -643,14 +610,18 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
errorfc(fc, "unsupported blksize for fscache mode");
return -EINVAL;
}
- if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
+
+ if (erofs_is_fileio_mode(sbi)) {
+ sb->s_blocksize = 1 << sbi->blkszbits;
+ sb->s_blocksize_bits = sbi->blkszbits;
+ } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
errorfc(fc, "failed to set erofs blksize");
return -EINVAL;
}
}
if (test_opt(&sbi->opt, DAX_ALWAYS)) {
- if (!sbi->dax_dev) {
+ if (!sbi->dif0.dax_dev) {
errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
clear_opt(&sbi->opt, DAX_ALWAYS);
} else if (sbi->blkszbits != PAGE_SHIFT) {
@@ -704,6 +675,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
+ erofs_set_sysfs_name(sb);
err = erofs_register_sysfs(sb);
if (err)
return err;
@@ -712,38 +684,53 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return 0;
}
-static int erofs_fc_anon_get_tree(struct fs_context *fc)
-{
- return get_tree_nodev(fc, erofs_fc_fill_pseudo_super);
-}
-
static int erofs_fc_get_tree(struct fs_context *fc)
{
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
+ int ret;
- if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid)
+ if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
return get_tree_nodev(fc, erofs_fc_fill_super);
- return get_tree_bdev(fc, erofs_fc_fill_super);
+ ret = get_tree_bdev_flags(fc, erofs_fc_fill_super,
+ IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ?
+ GET_TREE_BDEV_QUIET_LOOKUP : 0);
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (ret == -ENOTBLK) {
+ struct file *file;
+
+ if (!fc->source)
+ return invalf(fc, "No source specified");
+ file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ sbi->dif0.file = file;
+
+ if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) &&
+ sbi->dif0.file->f_mapping->a_ops->read_folio)
+ return get_tree_nodev(fc, erofs_fc_fill_super);
+ }
+#endif
+ return ret;
}
static int erofs_fc_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *new_sbi = fc->s_fs_info;
DBG_BUGON(!sb_rdonly(sb));
- if (ctx->fsid || ctx->domain_id)
+ if (new_sbi->fsid || new_sbi->domain_id)
erofs_info(sb, "ignoring reconfiguration for fsid|domain_id.");
- if (test_opt(&ctx->opt, POSIX_ACL))
+ if (test_opt(&new_sbi->opt, POSIX_ACL))
fc->sb_flags |= SB_POSIXACL;
else
fc->sb_flags &= ~SB_POSIXACL;
- sbi->opt = ctx->opt;
+ sbi->opt = new_sbi->opt;
fc->sb_flags |= SB_RDONLY;
return 0;
@@ -754,8 +741,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
struct erofs_device_info *dif = ptr;
fs_put_dax(dif->dax_dev, NULL);
- if (dif->bdev_handle)
- bdev_release(dif->bdev_handle);
+ if (dif->file)
+ fput(dif->file);
erofs_fscache_unregister_cookie(dif->fscache);
dif->fscache = NULL;
kfree(dif->path);
@@ -772,14 +759,22 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs)
kfree(devs);
}
+static void erofs_sb_free(struct erofs_sb_info *sbi)
+{
+ erofs_free_dev_context(sbi->devs);
+ kfree(sbi->fsid);
+ kfree(sbi->domain_id);
+ if (sbi->dif0.file)
+ fput(sbi->dif0.file);
+ kfree(sbi);
+}
+
static void erofs_fc_free(struct fs_context *fc)
{
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
- erofs_free_dev_context(ctx->devs);
- kfree(ctx->fsid);
- kfree(ctx->domain_id);
- kfree(ctx);
+ if (sbi) /* free here if an error occurs before transferring to sb */
+ erofs_sb_free(sbi);
}
static const struct fs_context_operations erofs_context_ops = {
@@ -789,62 +784,40 @@ static const struct fs_context_operations erofs_context_ops = {
.free = erofs_fc_free,
};
-static const struct fs_context_operations erofs_anon_context_ops = {
- .get_tree = erofs_fc_anon_get_tree,
-};
-
static int erofs_init_fs_context(struct fs_context *fc)
{
- struct erofs_fs_context *ctx;
-
- /* pseudo mount for anon inodes */
- if (fc->sb_flags & SB_KERNMOUNT) {
- fc->ops = &erofs_anon_context_ops;
- return 0;
- }
+ struct erofs_sb_info *sbi;
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi)
return -ENOMEM;
- ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
- if (!ctx->devs) {
- kfree(ctx);
+
+ sbi->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
+ if (!sbi->devs) {
+ kfree(sbi);
return -ENOMEM;
}
- fc->fs_private = ctx;
+ fc->s_fs_info = sbi;
- idr_init(&ctx->devs->tree);
- init_rwsem(&ctx->devs->rwsem);
- erofs_default_options(ctx);
+ idr_init(&sbi->devs->tree);
+ init_rwsem(&sbi->devs->rwsem);
+ erofs_default_options(sbi);
fc->ops = &erofs_context_ops;
return 0;
}
static void erofs_kill_sb(struct super_block *sb)
{
- struct erofs_sb_info *sbi;
-
- /* pseudo mount for anon inodes */
- if (sb->s_flags & SB_KERNMOUNT) {
- kill_anon_super(sb);
- return;
- }
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
- if (erofs_is_fscache_mode(sb))
+ if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) ||
+ sbi->dif0.file)
kill_anon_super(sb);
else
kill_block_super(sb);
-
- sbi = EROFS_SB(sb);
- if (!sbi)
- return;
-
- erofs_free_dev_context(sbi->devs);
- fs_put_dax(sbi->dax_dev, NULL);
+ fs_put_dax(sbi->dif0.dax_dev, NULL);
erofs_fscache_unregister_fs(sb);
- kfree(sbi->fsid);
- kfree(sbi->domain_id);
- kfree(sbi);
+ erofs_sb_free(sbi);
sb->s_fs_info = NULL;
}
@@ -868,7 +841,7 @@ static void erofs_put_super(struct super_block *sb)
erofs_fscache_unregister_fs(sb);
}
-struct file_system_type erofs_fs_type = {
+static struct file_system_type erofs_fs_type = {
.owner = THIS_MODULE,
.name = "erofs",
.init_fs_context = erofs_init_fs_context,
@@ -885,7 +858,7 @@ static int __init erofs_module_init(void)
erofs_inode_cachep = kmem_cache_create("erofs_inode",
sizeof(struct erofs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
erofs_inode_init_once);
if (!erofs_inode_cachep)
return -ENOMEM;
@@ -894,16 +867,7 @@ static int __init erofs_module_init(void)
if (err)
goto shrinker_err;
- err = z_erofs_lzma_init();
- if (err)
- goto lzma_err;
-
- err = z_erofs_deflate_init();
- if (err)
- goto deflate_err;
-
- erofs_pcpubuf_init();
- err = z_erofs_init_zip_subsystem();
+ err = z_erofs_init_subsystem();
if (err)
goto zip_err;
@@ -920,12 +884,8 @@ static int __init erofs_module_init(void)
fs_err:
erofs_exit_sysfs();
sysfs_err:
- z_erofs_exit_zip_subsystem();
+ z_erofs_exit_subsystem();
zip_err:
- z_erofs_deflate_exit();
-deflate_err:
- z_erofs_lzma_exit();
-lzma_err:
erofs_exit_shrinker();
shrinker_err:
kmem_cache_destroy(erofs_inode_cachep);
@@ -940,34 +900,29 @@ static void __exit erofs_module_exit(void)
rcu_barrier();
erofs_exit_sysfs();
- z_erofs_exit_zip_subsystem();
- z_erofs_deflate_exit();
- z_erofs_lzma_exit();
+ z_erofs_exit_subsystem();
erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
- erofs_pcpubuf_exit();
}
static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- u64 id = 0;
-
- if (!erofs_is_fscache_mode(sb))
- id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = sb->s_magic;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = sbi->total_blocks;
buf->f_bfree = buf->f_bavail = 0;
-
buf->f_files = ULLONG_MAX;
buf->f_ffree = ULLONG_MAX - sbi->inos;
-
buf->f_namelen = EROFS_NAME_LEN;
- buf->f_fsid = u64_to_fsid(id);
+ if (uuid_is_null(&sb->s_uuid))
+ buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 :
+ huge_encode_dev(sb->s_bdev->bd_dev));
+ else
+ buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
return 0;
}
@@ -976,30 +931,20 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
struct erofs_mount_opts *opt = &sbi->opt;
-#ifdef CONFIG_EROFS_FS_XATTR
- if (test_opt(opt, XATTR_USER))
- seq_puts(seq, ",user_xattr");
- else
- seq_puts(seq, ",nouser_xattr");
-#endif
-#ifdef CONFIG_EROFS_FS_POSIX_ACL
- if (test_opt(opt, POSIX_ACL))
- seq_puts(seq, ",acl");
- else
- seq_puts(seq, ",noacl");
-#endif
-#ifdef CONFIG_EROFS_FS_ZIP
- if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
- seq_puts(seq, ",cache_strategy=disabled");
- else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
- seq_puts(seq, ",cache_strategy=readahead");
- else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
- seq_puts(seq, ",cache_strategy=readaround");
-#endif
+ if (IS_ENABLED(CONFIG_EROFS_FS_XATTR))
+ seq_puts(seq, test_opt(opt, XATTR_USER) ?
+ ",user_xattr" : ",nouser_xattr");
+ if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL))
+ seq_puts(seq, test_opt(opt, POSIX_ACL) ? ",acl" : ",noacl");
+ if (IS_ENABLED(CONFIG_EROFS_FS_ZIP))
+ seq_printf(seq, ",cache_strategy=%s",
+ erofs_param_cache_strategy[opt->cache_strategy].name);
if (test_opt(opt, DAX_ALWAYS))
seq_puts(seq, ",dax=always");
if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
+ if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO))
+ seq_puts(seq, ",directio");
#ifdef CONFIG_EROFS_FS_ONDEMAND
if (sbi->fsid)
seq_printf(seq, ",fsid=%s", sbi->fsid);
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index 435e515c0792..19d586273b70 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -10,6 +10,7 @@
enum {
attr_feature,
+ attr_drop_caches,
attr_pointer_ui,
attr_pointer_bool,
};
@@ -57,11 +58,13 @@ static struct erofs_attr erofs_attr_##_name = { \
#ifdef CONFIG_EROFS_FS_ZIP
EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
+EROFS_ATTR_FUNC(drop_caches, 0200);
#endif
static struct attribute *erofs_attrs[] = {
#ifdef CONFIG_EROFS_FS_ZIP
ATTR_LIST(sync_decompress),
+ ATTR_LIST(drop_caches),
#endif
NULL,
};
@@ -163,6 +166,20 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
return -EINVAL;
*(bool *)ptr = !!t;
return len;
+#ifdef CONFIG_EROFS_FS_ZIP
+ case attr_drop_caches:
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t < 1 || t > 3)
+ return -EINVAL;
+
+ if (t & 2)
+ z_erofs_shrink_scan(sbi, ~0UL);
+ if (t & 1)
+ invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1);
+ return len;
+#endif
}
return 0;
}
@@ -205,34 +222,16 @@ static struct kobject erofs_feat = {
int erofs_register_sysfs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- char *name;
- char *str = NULL;
int err;
- if (erofs_is_fscache_mode(sb)) {
- if (sbi->domain_id) {
- str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id,
- sbi->fsid);
- if (!str)
- return -ENOMEM;
- name = str;
- } else {
- name = sbi->fsid;
- }
- } else {
- name = sb->s_id;
- }
sbi->s_kobj.kset = &erofs_root;
init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name);
- kfree(str);
- if (err)
- goto put_sb_kobj;
- return 0;
-
-put_sb_kobj:
- kobject_put(&sbi->s_kobj);
- wait_for_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s",
+ sb->s_sysfs_name);
+ if (err) {
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ }
return err;
}
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
deleted file mode 100644
index e146d09151af..000000000000
--- a/fs/erofs/utils.c
+++ /dev/null
@@ -1,287 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2018 HUAWEI, Inc.
- * https://www.huawei.com/
- */
-#include "internal.h"
-
-struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
-{
- struct page *page = *pagepool;
-
- if (page) {
- DBG_BUGON(page_ref_count(page) != 1);
- *pagepool = (struct page *)page_private(page);
- } else {
- page = alloc_page(gfp);
- }
- return page;
-}
-
-void erofs_release_pages(struct page **pagepool)
-{
- while (*pagepool) {
- struct page *page = *pagepool;
-
- *pagepool = (struct page *)page_private(page);
- put_page(page);
- }
-}
-
-#ifdef CONFIG_EROFS_FS_ZIP
-/* global shrink count (for all mounted EROFS instances) */
-static atomic_long_t erofs_global_shrink_cnt;
-
-static bool erofs_workgroup_get(struct erofs_workgroup *grp)
-{
- if (lockref_get_not_zero(&grp->lockref))
- return true;
-
- spin_lock(&grp->lockref.lock);
- if (__lockref_is_dead(&grp->lockref)) {
- spin_unlock(&grp->lockref.lock);
- return false;
- }
-
- if (!grp->lockref.count++)
- atomic_long_dec(&erofs_global_shrink_cnt);
- spin_unlock(&grp->lockref.lock);
- return true;
-}
-
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index)
-{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_workgroup *grp;
-
-repeat:
- rcu_read_lock();
- grp = xa_load(&sbi->managed_pslots, index);
- if (grp) {
- if (!erofs_workgroup_get(grp)) {
- /* prefer to relax rcu read side */
- rcu_read_unlock();
- goto repeat;
- }
-
- DBG_BUGON(index != grp->index);
- }
- rcu_read_unlock();
- return grp;
-}
-
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- struct erofs_workgroup *pre;
-
- DBG_BUGON(grp->lockref.count < 1);
-repeat:
- xa_lock(&sbi->managed_pslots);
- pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
- NULL, grp, GFP_KERNEL);
- if (pre) {
- if (xa_is_err(pre)) {
- pre = ERR_PTR(xa_err(pre));
- } else if (!erofs_workgroup_get(pre)) {
- /* try to legitimize the current in-tree one */
- xa_unlock(&sbi->managed_pslots);
- cond_resched();
- goto repeat;
- }
- grp = pre;
- }
- xa_unlock(&sbi->managed_pslots);
- return grp;
-}
-
-static void __erofs_workgroup_free(struct erofs_workgroup *grp)
-{
- atomic_long_dec(&erofs_global_shrink_cnt);
- erofs_workgroup_free_rcu(grp);
-}
-
-void erofs_workgroup_put(struct erofs_workgroup *grp)
-{
- if (lockref_put_or_lock(&grp->lockref))
- return;
-
- DBG_BUGON(__lockref_is_dead(&grp->lockref));
- if (grp->lockref.count == 1)
- atomic_long_inc(&erofs_global_shrink_cnt);
- --grp->lockref.count;
- spin_unlock(&grp->lockref.lock);
-}
-
-static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
-{
- int free = false;
-
- spin_lock(&grp->lockref.lock);
- if (grp->lockref.count)
- goto out;
-
- /*
- * Note that all cached pages should be detached before deleted from
- * the XArray. Otherwise some cached pages could be still attached to
- * the orphan old workgroup when the new one is available in the tree.
- */
- if (erofs_try_to_free_all_cached_pages(sbi, grp))
- goto out;
-
- /*
- * It's impossible to fail after the workgroup is freezed,
- * however in order to avoid some race conditions, add a
- * DBG_BUGON to observe this in advance.
- */
- DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
-
- lockref_mark_dead(&grp->lockref);
- free = true;
-out:
- spin_unlock(&grp->lockref.lock);
- if (free)
- __erofs_workgroup_free(grp);
- return free;
-}
-
-static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
- unsigned long nr_shrink)
-{
- struct erofs_workgroup *grp;
- unsigned int freed = 0;
- unsigned long index;
-
- xa_lock(&sbi->managed_pslots);
- xa_for_each(&sbi->managed_pslots, index, grp) {
- /* try to shrink each valid workgroup */
- if (!erofs_try_to_release_workgroup(sbi, grp))
- continue;
- xa_unlock(&sbi->managed_pslots);
-
- ++freed;
- if (!--nr_shrink)
- return freed;
- xa_lock(&sbi->managed_pslots);
- }
- xa_unlock(&sbi->managed_pslots);
- return freed;
-}
-
-/* protected by 'erofs_sb_list_lock' */
-static unsigned int shrinker_run_no;
-
-/* protects the mounted 'erofs_sb_list' */
-static DEFINE_SPINLOCK(erofs_sb_list_lock);
-static LIST_HEAD(erofs_sb_list);
-
-void erofs_shrinker_register(struct super_block *sb)
-{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
-
- mutex_init(&sbi->umount_mutex);
-
- spin_lock(&erofs_sb_list_lock);
- list_add(&sbi->list, &erofs_sb_list);
- spin_unlock(&erofs_sb_list_lock);
-}
-
-void erofs_shrinker_unregister(struct super_block *sb)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
-
- mutex_lock(&sbi->umount_mutex);
- /* clean up all remaining workgroups in memory */
- erofs_shrink_workstation(sbi, ~0UL);
-
- spin_lock(&erofs_sb_list_lock);
- list_del(&sbi->list);
- spin_unlock(&erofs_sb_list_lock);
- mutex_unlock(&sbi->umount_mutex);
-}
-
-static unsigned long erofs_shrink_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- return atomic_long_read(&erofs_global_shrink_cnt);
-}
-
-static unsigned long erofs_shrink_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct erofs_sb_info *sbi;
- struct list_head *p;
-
- unsigned long nr = sc->nr_to_scan;
- unsigned int run_no;
- unsigned long freed = 0;
-
- spin_lock(&erofs_sb_list_lock);
- do {
- run_no = ++shrinker_run_no;
- } while (run_no == 0);
-
- /* Iterate over all mounted superblocks and try to shrink them */
- p = erofs_sb_list.next;
- while (p != &erofs_sb_list) {
- sbi = list_entry(p, struct erofs_sb_info, list);
-
- /*
- * We move the ones we do to the end of the list, so we stop
- * when we see one we have already done.
- */
- if (sbi->shrinker_run_no == run_no)
- break;
-
- if (!mutex_trylock(&sbi->umount_mutex)) {
- p = p->next;
- continue;
- }
-
- spin_unlock(&erofs_sb_list_lock);
- sbi->shrinker_run_no = run_no;
-
- freed += erofs_shrink_workstation(sbi, nr - freed);
-
- spin_lock(&erofs_sb_list_lock);
- /* Get the next list element before we move this one */
- p = p->next;
-
- /*
- * Move this one to the end of the list to provide some
- * fairness.
- */
- list_move_tail(&sbi->list, &erofs_sb_list);
- mutex_unlock(&sbi->umount_mutex);
-
- if (freed >= nr)
- break;
- }
- spin_unlock(&erofs_sb_list_lock);
- return freed;
-}
-
-static struct shrinker *erofs_shrinker_info;
-
-int __init erofs_init_shrinker(void)
-{
- erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker");
- if (!erofs_shrinker_info)
- return -ENOMEM;
-
- erofs_shrinker_info->count_objects = erofs_shrink_count;
- erofs_shrinker_info->scan_objects = erofs_shrink_scan;
-
- shrinker_register(erofs_shrinker_info);
-
- return 0;
-}
-
-void erofs_exit_shrinker(void)
-{
- shrinker_free(erofs_shrinker_info);
-}
-#endif /* !CONFIG_EROFS_FS_ZIP */
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index b58316b49a43..df2777e05661 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -81,13 +81,13 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos = erofs_iloc(inode) + vi->inode_isize;
/* read in shared xattr array (non-atomic, see kmalloc below) */
- it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
if (IS_ERR(it.kaddr)) {
ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- ih = it.kaddr + erofs_blkoff(sb, it.pos);
+ ih = it.kaddr;
vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter);
vi->xattr_shared_count = ih->h_shared_count;
vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
@@ -102,16 +102,14 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos += sizeof(struct erofs_xattr_ibody_header);
for (i = 0; i < vi->xattr_shared_count; ++i) {
- it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos),
- EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
if (IS_ERR(it.kaddr)) {
kfree(vi->xattr_shared_xattrs);
vi->xattr_shared_xattrs = NULL;
ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)
- (it.kaddr + erofs_blkoff(sb, it.pos)));
+ vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)it.kaddr);
it.pos += sizeof(__le32);
}
erofs_put_metabuf(&it.buf);
@@ -185,12 +183,11 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it,
void *src;
for (processed = 0; processed < len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
- src = it->kaddr + erofs_blkoff(sb, it->pos);
+ src = it->kaddr;
slice = min_t(unsigned int, sb->s_blocksize -
erofs_blkoff(sb, it->pos), len - processed);
memcpy(it->buffer + it->buffer_ofs, src, slice);
@@ -208,8 +205,7 @@ static int erofs_listxattr_foreach(struct erofs_xattr_iter *it)
int err;
/* 1. handle xattr entry */
- entry = *(struct erofs_xattr_entry *)
- (it->kaddr + erofs_blkoff(it->sb, it->pos));
+ entry = *(struct erofs_xattr_entry *)it->kaddr;
it->pos += sizeof(struct erofs_xattr_entry);
base_index = entry.e_name_index;
@@ -259,8 +255,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
unsigned int slice, processed, value_sz;
/* 1. handle xattr entry */
- entry = *(struct erofs_xattr_entry *)
- (it->kaddr + erofs_blkoff(sb, it->pos));
+ entry = *(struct erofs_xattr_entry *)it->kaddr;
it->pos += sizeof(struct erofs_xattr_entry);
value_sz = le16_to_cpu(entry.e_value_size);
@@ -291,8 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
/* 2. handle xattr name */
for (processed = 0; processed < entry.e_name_len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -300,7 +294,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
sb->s_blocksize - erofs_blkoff(sb, it->pos),
entry.e_name_len - processed);
if (memcmp(it->name.name + it->infix_len + processed,
- it->kaddr + erofs_blkoff(sb, it->pos), slice))
+ it->kaddr, slice))
return -ENOATTR;
it->pos += slice;
}
@@ -336,13 +330,11 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
while (remaining) {
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
- entry_sz = erofs_xattr_entry_size(it->kaddr +
- erofs_blkoff(it->sb, it->pos));
+ entry_sz = erofs_xattr_entry_size(it->kaddr);
/* xattr on-disk corruption: xattr entry beyond xattr_isize */
if (remaining < entry_sz) {
DBG_BUGON(1);
@@ -375,8 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
for (i = 0; i < vi->xattr_shared_count; ++i) {
it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
vi->xattr_shared_xattrs[i] * sizeof(__le32);
- it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos),
- EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -416,7 +407,7 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
}
it.index = index;
- it.name = (struct qstr)QSTR_INIT(name, strlen(name));
+ it.name = QSTR(name);
if (it.name.len > EROFS_NAME_LEN)
return -ERANGE;
@@ -487,12 +478,12 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
if (!sbi->xattr_prefix_count)
return 0;
- pfs = kzalloc(sbi->xattr_prefix_count * sizeof(*pfs), GFP_KERNEL);
+ pfs = kcalloc(sbi->xattr_prefix_count, sizeof(*pfs), GFP_KERNEL);
if (!pfs)
return -ENOMEM;
if (sbi->packed_inode)
- buf.inode = sbi->packed_inode;
+ buf.mapping = sbi->packed_inode->i_mapping;
else
erofs_init_metabuf(&buf, sb);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index ff0aa72b0db3..d771e06db738 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -12,12 +12,6 @@
#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
#define Z_EROFS_INLINE_BVECS 2
-/*
- * let's leave a type here in case of introducing
- * another tagged pointer later.
- */
-typedef void *z_erofs_next_pcluster_t;
-
struct z_erofs_bvec {
struct page *page;
int offset;
@@ -44,11 +38,14 @@ __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
* A: Field should be accessed / updated in atomic for parallelized code.
*/
struct z_erofs_pcluster {
- struct erofs_workgroup obj;
struct mutex lock;
+ struct lockref lockref;
/* A: point to next chained pcluster or TAILs */
- z_erofs_next_pcluster_t next;
+ struct z_erofs_pcluster *next;
+
+ /* I: start block address of this pcluster */
+ erofs_off_t index;
/* L: the maximum decompression size of this round */
unsigned int length;
@@ -91,12 +88,11 @@ struct z_erofs_pcluster {
/* the end of a chain of pclusters */
#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA)
-#define Z_EROFS_PCLUSTER_NIL (NULL)
struct z_erofs_decompressqueue {
struct super_block *sb;
+ struct z_erofs_pcluster *head;
atomic_t pending_bios;
- z_erofs_next_pcluster_t head;
union {
struct completion done;
@@ -108,7 +104,7 @@ struct z_erofs_decompressqueue {
static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
- return !pcl->obj.index;
+ return !pcl->index;
}
static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
@@ -116,47 +112,9 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
}
-/*
- * bit 30: I/O error occurred on this page
- * bit 0 - 29: remaining parts to complete this page
- */
-#define Z_EROFS_PAGE_EIO (1 << 30)
-
-static inline void z_erofs_onlinepage_init(struct page *page)
-{
- union {
- atomic_t o;
- unsigned long v;
- } u = { .o = ATOMIC_INIT(1) };
-
- set_page_private(page, u.v);
- smp_wmb();
- SetPagePrivate(page);
-}
-
-static inline void z_erofs_onlinepage_split(struct page *page)
+static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
{
- atomic_inc((atomic_t *)&page->private);
-}
-
-static void z_erofs_onlinepage_endio(struct page *page, int err)
-{
- int orig, v;
-
- DBG_BUGON(!PagePrivate(page));
-
- do {
- orig = atomic_read((atomic_t *)&page->private);
- v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0);
- } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);
-
- if (!(v & ~Z_EROFS_PAGE_EIO)) {
- set_page_private(page, 0);
- ClearPagePrivate(page);
- if (!(v & Z_EROFS_PAGE_EIO))
- SetPageUptodate(page);
- unlock_page(page);
- }
+ return fo->mapping == MNGD_MAPPING(sbi);
}
#define Z_EROFS_ONSTACK_PAGES 32
@@ -233,7 +191,8 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
struct page *nextpage = *candidate_bvpage;
if (!nextpage) {
- nextpage = erofs_allocpage(pagepool, GFP_KERNEL);
+ nextpage = __erofs_allocpage(pagepool, GFP_KERNEL,
+ true);
if (!nextpage)
return -ENOMEM;
set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
@@ -354,7 +313,7 @@ static void erofs_destroy_percpu_workers(void)
static struct kthread_worker *erofs_init_percpu_worker(int cpu)
{
struct kthread_worker *worker =
- kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);
+ kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u");
if (IS_ERR(worker))
return worker;
@@ -447,81 +406,81 @@ static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
#endif
-void z_erofs_exit_zip_subsystem(void)
+void z_erofs_exit_subsystem(void)
{
erofs_cpu_hotplug_destroy();
erofs_destroy_percpu_workers();
destroy_workqueue(z_erofs_workqueue);
z_erofs_destroy_pcluster_pool();
+ z_erofs_exit_decompressor();
}
-int __init z_erofs_init_zip_subsystem(void)
+int __init z_erofs_init_subsystem(void)
{
- int err = z_erofs_create_pcluster_pool();
+ int err = z_erofs_init_decompressor();
+
+ if (err)
+ goto err_decompressor;
+ err = z_erofs_create_pcluster_pool();
if (err)
- goto out_error_pcluster_pool;
+ goto err_pcluster_pool;
z_erofs_workqueue = alloc_workqueue("erofs_worker",
WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
if (!z_erofs_workqueue) {
err = -ENOMEM;
- goto out_error_workqueue_init;
+ goto err_workqueue_init;
}
err = erofs_init_percpu_workers();
if (err)
- goto out_error_pcpu_worker;
+ goto err_pcpu_worker;
err = erofs_cpu_hotplug_init();
if (err < 0)
- goto out_error_cpuhp_init;
+ goto err_cpuhp_init;
return err;
-out_error_cpuhp_init:
+err_cpuhp_init:
erofs_destroy_percpu_workers();
-out_error_pcpu_worker:
+err_pcpu_worker:
destroy_workqueue(z_erofs_workqueue);
-out_error_workqueue_init:
+err_workqueue_init:
z_erofs_destroy_pcluster_pool();
-out_error_pcluster_pool:
+err_pcluster_pool:
+ z_erofs_exit_decompressor();
+err_decompressor:
return err;
}
enum z_erofs_pclustermode {
+ /* It has previously been linked into another processing chain */
Z_EROFS_PCLUSTER_INFLIGHT,
/*
- * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
- * could be dispatched into bypass queue later due to uptodated managed
- * pages. All related online pages cannot be reused for inplace I/O (or
- * bvpage) since it can be directly decoded without I/O submission.
+ * A weaker form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it
+ * may be dispatched to the bypass queue later due to uptodated managed
+ * folios. All file-backed folios related to this pcluster cannot be
+ * reused for in-place I/O (or bvpage) since the pcluster may be decoded
+ * in a separate queue (and thus out of order).
*/
Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
/*
- * The pcluster was just linked to a decompression chain by us. It can
- * also be linked with the remaining pclusters, which means if the
- * processing page is the tail page of a pcluster, this pcluster can
- * safely use the whole page (since the previous pcluster is within the
- * same chain) for in-place I/O, as illustrated below:
- * ___________________________________________________
- * | tail (partial) page | head (partial) page |
- * | (of the current pcl) | (of the previous pcl) |
- * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
- *
- * [ (*) the page above can be used as inplace I/O. ]
+ * The pcluster has just been linked to our processing chain.
+ * File-backed folios (except for the head page) related to it can be
+ * used for in-place I/O (or bvpage).
*/
Z_EROFS_PCLUSTER_FOLLOWED,
};
-struct z_erofs_decompress_frontend {
+struct z_erofs_frontend {
struct inode *const inode;
struct erofs_map_blocks map;
struct z_erofs_bvec_iter biter;
struct page *pagepool;
struct page *candidate_bvpage;
- struct z_erofs_pcluster *pcl;
- z_erofs_next_pcluster_t owned_head;
+ struct z_erofs_pcluster *pcl, *head;
enum z_erofs_pclustermode mode;
erofs_off_t headoffset;
@@ -530,11 +489,11 @@ struct z_erofs_decompress_frontend {
unsigned int icur;
};
-#define DECOMPRESS_FRONTEND_INIT(__i) { \
- .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = Z_EROFS_PCLUSTER_FOLLOWED }
+#define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \
+ .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \
+ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho }
-static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
+static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe)
{
unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
@@ -551,19 +510,17 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
return false;
}
-static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
+static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
bool shouldalloc = z_erofs_should_alloc_cache(fe);
- bool standalone = true;
- /*
- * optimistic allocation without direct reclaim since inplace I/O
- * can be used if low memory otherwise.
- */
+ bool may_bypass = true;
+ /* Optimistic allocation, as in-place I/O can be used as a fallback */
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+ struct folio *folio, *newfolio;
unsigned int i;
if (i_blocksize(fe->inode) != PAGE_SIZE ||
@@ -571,86 +528,68 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
return;
for (i = 0; i < pclusterpages; ++i) {
- struct page *page, *newpage;
- void *t; /* mark pages just found for debugging */
-
/* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
- page = find_get_page(mc, pcl->obj.index + i);
- if (page) {
- t = (void *)((unsigned long)page | 1);
- newpage = NULL;
- } else {
- /* I/O is needed, no possible to decompress directly */
- standalone = false;
+ folio = filemap_get_folio(mc, pcl->index + i);
+ if (IS_ERR(folio)) {
+ may_bypass = false;
if (!shouldalloc)
continue;
/*
- * Try cached I/O if allocation succeeds or fallback to
- * in-place I/O instead to avoid any direct reclaim.
+ * Allocate a managed folio for cached I/O, or it may be
+ * then filled with a file-backed folio for in-place I/O
*/
- newpage = erofs_allocpage(&fe->pagepool, gfp);
- if (!newpage)
+ newfolio = filemap_alloc_folio(gfp, 0);
+ if (!newfolio)
continue;
- set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
- t = (void *)((unsigned long)newpage | 1);
+ newfolio->private = Z_EROFS_PREALLOCATED_FOLIO;
+ folio = NULL;
}
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
if (!pcl->compressed_bvecs[i].page) {
- pcl->compressed_bvecs[i].page = t;
- spin_unlock(&pcl->obj.lockref.lock);
+ pcl->compressed_bvecs[i].page =
+ folio_page(folio ?: newfolio, 0);
+ spin_unlock(&pcl->lockref.lock);
continue;
}
- spin_unlock(&pcl->obj.lockref.lock);
-
- if (page)
- put_page(page);
- else if (newpage)
- erofs_pagepool_add(&fe->pagepool, newpage);
+ spin_unlock(&pcl->lockref.lock);
+ folio_put(folio ?: newfolio);
}
/*
- * don't do inplace I/O if all compressed pages are available in
- * managed cache since it can be moved to the bypass queue instead.
+ * Don't perform in-place I/O if all compressed pages are available in
+ * the managed cache, as the pcluster can be moved to the bypass queue.
*/
- if (standalone)
+ if (may_bypass)
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
-/* called by erofs_shrinker to get rid of all compressed_pages */
-int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
+/* (erofs_shrinker) disconnect cached encoded data with pclusters */
+static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ struct folio *folio;
int i;
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- /*
- * refcount of workgroup is now freezed as 0,
- * therefore no need to worry about available decompression users.
- */
+ /* Each cached folio contains one page unless bs > ps is supported */
for (i = 0; i < pclusterpages; ++i) {
- struct page *page = pcl->compressed_bvecs[i].page;
-
- if (!page)
- continue;
-
- /* block other users from reclaiming or migrating the page */
- if (!trylock_page(page))
- return -EBUSY;
+ if (pcl->compressed_bvecs[i].page) {
+ folio = page_folio(pcl->compressed_bvecs[i].page);
+ /* Avoid reclaiming or migrating this folio */
+ if (!folio_trylock(folio))
+ return -EBUSY;
- if (!erofs_page_is_managed(sbi, page))
- continue;
-
- /* barrier is implied in the following 'unlock_page' */
- WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
- detach_page_private(page);
- unlock_page(page);
+ if (!erofs_folio_is_managed(sbi, folio))
+ continue;
+ pcl->compressed_bvecs[i].page = NULL;
+ folio_detach_private(folio);
+ folio_unlock(folio);
+ }
}
return 0;
}
@@ -658,30 +597,27 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
{
struct z_erofs_pcluster *pcl = folio_get_private(folio);
- unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ struct z_erofs_bvec *bvec = pcl->compressed_bvecs;
+ struct z_erofs_bvec *end = bvec + z_erofs_pclusterpages(pcl);
bool ret;
- int i;
if (!folio_test_private(folio))
return true;
ret = false;
- spin_lock(&pcl->obj.lockref.lock);
- if (pcl->obj.lockref.count > 0)
- goto out;
-
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- for (i = 0; i < pclusterpages; ++i) {
- if (pcl->compressed_bvecs[i].page == &folio->page) {
- WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
- ret = true;
- break;
+ spin_lock(&pcl->lockref.lock);
+ if (pcl->lockref.count <= 0) {
+ DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ for (; bvec < end; ++bvec) {
+ if (bvec->page && page_folio(bvec->page) == folio) {
+ bvec->page = NULL;
+ folio_detach_private(folio);
+ ret = true;
+ break;
+ }
}
}
- if (ret)
- folio_detach_private(folio);
-out:
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
return ret;
}
@@ -724,7 +660,7 @@ int erofs_init_managed_cache(struct super_block *sb)
}
/* callers must be with pcluster lock held */
-static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
+static int z_erofs_attach_page(struct z_erofs_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive)
{
struct z_erofs_pcluster *pcl = fe->pcl;
@@ -732,15 +668,15 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
if (exclusive) {
/* give priority for inplaceio to use file pages first */
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
while (fe->icur > 0) {
if (pcl->compressed_bvecs[--fe->icur].page)
continue;
pcl->compressed_bvecs[fe->icur] = *bvec;
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
return 0;
}
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
@@ -753,31 +689,30 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
return ret;
}
-static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
+static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *pcl = f->pcl;
- z_erofs_next_pcluster_t *owned_head = &f->owned_head;
-
- /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
- if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
- *owned_head) == Z_EROFS_PCLUSTER_NIL) {
- *owned_head = &pcl->next;
- /* so we can attach this pcluster to our submission chain. */
- f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
- return;
+ if (lockref_get_not_zero(&pcl->lockref))
+ return true;
+
+ spin_lock(&pcl->lockref.lock);
+ if (__lockref_is_dead(&pcl->lockref)) {
+ spin_unlock(&pcl->lockref.lock);
+ return false;
}
- /* type 2, it belongs to an ongoing chain */
- f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
+ if (!pcl->lockref.count++)
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ spin_unlock(&pcl->lockref.lock);
+ return true;
}
-static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
+static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
bool ztailpacking = map->m_flags & EROFS_MAP_META;
- struct z_erofs_pcluster *pcl;
- struct erofs_workgroup *grp;
+ struct z_erofs_pcluster *pcl, *pre;
int err;
if (!(map->m_flags & EROFS_MAP_ENCODED) ||
@@ -791,14 +726,11 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- spin_lock_init(&pcl->obj.lockref.lock);
- pcl->obj.lockref.count = 1; /* one ref for this request */
+ lockref_init(&pcl->lockref); /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
-
- /* new pclusters should be claimed as type 1, primary and followed */
- pcl->next = fe->owned_head;
+ pcl->next = fe->head;
pcl->pageofs_out = map->m_la & ~PAGE_MASK;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
@@ -810,25 +742,31 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
DBG_BUGON(!mutex_trylock(&pcl->lock));
if (ztailpacking) {
- pcl->obj.index = 0; /* which indicates ztailpacking */
+ pcl->index = 0; /* which indicates ztailpacking */
} else {
- pcl->obj.index = erofs_blknr(sb, map->m_pa);
-
- grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
- if (IS_ERR(grp)) {
- err = PTR_ERR(grp);
- goto err_out;
+ pcl->index = erofs_blknr(sb, map->m_pa);
+ while (1) {
+ xa_lock(&sbi->managed_pslots);
+ pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
+ NULL, pcl, GFP_KERNEL);
+ if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
+ xa_unlock(&sbi->managed_pslots);
+ break;
+ }
+ /* try to legitimize the current in-tree one */
+ xa_unlock(&sbi->managed_pslots);
+ cond_resched();
}
-
- if (grp != &pcl->obj) {
- fe->pcl = container_of(grp,
- struct z_erofs_pcluster, obj);
+ if (xa_is_err(pre)) {
+ err = xa_err(pre);
+ goto err_out;
+ } else if (pre) {
+ fe->pcl = pre;
err = -EEXIST;
goto err_out;
}
}
- fe->owned_head = &pcl->next;
- fe->pcl = pcl;
+ fe->head = fe->pcl = pcl;
return 0;
err_out:
@@ -837,28 +775,36 @@ err_out:
return err;
}
-static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
+static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
- struct erofs_workgroup *grp = NULL;
+ struct z_erofs_pcluster *pcl = NULL;
int ret;
DBG_BUGON(fe->pcl);
-
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
- DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
+ DBG_BUGON(!fe->head);
if (!(map->m_flags & EROFS_MAP_META)) {
- grp = erofs_find_workgroup(sb, blknr);
+ while (1) {
+ rcu_read_lock();
+ pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
+ if (!pcl || z_erofs_get_pcluster(pcl)) {
+ DBG_BUGON(pcl && blknr != pcl->index);
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ }
} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- if (grp) {
- fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ if (pcl) {
+ fe->pcl = pcl;
ret = -EEXIST;
} else {
ret = z_erofs_register_pcluster(fe);
@@ -866,7 +812,14 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
if (ret == -EEXIST) {
mutex_lock(&fe->pcl->lock);
- z_erofs_try_to_claim_pcluster(fe);
+ /* check if this pcluster hasn't been linked into any chain. */
+ if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) {
+ /* .. so it can be attached to our submission chain */
+ fe->head = fe->pcl;
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
+ } else { /* otherwise, it belongs to an inflight chain */
+ fe->mode = Z_EROFS_PCLUSTER_INFLIGHT;
+ }
} else if (ret) {
return ret;
}
@@ -879,7 +832,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
} else {
void *mptr;
- mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
+ mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP);
if (IS_ERR(mptr)) {
ret = PTR_ERR(mptr);
erofs_err(sb, "failed to get inline data %d", ret);
@@ -895,25 +848,93 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
return 0;
}
-/*
- * keep in mind that no referenced pclusters will be freed
- * only after a RCU grace period.
- */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
- z_erofs_free_pcluster(container_of(head,
- struct z_erofs_pcluster, rcu));
+ z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu));
+}
+
+static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
+{
+ if (pcl->lockref.count)
+ return false;
+
+ /*
+ * Note that all cached folios should be detached before deleted from
+ * the XArray. Otherwise some folios could be still attached to the
+ * orphan old pcluster when the new one is available in the tree.
+ */
+ if (erofs_try_to_free_all_cached_folios(sbi, pcl))
+ return false;
+
+ /*
+ * It's impossible to fail after the pcluster is freezed, but in order
+ * to avoid some race conditions, add a DBG_BUGON to observe this.
+ */
+ DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl);
+
+ lockref_mark_dead(&pcl->lockref);
+ return true;
+}
+
+static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
+{
+ bool free;
+
+ spin_lock(&pcl->lockref.lock);
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ spin_unlock(&pcl->lockref.lock);
+ if (free) {
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ }
+ return free;
}
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
+ struct z_erofs_pcluster *pcl;
+ unsigned long index, freed = 0;
- call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ xa_lock(&sbi->managed_pslots);
+ xa_for_each(&sbi->managed_pslots, index, pcl) {
+ /* try to shrink each valid pcluster */
+ if (!erofs_try_to_release_pcluster(sbi, pcl))
+ continue;
+ xa_unlock(&sbi->managed_pslots);
+
+ ++freed;
+ if (!--nr)
+ return freed;
+ xa_lock(&sbi->managed_pslots);
+ }
+ xa_unlock(&sbi->managed_pslots);
+ return freed;
+}
+
+static void z_erofs_put_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl, bool try_free)
+{
+ bool free = false;
+
+ if (lockref_put_or_lock(&pcl->lockref))
+ return;
+
+ DBG_BUGON(__lockref_is_dead(&pcl->lockref));
+ if (!--pcl->lockref.count) {
+ if (try_free && xa_trylock(&sbi->managed_pslots)) {
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ xa_unlock(&sbi->managed_pslots);
+ }
+ atomic_long_add(!free, &erofs_global_shrink_cnt);
+ }
+ spin_unlock(&pcl->lockref.lock);
+ if (free)
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
-static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
+static void z_erofs_pcluster_end(struct z_erofs_frontend *fe)
{
struct z_erofs_pcluster *pcl = fe->pcl;
@@ -926,17 +947,13 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
if (fe->candidate_bvpage)
fe->candidate_bvpage = NULL;
- /*
- * if all pending pages are added, don't hold its reference
- * any longer if the pcluster isn't hosted by ourselves.
- */
+ /* Drop refcount if it doesn't belong to our processing chain */
if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
- erofs_workgroup_put(&pcl->obj);
-
+ z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false);
fe->pcl = NULL;
}
-static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
+static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio,
unsigned int cur, unsigned int end, erofs_off_t pos)
{
struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
@@ -947,116 +964,112 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
if (!packed_inode)
return -EFSCORRUPTED;
- buf.inode = packed_inode;
+ buf.mapping = packed_inode->i_mapping;
for (; cur < end; cur += cnt, pos += cnt) {
- cnt = min_t(unsigned int, end - cur,
- sb->s_blocksize - erofs_blkoff(sb, pos));
- src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP);
+ cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos));
+ src = erofs_bread(&buf, pos, EROFS_KMAP);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
}
- memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt);
+ memcpy_to_folio(folio, cur, src, cnt);
}
erofs_put_metabuf(&buf);
return 0;
}
-static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, bool ra)
+static int z_erofs_scan_folio(struct z_erofs_frontend *f,
+ struct folio *folio, bool ra)
{
- struct inode *const inode = fe->inode;
- struct erofs_map_blocks *const map = &fe->map;
- const loff_t offset = page_offset(page);
+ struct inode *const inode = f->inode;
+ struct erofs_map_blocks *const map = &f->map;
+ const loff_t offset = folio_pos(folio);
const unsigned int bs = i_blocksize(inode);
- bool tight = true, exclusive;
- unsigned int cur, end, len, split;
+ unsigned int end = folio_size(folio), split = 0, cur, pgs;
+ bool tight, excl;
int err = 0;
- z_erofs_onlinepage_init(page);
- split = 0;
- end = PAGE_SIZE;
-repeat:
- if (offset + end - 1 < map->m_la ||
- offset + end - 1 >= map->m_la + map->m_llen) {
- z_erofs_pcluster_end(fe);
- map->m_la = offset + end - 1;
- map->m_llen = 0;
- err = z_erofs_map_blocks_iter(inode, map, 0);
- if (err)
- goto out;
- }
-
- cur = offset > map->m_la ? 0 : map->m_la - offset;
- /* bump split parts first to avoid several separate cases */
- ++split;
-
- if (!(map->m_flags & EROFS_MAP_MAPPED)) {
- zero_user_segment(page, cur, end);
- tight = false;
- goto next_part;
- }
-
- if (map->m_flags & EROFS_MAP_FRAGMENT) {
- erofs_off_t fpos = offset + cur - map->m_la;
-
- len = min_t(unsigned int, map->m_llen - fpos, end - cur);
- err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
- EROFS_I(inode)->z_fragmentoff + fpos);
- if (err)
- goto out;
- tight = false;
- goto next_part;
- }
+ tight = (bs == PAGE_SIZE);
+ erofs_onlinefolio_init(folio);
+ do {
+ if (offset + end - 1 < map->m_la ||
+ offset + end - 1 >= map->m_la + map->m_llen) {
+ z_erofs_pcluster_end(f);
+ map->m_la = offset + end - 1;
+ map->m_llen = 0;
+ err = z_erofs_map_blocks_iter(inode, map, 0);
+ if (err)
+ break;
+ }
- if (!fe->pcl) {
- err = z_erofs_pcluster_begin(fe);
- if (err)
- goto out;
- fe->pcl->besteffort |= !ra;
- }
+ cur = offset > map->m_la ? 0 : map->m_la - offset;
+ pgs = round_down(cur, PAGE_SIZE);
+ /* bump split parts first to avoid several separate cases */
+ ++split;
+
+ if (!(map->m_flags & EROFS_MAP_MAPPED)) {
+ folio_zero_segment(folio, cur, end);
+ tight = false;
+ } else if (map->m_flags & EROFS_MAP_FRAGMENT) {
+ erofs_off_t fpos = offset + cur - map->m_la;
+
+ err = z_erofs_read_fragment(inode->i_sb, folio, cur,
+ cur + min(map->m_llen - fpos, end - cur),
+ EROFS_I(inode)->z_fragmentoff + fpos);
+ if (err)
+ break;
+ tight = false;
+ } else {
+ if (!f->pcl) {
+ err = z_erofs_pcluster_begin(f);
+ if (err)
+ break;
+ f->pcl->besteffort |= !ra;
+ }
- /*
- * Ensure the current partial page belongs to this submit chain rather
- * than other concurrent submit chains or the noio(bypass) chain since
- * those chains are handled asynchronously thus the page cannot be used
- * for inplace I/O or bvpage (should be processed in a strict order.)
- */
- tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
- exclusive = (!cur && ((split <= 1) || (tight && bs == PAGE_SIZE)));
- if (cur)
- tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
-
- err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
- .page = page,
- .offset = offset - map->m_la,
- .end = end,
- }), exclusive);
- if (err)
- goto out;
-
- z_erofs_onlinepage_split(page);
- if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
- fe->pcl->multibases = true;
- if (fe->pcl->length < offset + end - map->m_la) {
- fe->pcl->length = offset + end - map->m_la;
- fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
- }
- if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
- !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
- fe->pcl->length == map->m_llen)
- fe->pcl->partial = false;
-next_part:
- /* shorten the remaining extent to update progress */
- map->m_llen = offset + cur - map->m_la;
- map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
-
- end = cur;
- if (end > 0)
- goto repeat;
+ pgs = round_down(end - 1, PAGE_SIZE);
+ /*
+ * Ensure this partial page belongs to this submit chain
+ * rather than other concurrent submit chains or
+ * noio(bypass) chains since those chains are handled
+ * asynchronously thus it cannot be used for inplace I/O
+ * or bvpage (should be processed in the strict order.)
+ */
+ tight &= (f->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
+ excl = false;
+ if (cur <= pgs) {
+ excl = (split <= 1) || tight;
+ cur = pgs;
+ }
-out:
- z_erofs_onlinepage_endio(page, err);
+ err = z_erofs_attach_page(f, &((struct z_erofs_bvec) {
+ .page = folio_page(folio, pgs >> PAGE_SHIFT),
+ .offset = offset + pgs - map->m_la,
+ .end = end - pgs, }), excl);
+ if (err)
+ break;
+
+ erofs_onlinefolio_split(folio);
+ if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
+ f->pcl->multibases = true;
+ if (f->pcl->length < offset + end - map->m_la) {
+ f->pcl->length = offset + end - map->m_la;
+ f->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ }
+ if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
+ !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
+ f->pcl->length == map->m_llen)
+ f->pcl->partial = false;
+ }
+ /* shorten the remaining extent to update progress */
+ map->m_llen = offset + cur - map->m_la;
+ map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
+ if (cur <= pgs) {
+ split = cur < pgs;
+ tight = (bs == PAGE_SIZE);
+ }
+ } while ((end = cur) > 0);
+ erofs_onlinefolio_end(folio, err);
return err;
}
@@ -1077,10 +1090,10 @@ static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
static bool z_erofs_page_is_invalidated(struct page *page)
{
- return !page->mapping && !z_erofs_is_shortlived_page(page);
+ return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page);
}
-struct z_erofs_decompress_backend {
+struct z_erofs_backend {
struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
struct super_block *sb;
struct z_erofs_pcluster *pcl;
@@ -1100,7 +1113,7 @@ struct z_erofs_bvec_item {
struct list_head list;
};
-static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
+static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be,
struct z_erofs_bvec *bvec)
{
struct z_erofs_bvec_item *item;
@@ -1123,8 +1136,7 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
list_add(&item->list, &be->decompressed_secondary_bvecs);
}
-static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
- int err)
+static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err)
{
unsigned int off0 = be->pcl->pageofs_out;
struct list_head *p, *n;
@@ -1159,13 +1171,13 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
cur += len;
}
kunmap_local(dst);
- z_erofs_onlinepage_endio(bvi->bvec.page, err);
+ erofs_onlinefolio_end(page_folio(bvi->bvec.page), err);
list_del(p);
kfree(bvi);
}
}
-static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
+static void z_erofs_parse_out_bvecs(struct z_erofs_backend *be)
{
struct z_erofs_pcluster *pcl = be->pcl;
struct z_erofs_bvec_iter biter;
@@ -1190,8 +1202,7 @@ static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
}
-static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
- bool *overlapped)
+static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped)
{
struct z_erofs_pcluster *pcl = be->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
@@ -1202,15 +1213,16 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
struct page *page = bvec->page;
- /* compressed data ought to be valid before decompressing */
- if (!page) {
- err = -EIO;
+ /* compressed data ought to be valid when decompressing */
+ if (IS_ERR(page) || !page) {
+ bvec->page = NULL; /* clear the failure reason */
+ err = page ? PTR_ERR(page) : -EIO;
continue;
}
be->compressed_pages[i] = page;
if (z_erofs_is_inline_pcluster(pcl) ||
- erofs_page_is_managed(EROFS_SB(be->sb), page)) {
+ erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) {
if (!PageUptodate(page))
err = -EIO;
continue;
@@ -1225,17 +1237,17 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
return err;
}
-static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
- int err)
+static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
{
struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
struct z_erofs_pcluster *pcl = be->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
const struct z_erofs_decompressor *decomp =
- &erofs_decompressors[pcl->algorithmformat];
- int i, err2;
+ z_erofs_decomp[pcl->algorithmformat];
+ int i, j, jtop, err2;
struct page *page;
bool overlapped;
+ bool try_free = true;
mutex_lock(&pcl->lock);
be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
@@ -1280,8 +1292,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
.inplace_io = overlapped,
.partial_decoding = pcl->partial,
.fillgaps = pcl->multibases,
- .gfp = pcl->besteffort ?
- GFP_KERNEL | __GFP_NOFAIL :
+ .gfp = pcl->besteffort ? GFP_KERNEL :
GFP_NOWAIT | __GFP_NORETRY
}, be->pagepool);
@@ -1291,12 +1302,15 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
put_page(page);
} else {
+ /* managed folios are still left in compressed_bvecs[] */
for (i = 0; i < pclusterpages; ++i) {
- /* consider shortlived pages added when decompressing */
page = be->compressed_pages[i];
-
- if (!page || erofs_page_is_managed(sbi, page))
+ if (!page)
+ continue;
+ if (erofs_folio_is_managed(sbi, page_folio(page))) {
+ try_free = false;
continue;
+ }
(void)z_erofs_put_shortlivedpage(be->pagepool, page);
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
}
@@ -1304,21 +1318,31 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
if (be->compressed_pages < be->onstack_pages ||
be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
kvfree(be->compressed_pages);
- z_erofs_fill_other_copies(be, err);
+ jtop = 0;
+ z_erofs_fill_other_copies(be, err);
for (i = 0; i < be->nr_pages; ++i) {
page = be->decompressed_pages[i];
if (!page)
continue;
DBG_BUGON(z_erofs_page_is_invalidated(page));
-
- /* recycle all individual short-lived pages */
- if (z_erofs_put_shortlivedpage(be->pagepool, page))
+ if (!z_erofs_is_shortlived_page(page)) {
+ erofs_onlinefolio_end(page_folio(page), err);
+ continue;
+ }
+ if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
+ erofs_pagepool_add(be->pagepool, page);
continue;
- z_erofs_onlinepage_endio(page, err);
+ }
+ for (j = 0; j < jtop && be->decompressed_pages[j] != page; ++j)
+ ;
+ if (j >= jtop) /* this bounce page is newly detected */
+ be->decompressed_pages[jtop++] = page;
}
-
+ while (jtop)
+ erofs_pagepool_add(be->pagepool,
+ be->decompressed_pages[--jtop]);
if (be->decompressed_pages != be->onstack_pages)
kvfree(be->decompressed_pages);
@@ -1330,34 +1354,35 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
pcl->vcnt = 0;
/* pcluster lock MUST be taken before the following line */
- WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
+ WRITE_ONCE(pcl->next, NULL);
mutex_unlock(&pcl->lock);
+
+ if (z_erofs_is_inline_pcluster(pcl))
+ z_erofs_free_pcluster(pcl);
+ else
+ z_erofs_put_pcluster(sbi, pcl, try_free);
return err;
}
-static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
- struct page **pagepool)
+static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
+ struct page **pagepool)
{
- struct z_erofs_decompress_backend be = {
+ struct z_erofs_backend be = {
.sb = io->sb,
.pagepool = pagepool,
.decompressed_secondary_bvecs =
LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
+ .pcl = io->head,
};
- z_erofs_next_pcluster_t owned = io->head;
-
- while (owned != Z_EROFS_PCLUSTER_TAIL) {
- DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
-
- be.pcl = container_of(owned, struct z_erofs_pcluster, next);
- owned = READ_ONCE(be.pcl->next);
+ struct z_erofs_pcluster *next;
+ int err = io->eio ? -EIO : 0;
- z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
- if (z_erofs_is_inline_pcluster(be.pcl))
- z_erofs_free_pcluster(be.pcl);
- else
- erofs_workgroup_put(&be.pcl->obj);
+ for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) {
+ DBG_BUGON(!be.pcl);
+ next = READ_ONCE(be.pcl->next);
+ err = z_erofs_decompress_pcluster(&be, err) ?: err;
}
+ return err;
}
static void z_erofs_decompressqueue_work(struct work_struct *work)
@@ -1420,7 +1445,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
}
static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
- struct z_erofs_decompress_frontend *f,
+ struct z_erofs_frontend *f,
struct z_erofs_pcluster *pcl,
unsigned int nr,
struct address_space *mc)
@@ -1429,39 +1454,33 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
bool tocache = false;
struct z_erofs_bvec zbv;
struct address_space *mapping;
+ struct folio *folio;
struct page *page;
- int justfound, bs = i_blocksize(f->inode);
+ int bs = i_blocksize(f->inode);
- /* Except for inplace pages, the entire page can be used for I/Os */
+ /* Except for inplace folios, the entire folio can be used for I/Os */
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
zbv = pcl->compressed_bvecs[nr];
- page = zbv.page;
- justfound = (unsigned long)page & 1UL;
- page = (struct page *)((unsigned long)page & ~1UL);
- pcl->compressed_bvecs[nr].page = page;
- spin_unlock(&pcl->obj.lockref.lock);
- if (!page)
- goto out_allocpage;
+ spin_unlock(&pcl->lockref.lock);
+ if (!zbv.page)
+ goto out_allocfolio;
- bvec->bv_page = page;
- DBG_BUGON(z_erofs_is_shortlived_page(page));
- /*
- * Handle preallocated cached pages. We tried to allocate such pages
- * without triggering direct reclaim. If allocation failed, inplace
- * file-backed pages will be used instead.
- */
- if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
- set_page_private(page, 0);
+ bvec->bv_page = zbv.page;
+ DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page));
+
+ folio = page_folio(zbv.page);
+ /* For preallocated managed folios, add them to page cache here */
+ if (folio->private == Z_EROFS_PREALLOCATED_FOLIO) {
tocache = true;
goto out_tocache;
}
- mapping = READ_ONCE(page->mapping);
+ mapping = READ_ONCE(folio->mapping);
/*
- * File-backed pages for inplace I/Os are all locked steady,
+ * File-backed folios for inplace I/Os are all locked steady,
* therefore it is impossible for `mapping` to be NULL.
*/
if (mapping && mapping != mc) {
@@ -1471,63 +1490,62 @@ repeat:
return;
}
- lock_page(page);
- /* only true if page reclaim goes wrong, should never happen */
- DBG_BUGON(justfound && PagePrivate(page));
-
- /* the cached page is still in managed cache */
- if (page->mapping == mc) {
+ folio_lock(folio);
+ if (likely(folio->mapping == mc)) {
/*
- * The cached page is still available but without a valid
- * `->private` pcluster hint. Let's reconnect them.
+ * The cached folio is still in managed cache but without
+ * a valid `->private` pcluster hint. Let's reconnect them.
*/
- if (!PagePrivate(page)) {
- DBG_BUGON(!justfound);
- /* compressed_bvecs[] already takes a ref */
- attach_page_private(page, pcl);
- put_page(page);
+ if (!folio_test_private(folio)) {
+ folio_attach_private(folio, pcl);
+ /* compressed_bvecs[] already takes a ref before */
+ folio_put(folio);
}
-
- /* no need to submit if it is already up-to-date */
- if (PageUptodate(page)) {
- unlock_page(page);
- bvec->bv_page = NULL;
+ if (likely(folio->private == pcl)) {
+ /* don't submit cache I/Os again if already uptodate */
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ bvec->bv_page = NULL;
+ }
+ return;
}
- return;
+ /*
+ * Already linked with another pcluster, which only appears in
+ * crafted images by fuzzers for now. But handle this anyway.
+ */
+ tocache = false; /* use temporary short-lived pages */
+ } else {
+ DBG_BUGON(1); /* referenced managed folios can't be truncated */
+ tocache = true;
}
-
- /*
- * It has been truncated, so it's unsafe to reuse this one. Let's
- * allocate a new page for compressed data.
- */
- DBG_BUGON(page->mapping);
- DBG_BUGON(!justfound);
-
- tocache = true;
- unlock_page(page);
- put_page(page);
-out_allocpage:
- page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
- spin_lock(&pcl->obj.lockref.lock);
- if (pcl->compressed_bvecs[nr].page) {
- erofs_pagepool_add(&f->pagepool, page);
- spin_unlock(&pcl->obj.lockref.lock);
+ folio_unlock(folio);
+ folio_put(folio);
+out_allocfolio:
+ page = __erofs_allocpage(&f->pagepool, gfp, true);
+ spin_lock(&pcl->lockref.lock);
+ if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
+ if (page)
+ erofs_pagepool_add(&f->pagepool, page);
+ spin_unlock(&pcl->lockref.lock);
cond_resched();
goto repeat;
}
- pcl->compressed_bvecs[nr].page = page;
- spin_unlock(&pcl->obj.lockref.lock);
+ pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
+ spin_unlock(&pcl->lockref.lock);
bvec->bv_page = page;
+ if (!page)
+ return;
+ folio = page_folio(page);
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
- add_to_page_cache_lru(page, mc, pcl->obj.index + nr, gfp)) {
- /* turn into a temporary shortlived page (1 ref) */
- set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
+ filemap_add_folio(mc, folio, pcl->index + nr, gfp)) {
+ /* turn into a temporary shortlived folio (1 ref) */
+ folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
return;
}
- attach_page_private(page, pcl);
+ folio_attach_private(folio, pcl);
/* drop a refcount added by allocpage (then 2 refs in total here) */
- put_page(page);
+ folio_put(folio);
}
static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
@@ -1567,57 +1585,52 @@ enum {
NR_JOBQUEUES,
};
-static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
- z_erofs_next_pcluster_t qtail[],
- z_erofs_next_pcluster_t owned_head)
+static void z_erofs_move_to_bypass_queue(struct z_erofs_pcluster *pcl,
+ struct z_erofs_pcluster *next,
+ struct z_erofs_pcluster **qtail[])
{
- z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
- z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
-
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
-
- WRITE_ONCE(*submit_qtail, owned_head);
- WRITE_ONCE(*bypass_qtail, &pcl->next);
-
+ WRITE_ONCE(*qtail[JQ_SUBMIT], next);
+ WRITE_ONCE(*qtail[JQ_BYPASS], pcl);
qtail[JQ_BYPASS] = &pcl->next;
}
-static void z_erofs_submissionqueue_endio(struct bio *bio)
+static void z_erofs_endio(struct bio *bio)
{
struct z_erofs_decompressqueue *q = bio->bi_private;
blk_status_t err = bio->bi_status;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
- DBG_BUGON(PageUptodate(page));
- DBG_BUGON(z_erofs_page_is_invalidated(page));
- if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
- if (!err)
- SetPageUptodate(page);
- unlock_page(page);
- }
+ DBG_BUGON(folio_test_uptodate(folio));
+ DBG_BUGON(z_erofs_page_is_invalidated(&folio->page));
+ if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio))
+ continue;
+
+ if (!err)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
}
if (err)
q->eio = true;
z_erofs_decompress_kickoff(q, -1);
- bio_put(bio);
+ if (bio->bi_bdev)
+ bio_put(bio);
}
-static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
+static void z_erofs_submit_queue(struct z_erofs_frontend *f,
struct z_erofs_decompressqueue *fgq,
bool *force_fg, bool readahead)
{
struct super_block *sb = f->inode->i_sb;
struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
- z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
+ struct z_erofs_pcluster **qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
- z_erofs_next_pcluster_t owned_head = f->owned_head;
+ struct z_erofs_pcluster *pcl, *next;
/* bio is NULL initially, so no need to initialize last_{index,bdev} */
erofs_off_t last_pa;
- struct block_device *last_bdev;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
unsigned long pflags;
@@ -1631,42 +1644,42 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
/* by default, all need io submission */
- q[JQ_SUBMIT]->head = owned_head;
+ q[JQ_SUBMIT]->head = next = f->head;
do {
struct erofs_map_dev mdev;
- struct z_erofs_pcluster *pcl;
erofs_off_t cur, end;
struct bio_vec bvec;
unsigned int i = 0;
bool bypass = true;
- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
- pcl = container_of(owned_head, struct z_erofs_pcluster, next);
- owned_head = READ_ONCE(pcl->next);
-
+ pcl = next;
+ next = READ_ONCE(pcl->next);
if (z_erofs_is_inline_pcluster(pcl)) {
- move_to_bypass_jobqueue(pcl, qtail, owned_head);
+ z_erofs_move_to_bypass_queue(pcl, next, qtail);
continue;
}
/* no device id here, thus it will always succeed */
mdev = (struct erofs_map_dev) {
- .m_pa = erofs_pos(sb, pcl->obj.index),
+ .m_pa = erofs_pos(sb, pcl->index),
};
(void)erofs_map_dev(sb, &mdev);
cur = mdev.m_pa;
end = cur + pcl->pclustersize;
do {
- z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
- if (!bvec.bv_page)
- continue;
-
+ bvec.bv_page = NULL;
if (bio && (cur != last_pa ||
- last_bdev != mdev.m_bdev)) {
-submit_bio_retry:
- submit_bio(bio);
+ bio->bi_bdev != mdev.m_bdev)) {
+drain_io:
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ erofs_fileio_submit_bio(bio);
+ else if (erofs_is_fscache_mode(sb))
+ erofs_fscache_submit_bio(bio);
+ else
+ submit_bio(bio);
+
if (memstall) {
psi_memstall_leave(&pflags);
memstall = 0;
@@ -1674,6 +1687,15 @@ submit_bio_retry:
bio = NULL;
}
+ if (!bvec.bv_page) {
+ z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
+ if (!bvec.bv_page)
+ continue;
+ if (cur + bvec.bv_len > end)
+ bvec.bv_len = end - cur;
+ DBG_BUGON(bvec.bv_len < sb->s_blocksize);
+ }
+
if (unlikely(PageWorkingset(bvec.bv_page)) &&
!memstall) {
psi_memstall_enter(&pflags);
@@ -1681,24 +1703,24 @@ submit_bio_retry:
}
if (!bio) {
- bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
- REQ_OP_READ, GFP_NOIO);
- bio->bi_end_io = z_erofs_submissionqueue_endio;
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ bio = erofs_fileio_bio_alloc(&mdev);
+ else if (erofs_is_fscache_mode(sb))
+ bio = erofs_fscache_bio_alloc(&mdev);
+ else
+ bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+ REQ_OP_READ, GFP_NOIO);
+ bio->bi_end_io = z_erofs_endio;
bio->bi_iter.bi_sector = cur >> 9;
bio->bi_private = q[JQ_SUBMIT];
if (readahead)
bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
- last_bdev = mdev.m_bdev;
}
- if (cur + bvec.bv_len > end)
- bvec.bv_len = end - cur;
- DBG_BUGON(bvec.bv_len < sb->s_blocksize);
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset))
- goto submit_bio_retry;
-
+ goto drain_io;
last_pa = cur + bvec.bv_len;
bypass = false;
} while ((cur += bvec.bv_len) < end);
@@ -1706,14 +1728,19 @@ submit_bio_retry:
if (!bypass)
qtail[JQ_SUBMIT] = &pcl->next;
else
- move_to_bypass_jobqueue(pcl, qtail, owned_head);
- } while (owned_head != Z_EROFS_PCLUSTER_TAIL);
+ z_erofs_move_to_bypass_queue(pcl, next, qtail);
+ } while (next != Z_EROFS_PCLUSTER_TAIL);
if (bio) {
- submit_bio(bio);
- if (memstall)
- psi_memstall_leave(&pflags);
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ erofs_fileio_submit_bio(bio);
+ else if (erofs_is_fscache_mode(sb))
+ erofs_fscache_submit_bio(bio);
+ else
+ submit_bio(bio);
}
+ if (memstall)
+ psi_memstall_leave(&pflags);
/*
* although background is preferred, no one is pending for submission.
@@ -1726,33 +1753,34 @@ submit_bio_retry:
z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
}
-static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
- bool force_fg, bool ra)
+static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
+ struct erofs_sb_info *sbi = EROFS_I_SB(f->inode);
+ bool force_fg = z_erofs_is_sync_decompress(sbi, rapages);
+ int err;
- if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
- return;
- z_erofs_submit_queue(f, io, &force_fg, ra);
+ if (f->head == Z_EROFS_PCLUSTER_TAIL)
+ return 0;
+ z_erofs_submit_queue(f, io, &force_fg, !!rapages);
/* handle bypass queue (no i/o pclusters) immediately */
- z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
-
+ err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
if (!force_fg)
- return;
+ return err;
/* wait until all bios are completed */
wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
- z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool);
+ return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err;
}
/*
* Since partial uptodate is still unimplemented for now, we have to use
* approximate readmore strategies as a start.
*/
-static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
struct readahead_control *rac, bool backmost)
{
struct inode *inode = f->inode;
@@ -1780,7 +1808,6 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
end = round_up(end, PAGE_SIZE);
} else {
end = round_up(map->m_la, PAGE_SIZE);
-
if (!map->m_llen)
return;
}
@@ -1788,15 +1815,15 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
cur = map->m_la + map->m_llen - 1;
while ((cur >= end) && (cur < i_size_read(inode))) {
pgoff_t index = cur >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
- page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
- if (page) {
- if (PageUptodate(page))
- unlock_page(page);
+ folio = erofs_grab_folio_nowait(inode->i_mapping, index);
+ if (!IS_ERR_OR_NULL(folio)) {
+ if (folio_test_uptodate(folio))
+ folio_unlock(folio);
else
- (void)z_erofs_do_read_page(f, page, !!rac);
- put_page(page);
+ z_erofs_scan_folio(f, folio, !!rac);
+ folio_put(folio);
}
if (cur < PAGE_SIZE)
@@ -1808,21 +1835,17 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
struct inode *const inode = folio->mapping->host;
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
- struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio));
int err;
trace_erofs_read_folio(folio, false);
- f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
-
z_erofs_pcluster_readmore(&f, NULL, true);
- err = z_erofs_do_read_page(&f, &folio->page, false);
+ err = z_erofs_scan_folio(&f, folio, false);
z_erofs_pcluster_readmore(&f, NULL, false);
z_erofs_pcluster_end(&f);
- /* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
-
+ /* if some pclusters are ready, need submit them anyway */
+ err = z_erofs_runqueue(&f, 0) ?: err;
if (err && err != -EINTR)
erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
err, folio->index, EROFS_I(inode)->nid);
@@ -1835,18 +1858,14 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
- struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac));
struct folio *head = NULL, *folio;
- unsigned int nr_folios;
+ unsigned int nrpages = readahead_count(rac);
int err;
- f.headoffset = readahead_pos(rac);
-
z_erofs_pcluster_readmore(&f, rac, true);
- nr_folios = readahead_count(rac);
- trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);
-
+ nrpages = readahead_count(rac);
+ trace_erofs_readpages(inode, readahead_index(rac), nrpages, false);
while ((folio = readahead_folio(rac))) {
folio->private = head;
head = folio;
@@ -1857,7 +1876,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
folio = head;
head = folio_get_private(folio);
- err = z_erofs_do_read_page(&f, &folio->page, true);
+ err = z_erofs_scan_folio(&f, folio, true);
if (err && err != -EINTR)
erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
folio->index, EROFS_I(inode)->nid);
@@ -1865,7 +1884,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
z_erofs_pcluster_readmore(&f, rac, false);
z_erofs_pcluster_end(&f);
- z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
+ (void)z_erofs_runqueue(&f, nrpages);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
}
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index e313c936351d..689437e99a5a 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -4,14 +4,12 @@
* https://www.huawei.com/
*/
#include "internal.h"
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <trace/events/erofs.h>
struct z_erofs_maprecorder {
struct inode *inode;
struct erofs_map_blocks *map;
- void *kaddr;
-
unsigned long lcn;
/* compression extent information gathered */
u8 type, headtype;
@@ -31,22 +29,17 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
vi->inode_isize + vi->xattr_isize) +
lcn * sizeof(struct z_erofs_lcluster_index);
struct z_erofs_lcluster_index *di;
- unsigned int advise, type;
-
- m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(inode->i_sb, pos), EROFS_KMAP);
- if (IS_ERR(m->kaddr))
- return PTR_ERR(m->kaddr);
+ unsigned int advise;
- m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index);
+ di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP);
+ if (IS_ERR(di))
+ return PTR_ERR(di);
m->lcn = lcn;
- di = m->kaddr + erofs_blkoff(inode->i_sb, pos);
+ m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index);
advise = le16_to_cpu(di->di_advise);
- type = (advise >> Z_EROFS_LI_LCLUSTER_TYPE_BIT) &
- ((1 << Z_EROFS_LI_LCLUSTER_TYPE_BITS) - 1);
- switch (type) {
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+ m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK;
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
m->clusterofs = 1 << vi->z_logical_clusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) {
@@ -55,29 +48,19 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- m->compressedblks = m->delta[0] &
- ~Z_EROFS_LI_D0_CBLKCNT;
+ m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT;
m->delta[0] = 1;
}
m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
- break;
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
- if (advise & Z_EROFS_LI_PARTIAL_REF)
- m->partialref = true;
+ } else {
+ m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF);
m->clusterofs = le16_to_cpu(di->di_clusterofs);
if (m->clusterofs >= 1 << vi->z_logical_clusterbits) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
m->pblk = le32_to_cpu(di->di_u.blkaddr);
- break;
- default:
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
- m->type = type;
return 0;
}
@@ -114,17 +97,48 @@ static int get_compacted_la_distance(unsigned int lobits,
return d1;
}
-static int unpack_compacted_index(struct z_erofs_maprecorder *m,
- unsigned int amortizedshift,
- erofs_off_t pos, bool lookahead)
+static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
+ unsigned long lcn, bool lookahead)
{
- struct erofs_inode *const vi = EROFS_I(m->inode);
+ struct inode *const inode = m->inode;
+ struct erofs_inode *const vi = EROFS_I(inode);
+ const erofs_off_t ebase = sizeof(struct z_erofs_map_header) +
+ ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs;
- int i;
+ const unsigned int totalidx = erofs_iblks(inode);
+ unsigned int compacted_4b_initial, compacted_2b, amortizedshift;
+ unsigned int vcnt, lo, lobits, encodebits, nblk, bytes;
+ bool big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
+ erofs_off_t pos;
u8 *in, type;
- bool big_pcluster;
+ int i;
+ if (lcn >= totalidx || lclusterbits > 14)
+ return -EINVAL;
+
+ m->lcn = lcn;
+ /* used to align to 32-byte (compacted_2b) alignment */
+ compacted_4b_initial = ((32 - ebase % 32) / 4) & 7;
+ compacted_2b = 0;
+ if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) &&
+ compacted_4b_initial < totalidx)
+ compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
+
+ pos = ebase;
+ amortizedshift = 2; /* compact_4b */
+ if (lcn >= compacted_4b_initial) {
+ pos += compacted_4b_initial * 4;
+ lcn -= compacted_4b_initial;
+ if (lcn < compacted_2b) {
+ amortizedshift = 1;
+ } else {
+ pos += compacted_2b * 2;
+ lcn -= compacted_2b;
+ }
+ }
+ pos += lcn * (1 << amortizedshift);
+
+ /* figure out the lcluster count in this pack */
if (1 << amortizedshift == 4 && lclusterbits <= 14)
vcnt = 2;
else if (1 << amortizedshift == 2 && lclusterbits <= 12)
@@ -132,17 +146,18 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
+ in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP);
+ if (IS_ERR(in))
+ return PTR_ERR(in);
+
/* it doesn't equal to round_up(..) */
m->nextpackoff = round_down(pos, vcnt << amortizedshift) +
(vcnt << amortizedshift);
- big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U);
encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
- eofs = erofs_blkoff(m->inode->i_sb, pos);
- base = round_down(eofs, vcnt << amortizedshift);
- in = m->kaddr + base;
-
- i = (eofs - base) >> amortizedshift;
+ bytes = pos & ((vcnt << amortizedshift) - 1);
+ in -= bytes;
+ i = bytes >> amortizedshift;
lo = decode_compactedbits(lobits, in, encodebits * i, &type);
m->type = type;
@@ -222,57 +237,6 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
return 0;
}
-static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
- unsigned long lcn, bool lookahead)
-{
- struct inode *const inode = m->inode;
- struct erofs_inode *const vi = EROFS_I(inode);
- const erofs_off_t ebase = sizeof(struct z_erofs_map_header) +
- ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- unsigned int totalidx = erofs_iblks(inode);
- unsigned int compacted_4b_initial, compacted_2b;
- unsigned int amortizedshift;
- erofs_off_t pos;
-
- if (lcn >= totalidx)
- return -EINVAL;
-
- m->lcn = lcn;
- /* used to align to 32-byte (compacted_2b) alignment */
- compacted_4b_initial = (32 - ebase % 32) / 4;
- if (compacted_4b_initial == 32 / 4)
- compacted_4b_initial = 0;
-
- if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) &&
- compacted_4b_initial < totalidx)
- compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
- else
- compacted_2b = 0;
-
- pos = ebase;
- if (lcn < compacted_4b_initial) {
- amortizedshift = 2;
- goto out;
- }
- pos += compacted_4b_initial * 4;
- lcn -= compacted_4b_initial;
-
- if (lcn < compacted_2b) {
- amortizedshift = 1;
- goto out;
- }
- pos += compacted_2b * 2;
- lcn -= compacted_2b;
- amortizedshift = 2;
-out:
- pos += lcn * (1 << amortizedshift);
- m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(inode->i_sb, pos), EROFS_KMAP);
- if (IS_ERR(m->kaddr))
- return PTR_ERR(m->kaddr);
- return unpack_compacted_index(m, amortizedshift, pos, lookahead);
-}
-
static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int lcn, bool lookahead)
{
@@ -330,27 +294,23 @@ err_bogus:
static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
unsigned int initial_lcn)
{
- struct super_block *sb = m->inode->i_sb;
- struct erofs_inode *const vi = EROFS_I(m->inode);
- struct erofs_map_blocks *const map = m->map;
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned long lcn;
+ struct inode *inode = m->inode;
+ struct super_block *sb = inode->i_sb;
+ struct erofs_inode *vi = EROFS_I(inode);
+ bool bigpcl1 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
+ bool bigpcl2 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2;
+ unsigned long lcn = m->lcn + 1;
int err;
- DBG_BUGON(m->type != Z_EROFS_LCLUSTER_TYPE_PLAIN &&
- m->type != Z_EROFS_LCLUSTER_TYPE_HEAD1 &&
- m->type != Z_EROFS_LCLUSTER_TYPE_HEAD2);
+ DBG_BUGON(m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
DBG_BUGON(m->type != m->headtype);
- if (m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
- ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1) &&
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
- ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) &&
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
- map->m_plen = 1ULL << lclusterbits;
- return 0;
- }
- lcn = m->lcn + 1;
+ if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) ||
+ ((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
+ m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) ||
+ (lcn << vi->z_logical_clusterbits) >= inode->i_size)
+ m->compressedblks = 1;
+
if (m->compressedblks)
goto out;
@@ -375,9 +335,9 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
case Z_EROFS_LCLUSTER_TYPE_HEAD2:
/*
* if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
- * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
+ * rather than CBLKCNT, it's a 1 block-sized pcluster.
*/
- m->compressedblks = 1 << (lclusterbits - sb->s_blocksize_bits);
+ m->compressedblks = 1;
break;
case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
if (m->delta[0] != 1)
@@ -392,7 +352,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
return -EFSCORRUPTED;
}
out:
- map->m_plen = erofs_pos(sb, m->compressedblks);
+ m->map->m_plen = erofs_pos(sb, m->compressedblks);
return 0;
err_bonus_cblkcnt:
erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
@@ -409,7 +369,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
int err;
- do {
+ while (1) {
/* handle the last EOF pcluster (no next HEAD lcluster) */
if ((lcn << lclusterbits) >= inode->i_size) {
map->m_llen = inode->i_size - map->m_la;
@@ -421,14 +381,16 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return err;
if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
- DBG_BUGON(!m->delta[1] &&
- m->clusterofs != 1 << lclusterbits);
+ /* work around invalid d1 generated by pre-1.0 mkfs */
+ if (unlikely(!m->delta[1])) {
+ m->delta[1] = 1;
+ DBG_BUGON(1);
+ }
} else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 ||
m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
- /* go on until the next HEAD lcluster */
if (lcn != headlcn)
- break;
+ break; /* ends at the next HEAD lcluster */
m->delta[1] = 1;
} else {
erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
@@ -437,8 +399,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return -EOPNOTSUPP;
}
lcn += m->delta[1];
- } while (m->delta[1]);
-
+ }
map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
return 0;
}
@@ -561,7 +522,8 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
(map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA ||
- map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) &&
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE ||
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_ZSTD) &&
map->m_llen >= i_blocksize(inode))) {
err = z_erofs_get_extent_decompressedlen(&m);
if (!err)
@@ -580,7 +542,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
int err, headnr;
erofs_off_t pos;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- void *kaddr;
struct z_erofs_map_header *h;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
@@ -600,13 +561,12 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_unlock;
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- err = PTR_ERR(kaddr);
+ h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
+ if (IS_ERR(h)) {
+ err = PTR_ERR(h);
goto out_unlock;
}
- h = kaddr + erofs_blkoff(sb, pos);
/*
* if the highest bit of the 8-byte map header is set, the whole file
* is stored in the packed inode. The rest bits keeps z_fragmentoff.
@@ -698,32 +658,32 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
struct erofs_inode *const vi = EROFS_I(inode);
int err = 0;
- trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
-
- /* when trying to read beyond EOF, leave it unmapped */
- if (map->m_la >= inode->i_size) {
+ trace_erofs_map_blocks_enter(inode, map, flags);
+ if (map->m_la >= inode->i_size) { /* post-EOF unmapped extent */
map->m_llen = map->m_la + 1 - inode->i_size;
map->m_la = inode->i_size;
map->m_flags = 0;
- goto out;
- }
-
- err = z_erofs_fill_inode_lazy(inode);
- if (err)
- goto out;
-
- if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
- !vi->z_tailextent_headlcn) {
- map->m_la = 0;
- map->m_llen = inode->i_size;
- map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED |
- EROFS_MAP_FRAGMENT;
- goto out;
+ } else {
+ err = z_erofs_fill_inode_lazy(inode);
+ if (!err) {
+ if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
+ !vi->z_tailextent_headlcn) {
+ map->m_la = 0;
+ map->m_llen = inode->i_size;
+ map->m_flags = EROFS_MAP_MAPPED |
+ EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT;
+ } else {
+ err = z_erofs_do_map_blocks(inode, map, flags);
+ }
+ }
+ if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
+ unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
+ map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
+ err = -EOPNOTSUPP;
+ if (err)
+ map->m_llen = 0;
}
-
- err = z_erofs_do_map_blocks(inode, map, flags);
-out:
- trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
+ trace_erofs_map_blocks_exit(inode, map, flags, err);
return err;
}
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
new file mode 100644
index 000000000000..55ff2ab5128e
--- /dev/null
+++ b/fs/erofs/zutil.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2024 Alibaba Cloud
+ */
+#include "internal.h"
+
+struct z_erofs_gbuf {
+ spinlock_t lock;
+ void *ptr;
+ struct page **pages;
+ unsigned int nrpages;
+};
+
+static struct z_erofs_gbuf *z_erofs_gbufpool, *z_erofs_rsvbuf;
+static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages,
+ z_erofs_rsv_nrpages;
+
+module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
+module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444);
+
+atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
+
+/* protects `erofs_sb_list_lock` and the mounted `erofs_sb_list` */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+static unsigned int shrinker_run_no;
+static struct shrinker *erofs_shrinker_info;
+
+static unsigned int z_erofs_gbuf_id(void)
+{
+ return raw_smp_processor_id() % z_erofs_gbuf_count;
+}
+
+void *z_erofs_get_gbuf(unsigned int requiredpages)
+ __acquires(gbuf->lock)
+{
+ struct z_erofs_gbuf *gbuf;
+
+ migrate_disable();
+ gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
+ spin_lock(&gbuf->lock);
+ /* check if the buffer is too small */
+ if (requiredpages > gbuf->nrpages) {
+ spin_unlock(&gbuf->lock);
+ migrate_enable();
+ /* (for sparse checker) pretend gbuf->lock is still taken */
+ __acquire(gbuf->lock);
+ return NULL;
+ }
+ return gbuf->ptr;
+}
+
+void z_erofs_put_gbuf(void *ptr) __releases(gbuf->lock)
+{
+ struct z_erofs_gbuf *gbuf;
+
+ gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
+ DBG_BUGON(gbuf->ptr != ptr);
+ spin_unlock(&gbuf->lock);
+ migrate_enable();
+}
+
+int z_erofs_gbuf_growsize(unsigned int nrpages)
+{
+ static DEFINE_MUTEX(gbuf_resize_mutex);
+ struct page **tmp_pages = NULL;
+ struct z_erofs_gbuf *gbuf;
+ void *ptr, *old_ptr;
+ int last, i, j;
+
+ mutex_lock(&gbuf_resize_mutex);
+ /* avoid shrinking gbufs, since no idea how many fses rely on */
+ if (nrpages <= z_erofs_gbuf_nrpages) {
+ mutex_unlock(&gbuf_resize_mutex);
+ return 0;
+ }
+
+ for (i = 0; i < z_erofs_gbuf_count; ++i) {
+ gbuf = &z_erofs_gbufpool[i];
+ tmp_pages = kcalloc(nrpages, sizeof(*tmp_pages), GFP_KERNEL);
+ if (!tmp_pages)
+ goto out;
+
+ for (j = 0; j < gbuf->nrpages; ++j)
+ tmp_pages[j] = gbuf->pages[j];
+ do {
+ last = j;
+ j = alloc_pages_bulk(GFP_KERNEL, nrpages,
+ tmp_pages);
+ if (last == j)
+ goto out;
+ } while (j != nrpages);
+
+ ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL);
+ if (!ptr)
+ goto out;
+
+ spin_lock(&gbuf->lock);
+ kfree(gbuf->pages);
+ gbuf->pages = tmp_pages;
+ old_ptr = gbuf->ptr;
+ gbuf->ptr = ptr;
+ gbuf->nrpages = nrpages;
+ spin_unlock(&gbuf->lock);
+ if (old_ptr)
+ vunmap(old_ptr);
+ }
+ z_erofs_gbuf_nrpages = nrpages;
+out:
+ if (i < z_erofs_gbuf_count && tmp_pages) {
+ for (j = 0; j < nrpages; ++j)
+ if (tmp_pages[j] && (j >= gbuf->nrpages ||
+ tmp_pages[j] != gbuf->pages[j]))
+ __free_page(tmp_pages[j]);
+ kfree(tmp_pages);
+ }
+ mutex_unlock(&gbuf_resize_mutex);
+ return i < z_erofs_gbuf_count ? -ENOMEM : 0;
+}
+
+int __init z_erofs_gbuf_init(void)
+{
+ unsigned int i, total = num_possible_cpus();
+
+ if (z_erofs_gbuf_count)
+ total = min(z_erofs_gbuf_count, total);
+ z_erofs_gbuf_count = total;
+
+ /* The last (special) global buffer is the reserved buffer */
+ total += !!z_erofs_rsv_nrpages;
+
+ z_erofs_gbufpool = kcalloc(total, sizeof(*z_erofs_gbufpool),
+ GFP_KERNEL);
+ if (!z_erofs_gbufpool)
+ return -ENOMEM;
+
+ if (z_erofs_rsv_nrpages) {
+ z_erofs_rsvbuf = &z_erofs_gbufpool[total - 1];
+ z_erofs_rsvbuf->pages = kcalloc(z_erofs_rsv_nrpages,
+ sizeof(*z_erofs_rsvbuf->pages), GFP_KERNEL);
+ if (!z_erofs_rsvbuf->pages) {
+ z_erofs_rsvbuf = NULL;
+ z_erofs_rsv_nrpages = 0;
+ }
+ }
+ for (i = 0; i < total; ++i)
+ spin_lock_init(&z_erofs_gbufpool[i].lock);
+ return 0;
+}
+
+void z_erofs_gbuf_exit(void)
+{
+ int i, j;
+
+ for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) {
+ struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i];
+
+ if (gbuf->ptr) {
+ vunmap(gbuf->ptr);
+ gbuf->ptr = NULL;
+ }
+
+ if (!gbuf->pages)
+ continue;
+
+ for (j = 0; j < gbuf->nrpages; ++j)
+ if (gbuf->pages[j])
+ put_page(gbuf->pages[j]);
+ kfree(gbuf->pages);
+ gbuf->pages = NULL;
+ }
+ kfree(z_erofs_gbufpool);
+}
+
+struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv)
+{
+ struct page *page = *pagepool;
+
+ if (page) {
+ *pagepool = (struct page *)page_private(page);
+ } else if (tryrsv && z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages) {
+ spin_lock(&z_erofs_rsvbuf->lock);
+ if (z_erofs_rsvbuf->nrpages)
+ page = z_erofs_rsvbuf->pages[--z_erofs_rsvbuf->nrpages];
+ spin_unlock(&z_erofs_rsvbuf->lock);
+ }
+ if (!page)
+ page = alloc_page(gfp);
+ DBG_BUGON(page && page_ref_count(page) != 1);
+ return page;
+}
+
+void erofs_release_pages(struct page **pagepool)
+{
+ while (*pagepool) {
+ struct page *page = *pagepool;
+
+ *pagepool = (struct page *)page_private(page);
+ /* try to fill reserved global pool first */
+ if (z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages <
+ z_erofs_rsv_nrpages) {
+ spin_lock(&z_erofs_rsvbuf->lock);
+ if (z_erofs_rsvbuf->nrpages < z_erofs_rsv_nrpages) {
+ z_erofs_rsvbuf->pages[z_erofs_rsvbuf->nrpages++]
+ = page;
+ spin_unlock(&z_erofs_rsvbuf->lock);
+ continue;
+ }
+ spin_unlock(&z_erofs_rsvbuf->lock);
+ }
+ put_page(page);
+ }
+}
+
+void erofs_shrinker_register(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ mutex_init(&sbi->umount_mutex);
+
+ spin_lock(&erofs_sb_list_lock);
+ list_add(&sbi->list, &erofs_sb_list);
+ spin_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_shrinker_unregister(struct super_block *sb)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+
+ mutex_lock(&sbi->umount_mutex);
+ while (!xa_empty(&sbi->managed_pslots)) {
+ z_erofs_shrink_scan(sbi, ~0UL);
+ cond_resched();
+ }
+ spin_lock(&erofs_sb_list_lock);
+ list_del(&sbi->list);
+ spin_unlock(&erofs_sb_list_lock);
+ mutex_unlock(&sbi->umount_mutex);
+}
+
+static unsigned long erofs_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ return atomic_long_read(&erofs_global_shrink_cnt) ?: SHRINK_EMPTY;
+}
+
+static unsigned long erofs_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct erofs_sb_info *sbi;
+ struct list_head *p;
+
+ unsigned long nr = sc->nr_to_scan;
+ unsigned int run_no;
+ unsigned long freed = 0;
+
+ spin_lock(&erofs_sb_list_lock);
+ do {
+ run_no = ++shrinker_run_no;
+ } while (run_no == 0);
+
+ /* Iterate over all mounted superblocks and try to shrink them */
+ p = erofs_sb_list.next;
+ while (p != &erofs_sb_list) {
+ sbi = list_entry(p, struct erofs_sb_info, list);
+
+ /*
+ * We move the ones we do to the end of the list, so we stop
+ * when we see one we have already done.
+ */
+ if (sbi->shrinker_run_no == run_no)
+ break;
+
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+
+ spin_unlock(&erofs_sb_list_lock);
+ sbi->shrinker_run_no = run_no;
+ freed += z_erofs_shrink_scan(sbi, nr - freed);
+ spin_lock(&erofs_sb_list_lock);
+ /* Get the next list element before we move this one */
+ p = p->next;
+
+ /*
+ * Move this one to the end of the list to provide some
+ * fairness.
+ */
+ list_move_tail(&sbi->list, &erofs_sb_list);
+ mutex_unlock(&sbi->umount_mutex);
+
+ if (freed >= nr)
+ break;
+ }
+ spin_unlock(&erofs_sb_list_lock);
+ return freed;
+}
+
+int __init erofs_init_shrinker(void)
+{
+ erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker");
+ if (!erofs_shrinker_info)
+ return -ENOMEM;
+
+ erofs_shrinker_info->count_objects = erofs_shrink_count;
+ erofs_shrinker_info->scan_objects = erofs_shrink_scan;
+ shrinker_register(erofs_shrinker_info);
+ return 0;
+}
+
+void erofs_exit_shrinker(void)
+{
+ shrinker_free(erofs_shrinker_info);
+}