summaryrefslogtreecommitdiff
path: root/fs/erofs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/erofs')
-rw-r--r--fs/erofs/Kconfig124
-rw-r--r--fs/erofs/Makefile8
-rw-r--r--fs/erofs/compress.h103
-rw-r--r--fs/erofs/data.c444
-rw-r--r--fs/erofs/decompressor.c498
-rw-r--r--fs/erofs/decompressor_crypto.c182
-rw-r--r--fs/erofs/decompressor_deflate.c204
-rw-r--r--fs/erofs/decompressor_lzma.c170
-rw-r--r--fs/erofs/decompressor_zstd.c220
-rw-r--r--fs/erofs/dir.c99
-rw-r--r--fs/erofs/erofs_fs.h380
-rw-r--r--fs/erofs/fileio.c191
-rw-r--r--fs/erofs/fscache.c464
-rw-r--r--fs/erofs/inode.c449
-rw-r--r--fs/erofs/internal.h479
-rw-r--r--fs/erofs/namei.c71
-rw-r--r--fs/erofs/pcpubuf.c148
-rw-r--r--fs/erofs/super.c868
-rw-r--r--fs/erofs/sysfs.c126
-rw-r--r--fs/erofs/tagptr.h107
-rw-r--r--fs/erofs/utils.c292
-rw-r--r--fs/erofs/xattr.c760
-rw-r--r--fs/erofs/xattr.h54
-rw-r--r--fs/erofs/zdata.c1922
-rw-r--r--fs/erofs/zdata.h178
-rw-r--r--fs/erofs/zmap.c931
-rw-r--r--fs/erofs/zutil.c317
27 files changed, 5225 insertions, 4564 deletions
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 85490370e0ca..d81f3318417d 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,8 +3,18 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select CACHEFILES if EROFS_FS_ONDEMAND
+ select CRC32
+ select CRYPTO if EROFS_FS_ZIP_ACCEL
+ select CRYPTO_DEFLATE if EROFS_FS_ZIP_ACCEL
select FS_IOMAP
- select LIBCRC32C
+ select LZ4_DECOMPRESS if EROFS_FS_ZIP
+ select NETFS_SUPPORT if EROFS_FS_ONDEMAND
+ select XXHASH if EROFS_FS_XATTR
+ select XZ_DEC if EROFS_FS_ZIP_LZMA
+ select XZ_DEC_MICROLZMA if EROFS_FS_ZIP_LZMA
+ select ZLIB_INFLATE if EROFS_FS_ZIP_DEFLATE
+ select ZSTD_DECOMPRESS if EROFS_FS_ZIP_ZSTD
help
EROFS (Enhanced Read-Only File System) is a lightweight read-only
file system with modern designs (e.g. no buffer heads, inline
@@ -13,15 +23,15 @@ config EROFS_FS
smartphones with Android OS, LiveCDs and high-density hosts with
numerous containers;
- It also provides fixed-sized output compression support in order to
- improve storage density as well as keep relatively higher compression
- ratios and implements in-place decompression to reuse the file page
- for compressed data temporarily with proper strategies, which is
- quite useful to ensure guaranteed end-to-end runtime decompression
- performance under extremely memory pressure without extra cost.
+ It also provides transparent compression and deduplication support to
+ improve storage density and maintain relatively high compression
+ ratios, and it implements in-place decompression to temporarily reuse
+ page cache for compressed data using proper strategies, which is
+ quite useful for ensuring guaranteed end-to-end runtime decompression
+ performance under extreme memory pressure without extra cost.
See the documentation at <file:Documentation/filesystems/erofs.rst>
- for more details.
+ and the web pages at <https://erofs.docs.kernel.org> for more details.
If unsure, say N.
@@ -73,38 +83,114 @@ config EROFS_FS_SECURITY
If you are not using a security module, say N.
+config EROFS_FS_BACKED_BY_FILE
+ bool "File-backed EROFS filesystem support"
+ depends on EROFS_FS
+ default y
+ help
+ This allows EROFS to use filesystem image files directly, without
+ the intercession of loopback block devices or likewise. It is
+ particularly useful for container images with numerous blobs and
+ other sandboxes, where loop devices behave intricately. It can also
+ be used to simplify error-prone lifetime management of unnecessary
+ virtual block devices.
+
+ Note that this feature, along with ongoing fanotify pre-content
+ hooks, will eventually replace "EROFS over fscache."
+
+ If you don't want to enable this feature, say N.
+
config EROFS_FS_ZIP
bool "EROFS Data Compression Support"
depends on EROFS_FS
- select LZ4_DECOMPRESS
default y
help
- Enable fixed-sized output compression for EROFS.
+ Enable transparent compression support for EROFS file systems.
If you don't want to enable compression feature, say N.
config EROFS_FS_ZIP_LZMA
bool "EROFS LZMA compressed data support"
depends on EROFS_FS_ZIP
- select XZ_DEC
- select XZ_DEC_MICROLZMA
help
Saying Y here includes support for reading EROFS file systems
- containing LZMA compressed data, specifically called microLZMA. it
- gives better compression ratios than the LZ4 algorithm, at the
+ containing LZMA compressed data, specifically called microLZMA. It
+ gives better compression ratios than the default LZ4 format, at the
expense of more CPU overhead.
- LZMA support is an experimental feature for now and so most file
- systems will be readable without selecting this option.
+ If unsure, say N.
+
+config EROFS_FS_ZIP_DEFLATE
+ bool "EROFS DEFLATE compressed data support"
+ depends on EROFS_FS_ZIP
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing DEFLATE compressed data. It gives better compression
+ ratios than the default LZ4 format, while it costs more CPU
+ overhead.
+
+ DEFLATE support is an experimental feature for now and so most
+ file systems will be readable without selecting this option.
+
+ If unsure, say N.
+
+config EROFS_FS_ZIP_ZSTD
+ bool "EROFS Zstandard compressed data support"
+ depends on EROFS_FS_ZIP
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing Zstandard compressed data. It gives better compression
+ ratios than the default LZ4 format, while it costs more CPU
+ overhead.
+
+ Zstandard support is an experimental feature for now and so most
+ file systems will be readable without selecting this option.
+
+ If unsure, say N.
+
+config EROFS_FS_ZIP_ACCEL
+ bool "EROFS hardware decompression support"
+ depends on EROFS_FS_ZIP
+ help
+ Saying Y here includes hardware accelerator support for reading
+ EROFS file systems containing compressed data. It gives better
+ decompression speed than the software-implemented decompression, and
+ it costs lower CPU overhead.
+
+ Hardware accelerator support is an experimental feature for now and
+ file systems are still readable without selecting this option.
If unsure, say N.
config EROFS_FS_ONDEMAND
- bool "EROFS fscache-based on-demand read support"
- depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
- default n
+ bool "EROFS fscache-based on-demand read support (deprecated)"
+ depends on EROFS_FS
+ select FSCACHE
+ select CACHEFILES_ONDEMAND
help
This permits EROFS to use fscache-backed data blobs with on-demand
read support.
+ It is now deprecated and scheduled to be removed from the kernel
+ after fanotify pre-content hooks are landed.
+
+ If unsure, say N.
+
+config EROFS_FS_PCPU_KTHREAD
+ bool "EROFS per-cpu decompression kthread workers"
+ depends on EROFS_FS_ZIP
+ help
+ Saying Y here enables per-CPU kthread workers pool to carry out
+ async decompression for low latencies on some architectures.
+
+ If unsure, say N.
+
+config EROFS_FS_PCPU_KTHREAD_HIPRI
+ bool "EROFS high priority per-CPU kthread workers"
+ depends on EROFS_FS_ZIP && EROFS_FS_PCPU_KTHREAD
+ default y
+ help
+ This permits EROFS to configure per-CPU kthread workers to run
+ at higher priority.
+
If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 99bbc597a3e9..549abc424763 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,8 +1,12 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o
+erofs-objs := super.o inode.o data.o namei.o dir.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
+erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
+erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
+erofs-$(CONFIG_EROFS_FS_ZIP_ACCEL) += decompressor_crypto.o
+erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 26fa170090b8..84c8e52581f4 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -11,57 +11,37 @@
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
-
+ unsigned int inpages, outpages;
unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
- /* indicate the algorithm will be used for decompression */
- unsigned int alg;
+ unsigned int alg; /* the algorithm for decompression */
bool inplace_io, partial_decoding, fillgaps;
+ gfp_t gfp; /* allocation flags for extra temporary buffers */
};
struct z_erofs_decompressor {
- int (*decompress)(struct z_erofs_decompress_req *rq,
- struct page **pagepool);
+ int (*config)(struct super_block *sb, struct erofs_super_block *dsb,
+ void *data, int size);
+ const char *(*decompress)(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
+ int (*init)(void);
+ void (*exit)(void);
char *name;
};
-/* some special page->private (unsigned long, see below) */
#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2)
-#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2)
-
-/*
- * For all pages in a pcluster, page->private should be one of
- * Type Last 2bits page->private
- * short-lived page 00 Z_EROFS_SHORTLIVED_PAGE
- * preallocated page (tryalloc) 00 Z_EROFS_PREALLOCATED_PAGE
- * cached/managed page 00 pointer to z_erofs_pcluster
- * online page (file-backed, 01/10/11 sub-index << 2 | count
- * some pages can be used for inplace I/O)
- *
- * page->mapping should be one of
- * Type page->mapping
- * short-lived page NULL
- * preallocated page NULL
- * cached/managed page non-NULL or NULL (invalidated/truncated page)
- * online page non-NULL
- *
- * For all managed pages, PG_private should be set with 1 extra refcount,
- * which is used for page reclaim / migration.
- */
+#define Z_EROFS_PREALLOCATED_FOLIO ((void *)(-2UL << 2))
/*
- * short-lived pages are pages directly from buddy system with specific
- * page->private (no need to set PagePrivate since these are non-LRU /
- * non-movable pages and bypass reclaim / migration code).
+ * Currently, short-lived pages are pages directly from buddy system
+ * with specific page->private (Z_EROFS_SHORTLIVED_PAGE).
+ * In the future world of Memdescs, it should be type 0 (Misc) memory
+ * which type can be checked with a new helper.
*/
static inline bool z_erofs_is_shortlived_page(struct page *page)
{
- if (page->private != Z_EROFS_SHORTLIVED_PAGE)
- return false;
-
- DBG_BUGON(page->mapping);
- return true;
+ return page->private == Z_EROFS_SHORTLIVED_PAGE;
}
static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
@@ -69,30 +49,41 @@ static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
{
if (!z_erofs_is_shortlived_page(page))
return false;
-
- /* short-lived pages should not be used by others at the same time */
- if (page_ref_count(page) > 1) {
- put_page(page);
- } else {
- /* follow the pcluster rule above. */
- erofs_pagepool_add(pagepool, page);
- }
+ erofs_pagepool_add(pagepool, page);
return true;
}
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
- struct page *page)
-{
- return page->mapping == MNGD_MAPPING(sbi);
-}
+extern const struct z_erofs_decompressor z_erofs_lzma_decomp;
+extern const struct z_erofs_decompressor z_erofs_deflate_decomp;
+extern const struct z_erofs_decompressor z_erofs_zstd_decomp;
+extern const struct z_erofs_decompressor *z_erofs_decomp[];
-int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
- unsigned int padbufsize);
-int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool);
+struct z_erofs_stream_dctx {
+ struct z_erofs_decompress_req *rq;
+ int no, ni; /* the current {en,de}coded page # */
-/* prototypes for specific algorithms */
-int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool);
+ unsigned int avail_out; /* remaining bytes in the decoded buffer */
+ unsigned int inbuf_pos, inbuf_sz;
+ /* current status of the encoded buffer */
+ u8 *kin, *kout; /* buffer mapped pointers */
+ void *bounce; /* bounce buffer for inplace I/Os */
+ bool bounced; /* is the bounce buffer used now? */
+};
+
+const char *z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx,
+ void **dst, void **src, struct page **pgpl);
+const char *z_erofs_fixup_insize(struct z_erofs_decompress_req *rq,
+ const char *padbuf, unsigned int padbufsize);
+int __init z_erofs_init_decompressor(void);
+void z_erofs_exit_decompressor(void);
+int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl);
+int z_erofs_crypto_enable_engine(const char *name, int len);
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+void z_erofs_crypto_disable_all_engines(void);
+int z_erofs_crypto_show_engines(char *buf, int size, char sep);
+#else
+static inline void z_erofs_crypto_disable_all_engines(void) {}
+static inline int z_erofs_crypto_show_engines(char *buf, int size, char sep) { return 0; }
+#endif
#endif
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index f57f921683d7..bb13c4cb8455 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -5,17 +5,15 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
-#include <linux/prefetch.h>
#include <linux/sched/mm.h>
-#include <linux/dax.h>
#include <trace/events/erofs.h>
void erofs_unmap_metabuf(struct erofs_buf *buf)
{
- if (buf->kmap_type == EROFS_KMAP)
- kunmap_local(buf->base);
+ if (!buf->base)
+ return;
+ kunmap_local(buf->base);
buf->base = NULL;
- buf->kmap_type = EROFS_NO_KMAP;
}
void erofs_put_metabuf(struct erofs_buf *buf)
@@ -23,124 +21,103 @@ void erofs_put_metabuf(struct erofs_buf *buf)
if (!buf->page)
return;
erofs_unmap_metabuf(buf);
- put_page(buf->page);
+ folio_put(page_folio(buf->page));
buf->page = NULL;
}
-void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
- erofs_blk_t blkaddr, enum erofs_kmap_type type)
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap)
{
- struct address_space *const mapping = inode->i_mapping;
- erofs_off_t offset = blknr_to_addr(blkaddr);
- pgoff_t index = offset >> PAGE_SHIFT;
- struct page *page = buf->page;
- struct folio *folio;
- unsigned int nofs_flag;
-
- if (!page || page->index != index) {
- erofs_put_metabuf(buf);
+ pgoff_t index = (buf->off + offset) >> PAGE_SHIFT;
+ struct folio *folio = NULL;
- nofs_flag = memalloc_nofs_save();
- folio = read_cache_folio(mapping, index, NULL, NULL);
- memalloc_nofs_restore(nofs_flag);
+ if (buf->page) {
+ folio = page_folio(buf->page);
+ if (folio_file_page(folio, index) != buf->page)
+ erofs_unmap_metabuf(buf);
+ }
+ if (!folio || !folio_contains(folio, index)) {
+ erofs_put_metabuf(buf);
+ folio = read_mapping_folio(buf->mapping, index, buf->file);
if (IS_ERR(folio))
return folio;
-
- /* should already be PageUptodate, no need to lock page */
- page = folio_file_page(folio, index);
- buf->page = page;
- }
- if (buf->kmap_type == EROFS_NO_KMAP) {
- if (type == EROFS_KMAP)
- buf->base = kmap_local_page(page);
- buf->kmap_type = type;
- } else if (buf->kmap_type != type) {
- DBG_BUGON(1);
- return ERR_PTR(-EFAULT);
}
- if (type == EROFS_NO_KMAP)
+ buf->page = folio_file_page(folio, index);
+ if (!need_kmap)
return NULL;
+ if (!buf->base)
+ buf->base = kmap_local_page(buf->page);
return buf->base + (offset & ~PAGE_MASK);
}
-void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_blk_t blkaddr, enum erofs_kmap_type type)
+int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ bool in_metabox)
{
- if (erofs_is_fscache_mode(sb))
- return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode,
- blkaddr, type);
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
- return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type);
+ buf->file = NULL;
+ if (in_metabox) {
+ if (unlikely(!sbi->metabox_inode))
+ return -EFSCORRUPTED;
+ buf->mapping = sbi->metabox_inode->i_mapping;
+ return 0;
+ }
+ buf->off = sbi->dif0.fsoff;
+ if (erofs_is_fileio_mode(sbi)) {
+ buf->file = sbi->dif0.file; /* some fs like FUSE needs it */
+ buf->mapping = buf->file->f_mapping;
+ } else if (erofs_is_fscache_mode(sb))
+ buf->mapping = sbi->dif0.fscache->inode->i_mapping;
+ else
+ buf->mapping = sb->s_bdev->bd_mapping;
+ return 0;
}
-static int erofs_map_blocks_flatmode(struct inode *inode,
- struct erofs_map_blocks *map,
- int flags)
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_off_t offset, bool in_metabox)
{
- erofs_blk_t nblocks, lastblk;
- u64 offset = map->m_la;
- struct erofs_inode *vi = EROFS_I(inode);
- bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
-
- nblocks = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
- lastblk = nblocks - tailendpacking;
-
- /* there is no hole in flatmode */
- map->m_flags = EROFS_MAP_MAPPED;
- if (offset < blknr_to_addr(lastblk)) {
- map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la;
- map->m_plen = blknr_to_addr(lastblk) - offset;
- } else if (tailendpacking) {
- /* 2 - inode inline B: inode, [xattrs], inline last blk... */
- struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
-
- map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize +
- vi->xattr_isize + erofs_blkoff(map->m_la);
- map->m_plen = inode->i_size - offset;
+ int err;
- /* inline data should be located in the same meta block */
- if (erofs_blkoff(map->m_pa) + map->m_plen > EROFS_BLKSIZ) {
- erofs_err(inode->i_sb,
- "inline data cross block boundary @ nid %llu",
- vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- map->m_flags |= EROFS_MAP_META;
- } else {
- erofs_err(inode->i_sb,
- "internal error @ nid: %llu (size %llu), m_la 0x%llx",
- vi->nid, inode->i_size, map->m_la);
- DBG_BUGON(1);
- return -EIO;
- }
- return 0;
+ err = erofs_init_metabuf(buf, sb, in_metabox);
+ if (err)
+ return ERR_PTR(err);
+ return erofs_bread(buf, offset, true);
}
-int erofs_map_blocks(struct inode *inode,
- struct erofs_map_blocks *map, int flags)
+int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
{
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct super_block *sb = inode->i_sb;
+ unsigned int unit, blksz = sb->s_blocksize;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_inode_chunk_index *idx;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- u64 chunknr;
- unsigned int unit;
+ erofs_blk_t startblk, addrmask;
+ bool tailpacking;
erofs_off_t pos;
- void *kaddr;
+ u64 chunknr;
int err = 0;
- trace_erofs_map_blocks_enter(inode, map, flags);
+ trace_erofs_map_blocks_enter(inode, map, 0);
map->m_deviceid = 0;
- if (map->m_la >= inode->i_size) {
- /* leave out-of-bound access unmapped */
- map->m_flags = 0;
- map->m_plen = 0;
+ map->m_flags = 0;
+ if (map->m_la >= inode->i_size)
goto out;
- }
if (vi->datalayout != EROFS_INODE_CHUNK_BASED) {
- err = erofs_map_blocks_flatmode(inode, map, flags);
+ tailpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
+ if (!tailpacking && vi->startblk == EROFS_NULL_ADDR)
+ goto out;
+ pos = erofs_pos(sb, erofs_iblks(inode) - tailpacking);
+
+ map->m_flags = EROFS_MAP_MAPPED;
+ if (map->m_la < pos) {
+ map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la;
+ map->m_llen = pos - map->m_la;
+ } else {
+ map->m_pa = erofs_iloc(inode) + vi->inode_isize +
+ vi->xattr_isize + erofs_blkoff(sb, map->m_la);
+ map->m_llen = inode->i_size - map->m_la;
+ map->m_flags |= EROFS_MAP_META;
+ }
goto out;
}
@@ -150,64 +127,70 @@ int erofs_map_blocks(struct inode *inode,
unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */
chunknr = map->m_la >> vi->chunkbits;
- pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
+ pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
- kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- err = PTR_ERR(kaddr);
+ idx = erofs_read_metabuf(&buf, sb, pos, erofs_inode_in_metabox(inode));
+ if (IS_ERR(idx)) {
+ err = PTR_ERR(idx);
goto out;
}
map->m_la = chunknr << vi->chunkbits;
- map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
- roundup(inode->i_size - map->m_la, EROFS_BLKSIZ));
-
- /* handle block map */
- if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
- __le32 *blkaddr = kaddr + erofs_blkoff(pos);
-
- if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
- map->m_flags = 0;
- } else {
- map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr));
+ map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits,
+ round_up(inode->i_size - map->m_la, blksz));
+ if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) {
+ addrmask = (vi->chunkformat & EROFS_CHUNK_FORMAT_48BIT) ?
+ BIT_ULL(48) - 1 : BIT_ULL(32) - 1;
+ startblk = (((u64)le16_to_cpu(idx->startblk_hi) << 32) |
+ le32_to_cpu(idx->startblk_lo)) & addrmask;
+ if ((startblk ^ EROFS_NULL_ADDR) & addrmask) {
+ map->m_deviceid = le16_to_cpu(idx->device_id) &
+ EROFS_SB(sb)->device_id_mask;
+ map->m_pa = erofs_pos(sb, startblk);
+ map->m_flags = EROFS_MAP_MAPPED;
+ }
+ } else {
+ startblk = le32_to_cpu(*(__le32 *)idx);
+ if (startblk != (u32)EROFS_NULL_ADDR) {
+ map->m_pa = erofs_pos(sb, startblk);
map->m_flags = EROFS_MAP_MAPPED;
}
- goto out_unlock;
- }
- /* parse chunk indexes */
- idx = kaddr + erofs_blkoff(pos);
- switch (le32_to_cpu(idx->blkaddr)) {
- case EROFS_NULL_ADDR:
- map->m_flags = 0;
- break;
- default:
- map->m_deviceid = le16_to_cpu(idx->device_id) &
- EROFS_SB(sb)->device_id_mask;
- map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
- map->m_flags = EROFS_MAP_MAPPED;
- break;
}
-out_unlock:
erofs_put_metabuf(&buf);
out:
- if (!err)
- map->m_llen = map->m_plen;
- trace_erofs_map_blocks_exit(inode, map, flags, 0);
+ if (!err) {
+ map->m_plen = map->m_llen;
+ /* inline data should be located in the same meta block */
+ if ((map->m_flags & EROFS_MAP_META) &&
+ erofs_blkoff(sb, map->m_pa) + map->m_plen > blksz) {
+ erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ }
+ trace_erofs_map_blocks_exit(inode, map, 0, err);
return err;
}
+static void erofs_fill_from_devinfo(struct erofs_map_dev *map,
+ struct super_block *sb, struct erofs_device_info *dif)
+{
+ map->m_sb = sb;
+ map->m_dif = dif;
+ map->m_bdev = NULL;
+ if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode))
+ map->m_bdev = file_bdev(dif->file);
+}
+
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
{
struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
struct erofs_device_info *dif;
+ erofs_off_t startoff;
int id;
- /* primary device by default */
- map->m_bdev = sb->s_bdev;
- map->m_daxdev = EROFS_SB(sb)->dax_dev;
- map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
- map->m_fscache = EROFS_SB(sb)->s_fscache;
-
+ erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0);
+ map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */
if (map->m_deviceid) {
down_read(&devs->rwsem);
dif = idr_find(&devs->tree, map->m_deviceid - 1);
@@ -215,28 +198,24 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
up_read(&devs->rwsem);
return -ENODEV;
}
- map->m_bdev = dif->bdev;
- map->m_daxdev = dif->dax_dev;
- map->m_dax_part_off = dif->dax_part_off;
- map->m_fscache = dif->fscache;
+ if (devs->flatdev) {
+ map->m_pa += erofs_pos(sb, dif->uniaddr);
+ up_read(&devs->rwsem);
+ return 0;
+ }
+ erofs_fill_from_devinfo(map, sb, dif);
up_read(&devs->rwsem);
- } else if (devs->extra_devices) {
+ } else if (devs->extra_devices && !devs->flatdev) {
down_read(&devs->rwsem);
idr_for_each_entry(&devs->tree, dif, id) {
- erofs_off_t startoff, length;
-
- if (!dif->mapped_blkaddr)
+ if (!dif->uniaddr)
continue;
- startoff = blknr_to_addr(dif->mapped_blkaddr);
- length = blknr_to_addr(dif->blocks);
+ startoff = erofs_pos(sb, dif->uniaddr);
if (map->m_pa >= startoff &&
- map->m_pa < startoff + length) {
+ map->m_pa < startoff + erofs_pos(sb, dif->blocks)) {
map->m_pa -= startoff;
- map->m_bdev = dif->bdev;
- map->m_daxdev = dif->dax_dev;
- map->m_dax_part_off = dif->dax_part_off;
- map->m_fscache = dif->fscache;
+ erofs_fill_from_devinfo(map, sb, dif);
break;
}
}
@@ -245,61 +224,103 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
return 0;
}
+/*
+ * bit 30: I/O error occurred on this folio
+ * bit 29: CPU has dirty data in D-cache (needs aliasing handling);
+ * bit 0 - 29: remaining parts to complete this folio
+ */
+#define EROFS_ONLINEFOLIO_EIO 30
+#define EROFS_ONLINEFOLIO_DIRTY 29
+
+void erofs_onlinefolio_init(struct folio *folio)
+{
+ union {
+ atomic_t o;
+ void *v;
+ } u = { .o = ATOMIC_INIT(1) };
+
+ folio->private = u.v; /* valid only if file-backed folio is locked */
+}
+
+void erofs_onlinefolio_split(struct folio *folio)
+{
+ atomic_inc((atomic_t *)&folio->private);
+}
+
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
+{
+ int orig, v;
+
+ do {
+ orig = atomic_read((atomic_t *)&folio->private);
+ DBG_BUGON(orig <= 0);
+ v = dirty << EROFS_ONLINEFOLIO_DIRTY;
+ v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO);
+ } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
+
+ if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1))
+ return;
+ folio->private = 0;
+ if (v & BIT(EROFS_ONLINEFOLIO_DIRTY))
+ flush_dcache_folio(folio);
+ folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO)));
+}
+
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
int ret;
+ struct super_block *sb = inode->i_sb;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
map.m_la = offset;
map.m_llen = length;
-
- ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ ret = erofs_map_blocks(inode, &map);
if (ret < 0)
return ret;
- mdev = (struct erofs_map_dev) {
- .m_deviceid = map.m_deviceid,
- .m_pa = map.m_pa,
- };
- ret = erofs_map_dev(inode->i_sb, &mdev);
- if (ret)
- return ret;
-
iomap->offset = map.m_la;
- if (flags & IOMAP_DAX)
- iomap->dax_dev = mdev.m_daxdev;
- else
- iomap->bdev = mdev.m_bdev;
iomap->length = map.m_llen;
iomap->flags = 0;
iomap->private = NULL;
-
+ iomap->addr = IOMAP_NULL_ADDR;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
iomap->type = IOMAP_HOLE;
- iomap->addr = IOMAP_NULL_ADDR;
- if (!iomap->length)
- iomap->length = length;
return 0;
}
+ if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(inode)) {
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ return ret;
+
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = mdev.m_dif->dax_dev;
+ else
+ iomap->bdev = mdev.m_bdev;
+ iomap->addr = mdev.m_dif->fsoff + mdev.m_pa;
+ if (flags & IOMAP_DAX)
+ iomap->addr += mdev.m_dif->dax_part_off;
+ }
+
if (map.m_flags & EROFS_MAP_META) {
void *ptr;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
iomap->type = IOMAP_INLINE;
- ptr = erofs_read_metabuf(&buf, inode->i_sb,
- erofs_blknr(mdev.m_pa), EROFS_KMAP);
+ ptr = erofs_read_metabuf(&buf, sb, map.m_pa,
+ erofs_inode_in_metabox(inode));
if (IS_ERR(ptr))
return PTR_ERR(ptr);
- iomap->inline_data = ptr + erofs_blkoff(mdev.m_pa);
+ iomap->inline_data = ptr;
iomap->private = buf.base;
} else {
iomap->type = IOMAP_MAPPED;
- iomap->addr = mdev.m_pa;
- if (flags & IOMAP_DAX)
- iomap->addr += mdev.m_dax_part_off;
}
return 0;
}
@@ -313,7 +334,6 @@ static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
struct erofs_buf buf = {
.page = kmap_to_page(ptr),
.base = ptr,
- .kmap_type = EROFS_KMAP,
};
DBG_BUGON(iomap->type != IOMAP_INLINE);
@@ -349,12 +369,18 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
static int erofs_read_folio(struct file *file, struct folio *folio)
{
- return iomap_read_folio(folio, &erofs_iomap_ops);
+ trace_erofs_read_folio(folio, true);
+
+ iomap_bio_read_folio(folio, &erofs_iomap_ops);
+ return 0;
}
static void erofs_readahead(struct readahead_control *rac)
{
- return iomap_readahead(rac, &erofs_iomap_ops);
+ trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
+ readahead_count(rac), true);
+
+ iomap_bio_readahead(rac, &erofs_iomap_ops);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
@@ -374,27 +400,14 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (IS_DAX(inode))
return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
#endif
- if (iocb->ki_flags & IOCB_DIRECT) {
- struct block_device *bdev = inode->i_sb->s_bdev;
- unsigned int blksize_mask;
-
- if (bdev)
- blksize_mask = bdev_logical_block_size(bdev) - 1;
- else
- blksize_mask = (1 << inode->i_blkbits) - 1;
-
- if ((iocb->ki_pos | iov_iter_count(to) |
- iov_iter_alignment(to)) & blksize_mask)
- return -EINVAL;
-
+ if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
NULL, 0, NULL, 0);
- }
return filemap_read(iocb, to, 0);
}
/* for uncompressed (aligned) files and raw access for other files */
-const struct address_space_operations erofs_raw_access_aops = {
+const struct address_space_operations erofs_aops = {
.read_folio = erofs_read_folio,
.readahead = erofs_readahead,
.bmap = erofs_bmap,
@@ -405,14 +418,14 @@ const struct address_space_operations erofs_raw_access_aops = {
#ifdef CONFIG_FS_DAX
static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size)
+ unsigned int order)
{
- return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
+ return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops);
}
static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
{
- return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
+ return erofs_dax_huge_fault(vmf, 0);
}
static const struct vm_operations_struct erofs_dax_vm_ops = {
@@ -420,25 +433,54 @@ static const struct vm_operations_struct erofs_dax_vm_ops = {
.huge_fault = erofs_dax_huge_fault,
};
-static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int erofs_file_mmap_prepare(struct vm_area_desc *desc)
{
- if (!IS_DAX(file_inode(file)))
- return generic_file_readonly_mmap(file, vma);
+ if (!IS_DAX(file_inode(desc->file)))
+ return generic_file_readonly_mmap_prepare(desc);
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ if ((desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE))
return -EINVAL;
- vma->vm_ops = &erofs_dax_vm_ops;
- vma->vm_flags |= VM_HUGEPAGE;
+ desc->vm_ops = &erofs_dax_vm_ops;
+ desc->vm_flags |= VM_HUGEPAGE;
return 0;
}
#else
-#define erofs_file_mmap generic_file_readonly_mmap
+#define erofs_file_mmap_prepare generic_file_readonly_mmap_prepare
+#endif
+
+static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ const struct iomap_ops *ops = &erofs_iomap_ops;
+
+ if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+#ifdef CONFIG_EROFS_FS_ZIP
+ ops = &z_erofs_iomap_report_ops;
+#else
+ return generic_file_llseek(file, offset, whence);
#endif
+ if (whence == SEEK_HOLE)
+ offset = iomap_seek_hole(inode, offset, ops);
+ else if (whence == SEEK_DATA)
+ offset = iomap_seek_data(inode, offset, ops);
+ else
+ return generic_file_llseek(file, offset, whence);
+
+ if (offset < 0)
+ return offset;
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+}
+
const struct file_operations erofs_file_fops = {
- .llseek = generic_file_llseek,
+ .llseek = erofs_file_llseek,
.read_iter = erofs_file_read_iter,
- .mmap = erofs_file_mmap,
- .splice_read = generic_file_splice_read,
+ .unlocked_ioctl = erofs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = erofs_compat_ioctl,
+#endif
+ .mmap_prepare = erofs_file_mmap_prepare,
+ .get_unmapped_area = thp_get_unmapped_area,
+ .splice_read = filemap_splice_read,
};
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 51b7ac7166d9..d5d090276391 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -2,33 +2,18 @@
/*
* Copyright (C) 2019 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2024 Alibaba Cloud
*/
#include "compress.h"
-#include <linux/module.h>
#include <linux/lz4.h>
-#ifndef LZ4_DISTANCE_MAX /* history window size */
-#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */
-#endif
-
#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
-#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN
-#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
-#endif
-
-struct z_erofs_lz4_decompress_ctx {
- struct z_erofs_decompress_req *rq;
- /* # of encoded, decoded pages */
- unsigned int inpages, outpages;
- /* decoded block total length (used for in-place decompression) */
- unsigned int oend;
-};
-int z_erofs_load_lz4_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lz4_cfgs *lz4, int size)
+static int z_erofs_load_lz4_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct z_erofs_lz4_cfgs *lz4 = data;
u16 distance;
if (lz4) {
@@ -42,7 +27,7 @@ int z_erofs_load_lz4_config(struct super_block *sb,
if (!sbi->lz4.max_pclusterblks) {
sbi->lz4.max_pclusterblks = 1; /* reserved case */
} else if (sbi->lz4.max_pclusterblks >
- Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) {
+ erofs_blknr(sb, Z_EROFS_PCLUSTER_MAX_SIZE)) {
erofs_err(sb, "too large lz4 pclusterblks %u",
sbi->lz4.max_pclusterblks);
return -EINVAL;
@@ -55,17 +40,16 @@ int z_erofs_load_lz4_config(struct super_block *sb,
sbi->lz4.max_distance_pages = distance ?
DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
LZ4_MAX_DISTANCE_PAGES;
- return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
+ return z_erofs_gbuf_growsize(sbi->lz4.max_pclusterblks);
}
/*
* Fill all gaps with bounce pages if it's a sparse page list. Also check if
* all physical pages are consecutive, which can be seen for moderate CR.
*/
-static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
BITS_PER_LONG)] = { 0 };
@@ -75,7 +59,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
unsigned int i, j, top;
top = 0;
- for (i = j = 0; i < ctx->outpages; ++i, ++j) {
+ for (i = j = 0; i < rq->outpages; ++i, ++j) {
struct page *const page = rq->out[i];
struct page *victim;
@@ -110,10 +94,10 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
if (top) {
victim = availables[--top];
- get_page(victim);
} else {
- victim = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ victim = __erofs_allocpage(pagepool, rq->gfp, true);
+ if (!victim)
+ return -ENOMEM;
set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
}
rq->out[i] = victim;
@@ -121,65 +105,72 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
return kaddr ? 1 : 0;
}
-static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
- void *inpage, unsigned int *inputmargin, int *maptype,
- bool may_inplace)
+static void *z_erofs_lz4_handle_overlap(const struct z_erofs_decompress_req *rq,
+ void *inpage, void *out, unsigned int *inputmargin,
+ int *maptype, bool may_inplace)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
- unsigned int omargin, total, i, j;
+ unsigned int oend, omargin, cnt, i;
struct page **in;
- void *src, *tmp;
-
- if (rq->inplace_io) {
- omargin = PAGE_ALIGN(ctx->oend) - ctx->oend;
- if (rq->partial_decoding || !may_inplace ||
- omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize))
- goto docopy;
-
- for (i = 0; i < ctx->inpages; ++i) {
- DBG_BUGON(rq->in[i] == NULL);
- for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j)
- if (rq->out[j] == rq->in[i])
- goto docopy;
+ void *src;
+
+ /*
+ * If in-place I/O isn't used, for example, the bounce compressed cache
+ * can hold data for incomplete read requests. Just map the compressed
+ * buffer as well and decompress directly.
+ */
+ if (!rq->inplace_io) {
+ if (rq->inpages <= 1) {
+ *maptype = 0;
+ return inpage;
}
+ kunmap_local(inpage);
+ src = erofs_vm_map_ram(rq->in, rq->inpages);
+ if (!src)
+ return ERR_PTR(-ENOMEM);
+ *maptype = 1;
+ return src;
}
-
- if (ctx->inpages <= 1) {
- *maptype = 0;
- return inpage;
+ /*
+ * Then, deal with in-place I/Os. The reasons why in-place I/O is useful
+ * are: (1) It minimizes memory footprint during the I/O submission,
+ * which is useful for slow storage (including network devices and
+ * low-end HDDs/eMMCs) but with a lot inflight I/Os; (2) If in-place
+ * decompression can also be applied, it will reuse the unique buffer so
+ * that no extra CPU D-cache is polluted with temporary compressed data
+ * for extreme performance.
+ */
+ oend = rq->pageofs_out + rq->outputsize;
+ omargin = PAGE_ALIGN(oend) - oend;
+ if (!rq->partial_decoding && may_inplace &&
+ omargin >= LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) {
+ for (i = 0; i < rq->inpages; ++i)
+ if (rq->out[rq->outpages - rq->inpages + i] !=
+ rq->in[i])
+ break;
+ if (i >= rq->inpages) {
+ kunmap_local(inpage);
+ *maptype = 3;
+ return out + ((rq->outpages - rq->inpages) << PAGE_SHIFT);
+ }
}
- kunmap_atomic(inpage);
- might_sleep();
- src = erofs_vm_map_ram(rq->in, ctx->inpages);
- if (!src)
- return ERR_PTR(-ENOMEM);
- *maptype = 1;
- return src;
-
-docopy:
- /* Or copy compressed data which can be overlapped to per-CPU buffer */
- in = rq->in;
- src = erofs_get_pcpubuf(ctx->inpages);
+ /*
+ * If in-place decompression can't be applied, copy compressed data that
+ * may potentially overlap during decompression to a per-CPU buffer.
+ */
+ src = z_erofs_get_gbuf(rq->inpages);
if (!src) {
DBG_BUGON(1);
- kunmap_atomic(inpage);
+ kunmap_local(inpage);
return ERR_PTR(-EFAULT);
}
- tmp = src;
- total = rq->inputsize;
- while (total) {
- unsigned int page_copycnt =
- min_t(unsigned int, total, PAGE_SIZE - *inputmargin);
-
+ for (i = 0, in = rq->in; i < rq->inputsize; i += cnt, ++in) {
+ cnt = min_t(u32, rq->inputsize - i, PAGE_SIZE - *inputmargin);
if (!inpage)
- inpage = kmap_atomic(*in);
- memcpy(tmp, inpage + *inputmargin, page_copycnt);
- kunmap_atomic(inpage);
+ inpage = kmap_local_page(*in);
+ memcpy(src + i, inpage + *inputmargin, cnt);
+ kunmap_local(inpage);
inpage = NULL;
- tmp += page_copycnt;
- total -= page_copycnt;
- ++in;
*inputmargin = 0;
}
*maptype = 2;
@@ -187,55 +178,55 @@ docopy:
}
/*
- * Get the exact inputsize with zero_padding feature.
- * - For LZ4, it should work if zero_padding feature is on (5.3+);
- * - For MicroLZMA, it'd be enabled all the time.
+ * Get the exact on-disk size of the compressed data:
+ * - For LZ4, it should apply if the zero_padding feature is on (5.3+);
+ * - For others, zero_padding is enabled all the time.
*/
-int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
- unsigned int padbufsize)
+const char *z_erofs_fixup_insize(struct z_erofs_decompress_req *rq,
+ const char *padbuf, unsigned int padbufsize)
{
const char *padend;
padend = memchr_inv(padbuf, 0, padbufsize);
if (!padend)
- return -EFSCORRUPTED;
+ return "compressed data start not found";
rq->inputsize -= padend - padbuf;
rq->pageofs_in += padend - padbuf;
- return 0;
+ return NULL;
}
-static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
- u8 *out)
+static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
bool support_0padding = false, may_inplace = false;
unsigned int inputmargin;
- u8 *headpage, *src;
+ u8 *out, *headpage, *src;
+ const char *reason;
int ret, maptype;
DBG_BUGON(*rq->in == NULL);
- headpage = kmap_atomic(*rq->in);
+ headpage = kmap_local_page(*rq->in);
/* LZ4 decompression inplace is only safe if zero_padding is enabled */
if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) {
support_0padding = true;
- ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
+ reason = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
min_t(unsigned int, rq->inputsize,
- EROFS_BLKSIZ - rq->pageofs_in));
- if (ret) {
- kunmap_atomic(headpage);
- return ret;
+ rq->sb->s_blocksize - rq->pageofs_in));
+ if (reason) {
+ kunmap_local(headpage);
+ return IS_ERR(reason) ? PTR_ERR(reason) : -EFSCORRUPTED;
}
may_inplace = !((rq->pageofs_in + rq->inputsize) &
- (EROFS_BLKSIZ - 1));
+ (rq->sb->s_blocksize - 1));
}
inputmargin = rq->pageofs_in;
- src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin,
+ src = z_erofs_lz4_handle_overlap(rq, headpage, dst, &inputmargin,
&maptype, may_inplace);
if (IS_ERR(src))
return PTR_ERR(src);
+ out = dst + rq->pageofs_out;
/* legacy format could compress extra data in a pcluster. */
if (rq->partial_decoding || !support_0padding)
ret = LZ4_decompress_safe_partial(src + inputmargin, out,
@@ -245,147 +236,290 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
rq->inputsize, rq->outputsize);
if (ret != rq->outputsize) {
- erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
- ret, rq->inputsize, inputmargin, rq->outputsize);
-
- print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
- 16, 1, src + inputmargin, rq->inputsize, true);
- print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
- 16, 1, out, rq->outputsize, true);
-
if (ret >= 0)
memset(out + ret, 0, rq->outputsize - ret);
- ret = -EIO;
+ ret = -EFSCORRUPTED;
} else {
ret = 0;
}
if (maptype == 0) {
- kunmap_atomic(headpage);
+ kunmap_local(headpage);
} else if (maptype == 1) {
- vm_unmap_ram(src, ctx->inpages);
+ vm_unmap_ram(src, rq->inpages);
} else if (maptype == 2) {
- erofs_put_pcpubuf(src);
- } else {
+ z_erofs_put_gbuf(src);
+ } else if (maptype != 3) {
DBG_BUGON(1);
return -EFAULT;
}
return ret;
}
-static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+static const char *z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
- struct z_erofs_lz4_decompress_ctx ctx;
unsigned int dst_maptype;
void *dst;
int ret;
- ctx.rq = rq;
- ctx.oend = rq->pageofs_out + rq->outputsize;
- ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT;
- ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
-
/* one optimized fast path only for non bigpcluster cases yet */
- if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) {
+ if (rq->inpages == 1 && rq->outpages == 1 && !rq->inplace_io) {
DBG_BUGON(!*rq->out);
- dst = kmap_atomic(*rq->out);
+ dst = kmap_local_page(*rq->out);
dst_maptype = 0;
- goto dstmap_out;
- }
-
- /* general decoding path which can be used for all cases */
- ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- dst = page_address(*rq->out);
- dst_maptype = 1;
} else {
- dst = erofs_vm_map_ram(rq->out, ctx.outpages);
- if (!dst)
- return -ENOMEM;
- dst_maptype = 2;
+ /* general decoding path which can be used for all cases */
+ ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0) {
+ dst = page_address(*rq->out);
+ dst_maptype = 1;
+ } else {
+ dst = erofs_vm_map_ram(rq->out, rq->outpages);
+ if (!dst)
+ return ERR_PTR(-ENOMEM);
+ dst_maptype = 2;
+ }
}
-
-dstmap_out:
- ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out);
+ ret = z_erofs_lz4_decompress_mem(rq, dst);
if (!dst_maptype)
- kunmap_atomic(dst);
+ kunmap_local(dst);
else if (dst_maptype == 2)
- vm_unmap_ram(dst, ctx.outpages);
- return ret;
+ vm_unmap_ram(dst, rq->outpages);
+ return ERR_PTR(ret);
}
-static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+static const char *z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
- const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
- const unsigned int outpages =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
- PAGE_SIZE - rq->pageofs_out);
- const unsigned int lefthalf = rq->outputsize - righthalf;
- const unsigned int interlaced_offset =
- rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out;
- unsigned char *src, *dst;
-
- if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
+ const unsigned int nrpages_in = rq->inpages, nrpages_out = rq->outpages;
+ const unsigned int bs = rq->sb->s_blocksize;
+ unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt;
+ u8 *kin;
+
+ if (rq->outputsize > rq->inputsize)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) {
+ cur = bs - (rq->pageofs_out & (bs - 1));
+ pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK;
+ cur = min(cur, rq->outputsize);
+ if (cur && rq->out[0]) {
+ kin = kmap_local_page(rq->in[nrpages_in - 1]);
+ if (rq->out[0] == rq->in[nrpages_in - 1])
+ memmove(kin + rq->pageofs_out, kin + pi, cur);
+ else
+ memcpy_to_page(rq->out[0], rq->pageofs_out,
+ kin + pi, cur);
+ kunmap_local(kin);
+ }
+ rq->outputsize -= cur;
}
- if (rq->out[0] == *rq->in) {
- DBG_BUGON(rq->pageofs_out);
- return 0;
+ for (; rq->outputsize; rq->pageofs_in = 0, cur += insz, ni++) {
+ insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize);
+ rq->outputsize -= insz;
+ if (!rq->in[ni])
+ continue;
+ kin = kmap_local_page(rq->in[ni]);
+ pi = 0;
+ do {
+ no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT;
+ po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK;
+ DBG_BUGON(no >= nrpages_out);
+ cnt = min(insz - pi, PAGE_SIZE - po);
+ if (rq->out[no] == rq->in[ni])
+ memmove(kin + po,
+ kin + rq->pageofs_in + pi, cnt);
+ else if (rq->out[no])
+ memcpy_to_page(rq->out[no], po,
+ kin + rq->pageofs_in + pi, cnt);
+ pi += cnt;
+ } while (pi < insz);
+ kunmap_local(kin);
}
+ DBG_BUGON(ni > nrpages_in);
+ return NULL;
+}
- src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in;
- if (rq->out[0]) {
- dst = kmap_local_page(rq->out[0]);
- memcpy(dst + rq->pageofs_out, src + interlaced_offset,
- righthalf);
- kunmap_local(dst);
+const char *z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx,
+ void **dst, void **src, struct page **pgpl)
+{
+ struct z_erofs_decompress_req *rq = dctx->rq;
+ struct page **pgo, *tmppage;
+ unsigned int j;
+
+ if (!dctx->avail_out) {
+ if (++dctx->no >= rq->outpages || !rq->outputsize)
+ return "insufficient space for decompressed data";
+
+ if (dctx->kout)
+ kunmap_local(dctx->kout);
+ dctx->avail_out = min(rq->outputsize, PAGE_SIZE - rq->pageofs_out);
+ rq->outputsize -= dctx->avail_out;
+ pgo = &rq->out[dctx->no];
+ if (!*pgo && rq->fillgaps) { /* deduped */
+ *pgo = erofs_allocpage(pgpl, rq->gfp);
+ if (!*pgo) {
+ dctx->kout = NULL;
+ return ERR_PTR(-ENOMEM);
+ }
+ set_page_private(*pgo, Z_EROFS_SHORTLIVED_PAGE);
+ }
+ if (*pgo) {
+ dctx->kout = kmap_local_page(*pgo);
+ *dst = dctx->kout + rq->pageofs_out;
+ } else {
+ *dst = dctx->kout = NULL;
+ }
+ rq->pageofs_out = 0;
}
- if (outpages > inpages) {
- DBG_BUGON(!rq->out[outpages - 1]);
- if (rq->out[outpages - 1] != rq->in[inpages - 1]) {
- dst = kmap_local_page(rq->out[outpages - 1]);
- memcpy(dst, interlaced_offset ? src :
- (src + righthalf), lefthalf);
- kunmap_local(dst);
- } else if (!interlaced_offset) {
- memmove(src, src + righthalf, lefthalf);
+ if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) {
+ if (++dctx->ni >= rq->inpages)
+ return "invalid compressed data";
+ if (dctx->kout) /* unlike kmap(), take care of the orders */
+ kunmap_local(dctx->kout);
+ kunmap_local(dctx->kin);
+
+ dctx->inbuf_sz = min_t(u32, rq->inputsize, PAGE_SIZE);
+ rq->inputsize -= dctx->inbuf_sz;
+ dctx->kin = kmap_local_page(rq->in[dctx->ni]);
+ *src = dctx->kin;
+ dctx->bounced = false;
+ if (dctx->kout) {
+ j = (u8 *)*dst - dctx->kout;
+ dctx->kout = kmap_local_page(rq->out[dctx->no]);
+ *dst = dctx->kout + j;
}
+ dctx->inbuf_pos = 0;
}
- kunmap_local(src);
- return 0;
+
+ /*
+ * Handle overlapping: Use the given bounce buffer if the input data is
+ * under processing; Or utilize short-lived pages from the on-stack page
+ * pool, where pages are shared among the same request. Note that only
+ * a few inplace I/O pages need to be doubled.
+ */
+ if (!dctx->bounced && rq->out[dctx->no] == rq->in[dctx->ni]) {
+ memcpy(dctx->bounce, *src, dctx->inbuf_sz);
+ *src = dctx->bounce;
+ dctx->bounced = true;
+ }
+
+ for (j = dctx->ni + 1; j < rq->inpages; ++j) {
+ if (rq->out[dctx->no] != rq->in[j])
+ continue;
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage)
+ return ERR_PTR(-ENOMEM);
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
+ }
+ return NULL;
}
-static struct z_erofs_decompressor decompressors[] = {
- [Z_EROFS_COMPRESSION_SHIFTED] = {
+const struct z_erofs_decompressor *z_erofs_decomp[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = &(const struct z_erofs_decompressor) {
.decompress = z_erofs_transform_plain,
.name = "shifted"
},
- [Z_EROFS_COMPRESSION_INTERLACED] = {
+ [Z_EROFS_COMPRESSION_INTERLACED] = &(const struct z_erofs_decompressor) {
.decompress = z_erofs_transform_plain,
.name = "interlaced"
},
- [Z_EROFS_COMPRESSION_LZ4] = {
+ [Z_EROFS_COMPRESSION_LZ4] = &(const struct z_erofs_decompressor) {
+ .config = z_erofs_load_lz4_config,
.decompress = z_erofs_lz4_decompress,
+ .init = z_erofs_gbuf_init,
+ .exit = z_erofs_gbuf_exit,
.name = "lz4"
},
#ifdef CONFIG_EROFS_FS_ZIP_LZMA
- [Z_EROFS_COMPRESSION_LZMA] = {
- .decompress = z_erofs_lzma_decompress,
- .name = "lzma"
- },
+ [Z_EROFS_COMPRESSION_LZMA] = &z_erofs_lzma_decomp,
+#endif
+#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
+ [Z_EROFS_COMPRESSION_DEFLATE] = &z_erofs_deflate_decomp,
+#endif
+#ifdef CONFIG_EROFS_FS_ZIP_ZSTD
+ [Z_EROFS_COMPRESSION_ZSTD] = &z_erofs_zstd_decomp,
#endif
};
-int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ unsigned int algs, alg;
+ erofs_off_t offset;
+ int size, ret = 0;
+
+ if (!erofs_sb_has_compr_cfgs(sbi)) {
+ sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4;
+ return z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ }
+
+ sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
+ if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
+ erofs_err(sb, "unidentified algorithms %x, please upgrade kernel",
+ sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
+ return -EOPNOTSUPP;
+ }
+
+ (void)erofs_init_metabuf(&buf, sb, false);
+ offset = EROFS_SUPER_OFFSET + sbi->sb_size;
+ alg = 0;
+ for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
+ const struct z_erofs_decompressor *dec = z_erofs_decomp[alg];
+ void *data;
+
+ if (!(algs & 1))
+ continue;
+
+ data = erofs_read_metadata(sb, &buf, &offset, &size);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ break;
+ }
+
+ if (alg < Z_EROFS_COMPRESSION_MAX && dec && dec->config) {
+ ret = dec->config(sb, dsb, data, size);
+ } else {
+ erofs_err(sb, "algorithm %d isn't enabled on this kernel",
+ alg);
+ ret = -EOPNOTSUPP;
+ }
+ kfree(data);
+ if (ret)
+ break;
+ }
+ erofs_put_metabuf(&buf);
+ return ret;
+}
+
+int __init z_erofs_init_decompressor(void)
+{
+ int i, err;
+
+ for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) {
+ err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0;
+ if (err) {
+ while (i--)
+ if (z_erofs_decomp[i])
+ z_erofs_decomp[i]->exit();
+ return err;
+ }
+ }
+ return 0;
+}
+
+void z_erofs_exit_decompressor(void)
{
- return decompressors[rq->alg].decompress(rq, pagepool);
+ int i;
+
+ for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i)
+ if (z_erofs_decomp[i])
+ z_erofs_decomp[i]->exit();
}
diff --git a/fs/erofs/decompressor_crypto.c b/fs/erofs/decompressor_crypto.c
new file mode 100644
index 000000000000..5ef6f71d3b7f
--- /dev/null
+++ b/fs/erofs/decompressor_crypto.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/scatterlist.h>
+#include <crypto/acompress.h>
+#include "compress.h"
+
+static int __z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct crypto_acomp *tfm)
+{
+ struct sg_table st_src, st_dst;
+ struct acomp_req *req;
+ struct crypto_wait wait;
+ const char *reason;
+ u8 *headpage;
+ int ret;
+
+ headpage = kmap_local_page(*rq->in);
+ reason = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ rq->sb->s_blocksize - rq->pageofs_in));
+ kunmap_local(headpage);
+ if (reason)
+ return IS_ERR(reason) ? PTR_ERR(reason) : -EFSCORRUPTED;
+
+ req = acomp_request_alloc(tfm);
+ if (!req)
+ return -ENOMEM;
+
+ ret = sg_alloc_table_from_pages_segment(&st_src, rq->in, rq->inpages,
+ rq->pageofs_in, rq->inputsize, UINT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ goto failed_src_alloc;
+
+ ret = sg_alloc_table_from_pages_segment(&st_dst, rq->out, rq->outpages,
+ rq->pageofs_out, rq->outputsize, UINT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ goto failed_dst_alloc;
+
+ acomp_request_set_params(req, st_src.sgl,
+ st_dst.sgl, rq->inputsize, rq->outputsize);
+
+ crypto_init_wait(&wait);
+ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+
+ ret = crypto_wait_req(crypto_acomp_decompress(req), &wait);
+ if (ret) {
+ erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
+ ret, rq->inputsize, rq->pageofs_in, rq->outputsize);
+ ret = -EIO;
+ }
+
+ sg_free_table(&st_dst);
+failed_dst_alloc:
+ sg_free_table(&st_src);
+failed_src_alloc:
+ acomp_request_free(req);
+ return ret;
+}
+
+struct z_erofs_crypto_engine {
+ char *crypto_name;
+ struct crypto_acomp *tfm;
+};
+
+struct z_erofs_crypto_engine *z_erofs_crypto[Z_EROFS_COMPRESSION_MAX] = {
+ [Z_EROFS_COMPRESSION_LZ4] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+ [Z_EROFS_COMPRESSION_LZMA] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+ [Z_EROFS_COMPRESSION_DEFLATE] = (struct z_erofs_crypto_engine[]) {
+ { .crypto_name = "qat_deflate", },
+ {},
+ },
+ [Z_EROFS_COMPRESSION_ZSTD] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+};
+static DECLARE_RWSEM(z_erofs_crypto_rwsem);
+
+static struct crypto_acomp *z_erofs_crypto_get_engine(int alg)
+{
+ struct z_erofs_crypto_engine *e;
+
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e)
+ if (e->tfm)
+ return e->tfm;
+ return NULL;
+}
+
+int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+ struct crypto_acomp *tfm;
+ int i, err;
+
+ down_read(&z_erofs_crypto_rwsem);
+ tfm = z_erofs_crypto_get_engine(rq->alg);
+ if (!tfm) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ for (i = 0; i < rq->outpages; i++) {
+ struct page *const page = rq->out[i];
+ struct page *victim;
+
+ if (!page) {
+ victim = __erofs_allocpage(pgpl, rq->gfp, true);
+ if (!victim) {
+ err = -ENOMEM;
+ goto out;
+ }
+ set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
+ rq->out[i] = victim;
+ }
+ }
+ err = __z_erofs_crypto_decompress(rq, tfm);
+out:
+ up_read(&z_erofs_crypto_rwsem);
+ return err;
+}
+
+int z_erofs_crypto_enable_engine(const char *name, int len)
+{
+ struct z_erofs_crypto_engine *e;
+ struct crypto_acomp *tfm;
+ int alg;
+
+ down_write(&z_erofs_crypto_rwsem);
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!strncmp(name, e->crypto_name, len)) {
+ if (e->tfm)
+ break;
+ tfm = crypto_alloc_acomp(e->crypto_name, 0, 0);
+ if (IS_ERR(tfm)) {
+ up_write(&z_erofs_crypto_rwsem);
+ return -EOPNOTSUPP;
+ }
+ e->tfm = tfm;
+ break;
+ }
+ }
+ }
+ up_write(&z_erofs_crypto_rwsem);
+ return 0;
+}
+
+void z_erofs_crypto_disable_all_engines(void)
+{
+ struct z_erofs_crypto_engine *e;
+ int alg;
+
+ down_write(&z_erofs_crypto_rwsem);
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!e->tfm)
+ continue;
+ crypto_free_acomp(e->tfm);
+ e->tfm = NULL;
+ }
+ }
+ up_write(&z_erofs_crypto_rwsem);
+}
+
+int z_erofs_crypto_show_engines(char *buf, int size, char sep)
+{
+ struct z_erofs_crypto_engine *e;
+ int alg, len = 0;
+
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!e->tfm)
+ continue;
+ len += scnprintf(buf + len, size - len, "%s%c",
+ e->crypto_name, sep);
+ }
+ }
+ return len;
+}
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
new file mode 100644
index 000000000000..3fb73000ed27
--- /dev/null
+++ b/fs/erofs/decompressor_deflate.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/zlib.h>
+#include "compress.h"
+
+struct z_erofs_deflate {
+ struct z_erofs_deflate *next;
+ struct z_stream_s z;
+ u8 bounce[PAGE_SIZE];
+};
+
+static DEFINE_SPINLOCK(z_erofs_deflate_lock);
+static unsigned int z_erofs_deflate_nstrms, z_erofs_deflate_avail_strms;
+static struct z_erofs_deflate *z_erofs_deflate_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq);
+
+module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444);
+
+static void z_erofs_deflate_exit(void)
+{
+ /* there should be no running fs instance */
+ while (z_erofs_deflate_avail_strms) {
+ struct z_erofs_deflate *strm;
+
+ spin_lock(&z_erofs_deflate_lock);
+ strm = z_erofs_deflate_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_deflate_lock);
+ continue;
+ }
+ z_erofs_deflate_head = NULL;
+ spin_unlock(&z_erofs_deflate_lock);
+
+ while (strm) {
+ struct z_erofs_deflate *n = strm->next;
+
+ vfree(strm->z.workspace);
+ kfree(strm);
+ --z_erofs_deflate_avail_strms;
+ strm = n;
+ }
+ }
+}
+
+static int __init z_erofs_deflate_init(void)
+{
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_deflate_nstrms)
+ z_erofs_deflate_nstrms = num_possible_cpus();
+ return 0;
+}
+
+static int z_erofs_load_deflate_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size)
+{
+ struct z_erofs_deflate_cfgs *dfl = data;
+ static DEFINE_MUTEX(deflate_resize_mutex);
+ static bool inited;
+
+ if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) {
+ erofs_err(sb, "invalid deflate cfgs, size=%u", size);
+ return -EINVAL;
+ }
+
+ if (dfl->windowbits > MAX_WBITS) {
+ erofs_err(sb, "unsupported windowbits %u", dfl->windowbits);
+ return -EOPNOTSUPP;
+ }
+ mutex_lock(&deflate_resize_mutex);
+ if (!inited) {
+ for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms;
+ ++z_erofs_deflate_avail_strms) {
+ struct z_erofs_deflate *strm;
+
+ strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+ if (!strm)
+ goto failed;
+ /* XXX: in-kernel zlib cannot customize windowbits */
+ strm->z.workspace = vmalloc(zlib_inflate_workspacesize());
+ if (!strm->z.workspace) {
+ kfree(strm);
+ goto failed;
+ }
+
+ spin_lock(&z_erofs_deflate_lock);
+ strm->next = z_erofs_deflate_head;
+ z_erofs_deflate_head = strm;
+ spin_unlock(&z_erofs_deflate_lock);
+ }
+ inited = true;
+ }
+ mutex_unlock(&deflate_resize_mutex);
+ erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!");
+ return 0;
+failed:
+ mutex_unlock(&deflate_resize_mutex);
+ z_erofs_deflate_exit();
+ return -ENOMEM;
+}
+
+static const char *__z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+ struct super_block *sb = rq->sb;
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
+ struct z_erofs_deflate *strm;
+ const char *reason;
+ int zerr;
+
+ /* 1. get the exact DEFLATE compressed size */
+ dctx.kin = kmap_local_page(*rq->in);
+ reason = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
+ if (reason) {
+ kunmap_local(dctx.kin);
+ return reason;
+ }
+
+ /* 2. get an available DEFLATE context */
+again:
+ spin_lock(&z_erofs_deflate_lock);
+ strm = z_erofs_deflate_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_deflate_lock);
+ wait_event(z_erofs_deflate_wq, READ_ONCE(z_erofs_deflate_head));
+ goto again;
+ }
+ z_erofs_deflate_head = strm->next;
+ spin_unlock(&z_erofs_deflate_lock);
+
+ /* 3. multi-call decompress */
+ zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS);
+ if (zerr != Z_OK) {
+ reason = ERR_PTR(-EINVAL);
+ goto failed_zinit;
+ }
+
+ rq->fillgaps = true; /* DEFLATE doesn't support NULL output buffer */
+ strm->z.avail_in = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= strm->z.avail_in;
+ strm->z.next_in = dctx.kin + rq->pageofs_in;
+ strm->z.avail_out = 0;
+ dctx.bounce = strm->bounce;
+
+ while (1) {
+ dctx.avail_out = strm->z.avail_out;
+ dctx.inbuf_sz = strm->z.avail_in;
+ reason = z_erofs_stream_switch_bufs(&dctx,
+ (void **)&strm->z.next_out,
+ (void **)&strm->z.next_in, pgpl);
+ if (reason)
+ break;
+ strm->z.avail_out = dctx.avail_out;
+ strm->z.avail_in = dctx.inbuf_sz;
+
+ zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH);
+ if (zerr != Z_OK || !(rq->outputsize + strm->z.avail_out)) {
+ if (zerr == Z_OK && rq->partial_decoding)
+ break;
+ if (zerr == Z_STREAM_END && !rq->outputsize)
+ break;
+ reason = (zerr == Z_DATA_ERROR ?
+ "corrupted compressed data" :
+ "unexpected end of stream");
+ break;
+ }
+ }
+ if (zlib_inflateEnd(&strm->z) != Z_OK && !reason)
+ reason = ERR_PTR(-EIO);
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
+failed_zinit:
+ kunmap_local(dctx.kin);
+ /* 4. push back DEFLATE stream context to the global list */
+ spin_lock(&z_erofs_deflate_lock);
+ strm->next = z_erofs_deflate_head;
+ z_erofs_deflate_head = strm;
+ spin_unlock(&z_erofs_deflate_lock);
+ wake_up(&z_erofs_deflate_wq);
+ return reason;
+}
+
+static const char *z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ int err;
+
+ if (!rq->partial_decoding) {
+ err = z_erofs_crypto_decompress(rq, pgpl);
+ if (err != -EOPNOTSUPP)
+ return ERR_PTR(err);
+
+ }
+#endif
+ return __z_erofs_deflate_decompress(rq, pgpl);
+}
+
+const struct z_erofs_decompressor z_erofs_deflate_decomp = {
+ .config = z_erofs_load_deflate_config,
+ .decompress = z_erofs_deflate_decompress,
+ .init = z_erofs_deflate_init,
+ .exit = z_erofs_deflate_exit,
+ .name = "deflate",
+};
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 091fd5adf818..b4ea6978faae 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -1,12 +1,10 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/xz.h>
-#include <linux/module.h>
#include "compress.h"
struct z_erofs_lzma {
struct z_erofs_lzma *next;
struct xz_dec_microlzma *state;
- struct xz_buf buf;
u8 bounce[PAGE_SIZE];
};
@@ -19,7 +17,7 @@ static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
-void z_erofs_lzma_exit(void)
+static void z_erofs_lzma_exit(void)
{
/* there should be no running fs instance */
while (z_erofs_lzma_avail_strms) {
@@ -47,7 +45,7 @@ void z_erofs_lzma_exit(void)
}
}
-int z_erofs_lzma_init(void)
+static int __init z_erofs_lzma_init(void)
{
unsigned int i;
@@ -71,11 +69,11 @@ int z_erofs_lzma_init(void)
return 0;
}
-int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lzma_cfgs *lzma, int size)
+static int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size)
{
static DEFINE_MUTEX(lzma_resize_mutex);
+ struct z_erofs_lzma_cfgs *lzma = data;
unsigned int dict_size, i;
struct z_erofs_lzma *strm, *head = NULL;
int err;
@@ -96,8 +94,6 @@ int z_erofs_load_lzma_config(struct super_block *sb,
return -EINVAL;
}
- erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
-
/* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */
mutex_lock(&lzma_resize_mutex);
@@ -150,27 +146,23 @@ again:
return err;
}
-int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+static const char *z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int nrpages_in =
- PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
- unsigned int inlen, outlen, pageofs;
+ struct super_block *sb = rq->sb;
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
+ struct xz_buf buf = {};
struct z_erofs_lzma *strm;
- u8 *kin;
- bool bounced = false;
- int no, ni, j, err = 0;
+ enum xz_ret xz_err;
+ const char *reason;
/* 1. get the exact LZMA compressed size */
- kin = kmap(*rq->in);
- err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
- min_t(unsigned int, rq->inputsize,
- EROFS_BLKSIZ - rq->pageofs_in));
- if (err) {
- kunmap(*rq->in);
- return err;
+ dctx.kin = kmap_local_page(*rq->in);
+ reason = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
+ if (reason) {
+ kunmap_local(dctx.kin);
+ return reason;
}
/* 2. get an available lzma context */
@@ -186,106 +178,58 @@ again:
spin_unlock(&z_erofs_lzma_lock);
/* 3. multi-call decompress */
- inlen = rq->inputsize;
- outlen = rq->outputsize;
- xz_dec_microlzma_reset(strm->state, inlen, outlen,
+ xz_dec_microlzma_reset(strm->state, rq->inputsize, rq->outputsize,
!rq->partial_decoding);
- pageofs = rq->pageofs_out;
- strm->buf.in = kin + rq->pageofs_in;
- strm->buf.in_pos = 0;
- strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in);
- inlen -= strm->buf.in_size;
- strm->buf.out = NULL;
- strm->buf.out_pos = 0;
- strm->buf.out_size = 0;
-
- for (ni = 0, no = -1;;) {
- enum xz_ret xz_err;
-
- if (strm->buf.out_pos == strm->buf.out_size) {
- if (strm->buf.out) {
- kunmap(rq->out[no]);
- strm->buf.out = NULL;
- }
-
- if (++no >= nrpages_out || !outlen) {
- erofs_err(rq->sb, "decompressed buf out of bound");
- err = -EFSCORRUPTED;
- break;
- }
- strm->buf.out_pos = 0;
- strm->buf.out_size = min_t(u32, outlen,
- PAGE_SIZE - pageofs);
- outlen -= strm->buf.out_size;
- if (!rq->out[no] && rq->fillgaps) /* deduped */
- rq->out[no] = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
- if (rq->out[no])
- strm->buf.out = kmap(rq->out[no]) + pageofs;
- pageofs = 0;
- } else if (strm->buf.in_pos == strm->buf.in_size) {
- kunmap(rq->in[ni]);
-
- if (++ni >= nrpages_in || !inlen) {
- erofs_err(rq->sb, "compressed buf out of bound");
- err = -EFSCORRUPTED;
- break;
- }
- strm->buf.in_pos = 0;
- strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
- inlen -= strm->buf.in_size;
- kin = kmap(rq->in[ni]);
- strm->buf.in = kin;
- bounced = false;
- }
+ buf.in_size = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= buf.in_size;
+ buf.in = dctx.kin + rq->pageofs_in;
+ dctx.bounce = strm->bounce;
+ do {
+ dctx.avail_out = buf.out_size - buf.out_pos;
+ dctx.inbuf_sz = buf.in_size;
+ dctx.inbuf_pos = buf.in_pos;
+ reason = z_erofs_stream_switch_bufs(&dctx, (void **)&buf.out,
+ (void **)&buf.in, pgpl);
+ if (reason)
+ break;
- /*
- * Handle overlapping: Use bounced buffer if the compressed
- * data is under processing; Otherwise, Use short-lived pages
- * from the on-stack pagepool where pages share with the same
- * request.
- */
- if (!bounced && rq->out[no] == rq->in[ni]) {
- memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
- strm->buf.in = strm->bounce;
- bounced = true;
+ if (buf.out_size == buf.out_pos) {
+ buf.out_size = dctx.avail_out;
+ buf.out_pos = 0;
}
- for (j = ni + 1; j < nrpages_in; ++j) {
- struct page *tmppage;
-
- if (rq->out[no] != rq->in[j])
- continue;
+ buf.in_size = dctx.inbuf_sz;
+ buf.in_pos = dctx.inbuf_pos;
- DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
- rq->in[j]));
- tmppage = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
- set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
- copy_highpage(tmppage, rq->in[j]);
- rq->in[j] = tmppage;
- }
- xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
- DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
- DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+ xz_err = xz_dec_microlzma_run(strm->state, &buf);
+ DBG_BUGON(buf.out_pos > buf.out_size);
+ DBG_BUGON(buf.in_pos > buf.in_size);
if (xz_err != XZ_OK) {
- if (xz_err == XZ_STREAM_END && !outlen)
+ if (xz_err == XZ_STREAM_END && !rq->outputsize)
break;
- erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
- xz_err, rq->inputsize, rq->outputsize);
- err = -EFSCORRUPTED;
+ reason = (xz_err == XZ_DATA_ERROR ?
+ "corrupted compressed data" :
+ "unexpected end of stream");
break;
}
- }
- if (no < nrpages_out && strm->buf.out)
- kunmap(rq->in[no]);
- if (ni < nrpages_in)
- kunmap(rq->in[ni]);
+ } while (1);
+
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
+ kunmap_local(dctx.kin);
/* 4. push back LZMA stream context to the global list */
spin_lock(&z_erofs_lzma_lock);
strm->next = z_erofs_lzma_head;
z_erofs_lzma_head = strm;
spin_unlock(&z_erofs_lzma_lock);
wake_up(&z_erofs_lzma_wq);
- return err;
+ return reason;
}
+
+const struct z_erofs_decompressor z_erofs_lzma_decomp = {
+ .config = z_erofs_load_lzma_config,
+ .decompress = z_erofs_lzma_decompress,
+ .init = z_erofs_lzma_init,
+ .exit = z_erofs_lzma_exit,
+ .name = "lzma"
+};
diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c
new file mode 100644
index 000000000000..beae49165c69
--- /dev/null
+++ b/fs/erofs/decompressor_zstd.c
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/zstd.h>
+#include "compress.h"
+
+struct z_erofs_zstd {
+ struct z_erofs_zstd *next;
+ u8 bounce[PAGE_SIZE];
+ void *wksp;
+ unsigned int wkspsz;
+};
+
+static DEFINE_SPINLOCK(z_erofs_zstd_lock);
+static unsigned int z_erofs_zstd_max_dictsize;
+static unsigned int z_erofs_zstd_nstrms, z_erofs_zstd_avail_strms;
+static struct z_erofs_zstd *z_erofs_zstd_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_zstd_wq);
+
+module_param_named(zstd_streams, z_erofs_zstd_nstrms, uint, 0444);
+
+static struct z_erofs_zstd *z_erofs_isolate_strms(bool all)
+{
+ struct z_erofs_zstd *strm;
+
+again:
+ spin_lock(&z_erofs_zstd_lock);
+ strm = z_erofs_zstd_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_zstd_lock);
+ wait_event(z_erofs_zstd_wq, READ_ONCE(z_erofs_zstd_head));
+ goto again;
+ }
+ z_erofs_zstd_head = all ? NULL : strm->next;
+ spin_unlock(&z_erofs_zstd_lock);
+ return strm;
+}
+
+static void z_erofs_zstd_exit(void)
+{
+ while (z_erofs_zstd_avail_strms) {
+ struct z_erofs_zstd *strm, *n;
+
+ for (strm = z_erofs_isolate_strms(true); strm; strm = n) {
+ n = strm->next;
+
+ kvfree(strm->wksp);
+ kfree(strm);
+ --z_erofs_zstd_avail_strms;
+ }
+ }
+}
+
+static int __init z_erofs_zstd_init(void)
+{
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_zstd_nstrms)
+ z_erofs_zstd_nstrms = num_possible_cpus();
+
+ for (; z_erofs_zstd_avail_strms < z_erofs_zstd_nstrms;
+ ++z_erofs_zstd_avail_strms) {
+ struct z_erofs_zstd *strm;
+
+ strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+ if (!strm) {
+ z_erofs_zstd_exit();
+ return -ENOMEM;
+ }
+ spin_lock(&z_erofs_zstd_lock);
+ strm->next = z_erofs_zstd_head;
+ z_erofs_zstd_head = strm;
+ spin_unlock(&z_erofs_zstd_lock);
+ }
+ return 0;
+}
+
+static int z_erofs_load_zstd_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size)
+{
+ static DEFINE_MUTEX(zstd_resize_mutex);
+ struct z_erofs_zstd_cfgs *zstd = data;
+ unsigned int dict_size, wkspsz;
+ struct z_erofs_zstd *strm, *head = NULL;
+ void *wksp;
+
+ if (!zstd || size < sizeof(struct z_erofs_zstd_cfgs) || zstd->format) {
+ erofs_err(sb, "unsupported zstd format, size=%u", size);
+ return -EINVAL;
+ }
+
+ if (zstd->windowlog > ilog2(Z_EROFS_ZSTD_MAX_DICT_SIZE) - 10) {
+ erofs_err(sb, "unsupported zstd window log %u", zstd->windowlog);
+ return -EINVAL;
+ }
+ dict_size = 1U << (zstd->windowlog + 10);
+
+ /* in case 2 z_erofs_load_zstd_config() race to avoid deadlock */
+ mutex_lock(&zstd_resize_mutex);
+ if (z_erofs_zstd_max_dictsize >= dict_size) {
+ mutex_unlock(&zstd_resize_mutex);
+ return 0;
+ }
+
+ /* 1. collect/isolate all streams for the following check */
+ while (z_erofs_zstd_avail_strms) {
+ struct z_erofs_zstd *n;
+
+ for (strm = z_erofs_isolate_strms(true); strm; strm = n) {
+ n = strm->next;
+ strm->next = head;
+ head = strm;
+ --z_erofs_zstd_avail_strms;
+ }
+ }
+
+ /* 2. walk each isolated stream and grow max dict_size if needed */
+ wkspsz = zstd_dstream_workspace_bound(dict_size);
+ for (strm = head; strm; strm = strm->next) {
+ wksp = kvmalloc(wkspsz, GFP_KERNEL);
+ if (!wksp)
+ break;
+ kvfree(strm->wksp);
+ strm->wksp = wksp;
+ strm->wkspsz = wkspsz;
+ }
+
+ /* 3. push back all to the global list and update max dict_size */
+ spin_lock(&z_erofs_zstd_lock);
+ DBG_BUGON(z_erofs_zstd_head);
+ z_erofs_zstd_head = head;
+ spin_unlock(&z_erofs_zstd_lock);
+ z_erofs_zstd_avail_strms = z_erofs_zstd_nstrms;
+ wake_up_all(&z_erofs_zstd_wq);
+ if (!strm)
+ z_erofs_zstd_max_dictsize = dict_size;
+ mutex_unlock(&zstd_resize_mutex);
+ return strm ? -ENOMEM : 0;
+}
+
+static const char *z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+ struct super_block *sb = rq->sb;
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
+ zstd_in_buffer in_buf = { NULL, 0, 0 };
+ zstd_out_buffer out_buf = { NULL, 0, 0 };
+ struct z_erofs_zstd *strm;
+ zstd_dstream *stream;
+ const char *reason;
+ int zerr;
+
+ /* 1. get the exact compressed size */
+ dctx.kin = kmap_local_page(*rq->in);
+ reason = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
+ if (reason) {
+ kunmap_local(dctx.kin);
+ return reason;
+ }
+
+ /* 2. get an available ZSTD context */
+ strm = z_erofs_isolate_strms(false);
+
+ /* 3. multi-call decompress */
+ stream = zstd_init_dstream(z_erofs_zstd_max_dictsize, strm->wksp, strm->wkspsz);
+ if (!stream) {
+ reason = ERR_PTR(-ENOMEM);
+ goto failed_zinit;
+ }
+
+ rq->fillgaps = true; /* ZSTD doesn't support NULL output buffer */
+ in_buf.size = min_t(u32, rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= in_buf.size;
+ in_buf.src = dctx.kin + rq->pageofs_in;
+ dctx.bounce = strm->bounce;
+
+ do {
+ dctx.inbuf_sz = in_buf.size;
+ dctx.inbuf_pos = in_buf.pos;
+ reason = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst,
+ (void **)&in_buf.src, pgpl);
+ if (reason)
+ break;
+
+ if (out_buf.size == out_buf.pos) {
+ out_buf.size = dctx.avail_out;
+ out_buf.pos = 0;
+ }
+ in_buf.size = dctx.inbuf_sz;
+ in_buf.pos = dctx.inbuf_pos;
+
+ zerr = zstd_decompress_stream(stream, &out_buf, &in_buf);
+ dctx.avail_out = out_buf.size - out_buf.pos;
+ if (zstd_is_error(zerr) ||
+ ((rq->outputsize + dctx.avail_out) && (!zerr || (zerr > 0 &&
+ !(rq->inputsize + in_buf.size - in_buf.pos))))) {
+ reason = zstd_is_error(zerr) ? zstd_get_error_name(zerr) :
+ "unexpected end of stream";
+ break;
+ }
+ } while (rq->outputsize + dctx.avail_out);
+
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
+failed_zinit:
+ kunmap_local(dctx.kin);
+ /* 4. push back ZSTD stream context to the global list */
+ spin_lock(&z_erofs_zstd_lock);
+ strm->next = z_erofs_zstd_head;
+ z_erofs_zstd_head = strm;
+ spin_unlock(&z_erofs_zstd_lock);
+ wake_up(&z_erofs_zstd_wq);
+ return reason;
+}
+
+const struct z_erofs_decompressor z_erofs_zstd_decomp = {
+ .config = z_erofs_load_zstd_config,
+ .decompress = z_erofs_zstd_decompress,
+ .init = z_erofs_zstd_init,
+ .exit = z_erofs_zstd_exit,
+ .name = "zstd",
+};
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index ecf28f66b97d..32b4f5aa60c9 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -6,36 +6,17 @@
*/
#include "internal.h"
-static void debug_one_dentry(unsigned char d_type, const char *de_name,
- unsigned int de_namelen)
-{
-#ifdef CONFIG_EROFS_FS_DEBUG
- /* since the on-disk name could not have the trailing '\0' */
- unsigned char dbg_namebuf[EROFS_NAME_LEN + 1];
-
- memcpy(dbg_namebuf, de_name, de_namelen);
- dbg_namebuf[de_namelen] = '\0';
-
- erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf,
- de_namelen, d_type);
-#endif
-}
-
static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
void *dentry_blk, struct erofs_dirent *de,
- unsigned int nameoff, unsigned int maxsize)
+ unsigned int nameoff0, unsigned int maxsize)
{
- const struct erofs_dirent *end = dentry_blk + nameoff;
+ const struct erofs_dirent *end = dentry_blk + nameoff0;
while (de < end) {
- const char *de_name;
+ unsigned char d_type = fs_ftype_to_dtype(de->file_type);
+ unsigned int nameoff = le16_to_cpu(de->nameoff);
+ const char *de_name = (char *)dentry_blk + nameoff;
unsigned int de_namelen;
- unsigned char d_type;
-
- d_type = fs_ftype_to_dtype(de->file_type);
-
- nameoff = le16_to_cpu(de->nameoff);
- de_name = (char *)dentry_blk + nameoff;
/* the last dirent in the block? */
if (de + 1 >= end)
@@ -52,10 +33,9 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
return -EFSCORRUPTED;
}
- debug_one_dentry(d_type, de_name, de_namelen);
if (!dir_emit(ctx, de_name, de_namelen,
- le64_to_cpu(de->nid), d_type))
- /* stopped by some reason */
+ erofs_nid_to_ino64(EROFS_SB(dir->i_sb),
+ le64_to_cpu(de->nid)), d_type))
return 1;
++de;
ctx->pos += sizeof(struct erofs_dirent);
@@ -67,58 +47,75 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
{
struct inode *dir = file_inode(f);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- const size_t dirsize = i_size_read(dir);
- unsigned int i = ctx->pos / EROFS_BLKSIZ;
- unsigned int ofs = ctx->pos % EROFS_BLKSIZ;
+ struct super_block *sb = dir->i_sb;
+ struct file_ra_state *ra = &f->f_ra;
+ unsigned long bsz = sb->s_blocksize;
+ unsigned int ofs = erofs_blkoff(sb, ctx->pos);
+ pgoff_t ra_pages = DIV_ROUND_UP_POW2(
+ EROFS_I_SB(dir)->dir_ra_bytes, PAGE_SIZE);
+ pgoff_t nr_pages = DIV_ROUND_UP_POW2(dir->i_size, PAGE_SIZE);
int err = 0;
bool initial = true;
- while (ctx->pos < dirsize) {
+ buf.mapping = dir->i_mapping;
+ while (ctx->pos < dir->i_size) {
+ erofs_off_t dbstart = ctx->pos - ofs;
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- de = erofs_bread(&buf, dir, i, EROFS_KMAP);
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ break;
+ }
+
+ /* readahead blocks to enhance performance for large directories */
+ if (ra_pages) {
+ pgoff_t idx = DIV_ROUND_UP_POW2(ctx->pos, PAGE_SIZE);
+ pgoff_t pages = min(nr_pages - idx, ra_pages);
+
+ if (pages > 1 && !ra_has_index(ra, idx))
+ page_cache_sync_readahead(dir->i_mapping, ra,
+ f, idx, pages);
+ }
+
+ de = erofs_bread(&buf, dbstart, true);
if (IS_ERR(de)) {
- erofs_err(dir->i_sb,
- "fail to readdir of logical block %u of nid %llu",
- i, EROFS_I(dir)->nid);
+ erofs_err(sb, "failed to readdir of logical block %llu of nid %llu",
+ erofs_blknr(sb, dbstart), EROFS_I(dir)->nid);
err = PTR_ERR(de);
break;
}
nameoff = le16_to_cpu(de->nameoff);
- if (nameoff < sizeof(struct erofs_dirent) ||
- nameoff >= EROFS_BLKSIZ) {
- erofs_err(dir->i_sb,
- "invalid de[0].nameoff %u @ nid %llu",
+ if (nameoff < sizeof(struct erofs_dirent) || nameoff >= bsz) {
+ erofs_err(sb, "invalid de[0].nameoff %u @ nid %llu",
nameoff, EROFS_I(dir)->nid);
err = -EFSCORRUPTED;
break;
}
- maxsize = min_t(unsigned int,
- dirsize - ctx->pos + ofs, EROFS_BLKSIZ);
-
+ maxsize = min_t(unsigned int, dir->i_size - dbstart, bsz);
/* search dirents at the arbitrary position */
if (initial) {
initial = false;
-
ofs = roundup(ofs, sizeof(struct erofs_dirent));
- ctx->pos = blknr_to_addr(i) + ofs;
- if (ofs >= nameoff)
- goto skip_this;
+ ctx->pos = dbstart + ofs;
}
err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs,
nameoff, maxsize);
if (err)
break;
-skip_this:
- ctx->pos = blknr_to_addr(i) + maxsize;
- ++i;
+ ctx->pos = dbstart + maxsize;
ofs = 0;
+ cond_resched();
}
erofs_put_metabuf(&buf);
+ if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) {
+ if (!dir_emit_dot(f, ctx))
+ return 0;
+ ++ctx->pos;
+ }
return err < 0 ? err : 0;
}
@@ -126,4 +123,8 @@ const struct file_operations erofs_dir_fops = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.iterate_shared = erofs_readdir,
+ .unlocked_ioctl = erofs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = erofs_compat_ioctl,
+#endif
};
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index dbcd24371002..e24268acdd62 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */
+/* SPDX-License-Identifier: MIT */
/*
* EROFS (Enhanced ROM File System) on-disk format definition
*
@@ -9,10 +9,15 @@
#ifndef __EROFS_FS_H
#define __EROFS_FS_H
+/* to allow for x86 boot sectors and other oddities. */
#define EROFS_SUPER_OFFSET 1024
-#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
-#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
+#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
+#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
+#define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008
+#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010
+
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
@@ -27,41 +32,39 @@
#define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010
#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020
#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
+#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040
+#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080
+#define EROFS_FEATURE_INCOMPAT_METABOX 0x00000100
#define EROFS_ALL_FEATURE_INCOMPAT \
- (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
- EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
- EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
- EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
- EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
- EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
- EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
- EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
- EROFS_FEATURE_INCOMPAT_DEDUPE)
+ ((EROFS_FEATURE_INCOMPAT_METABOX << 1) - 1)
#define EROFS_SB_EXTSLOT_SIZE 16
struct erofs_deviceslot {
u8 tag[64]; /* digest(sha256), etc. */
- __le32 blocks; /* total fs blocks of this device */
- __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
- u8 reserved[56];
+ __le32 blocks_lo; /* total blocks count of this device */
+ __le32 uniaddr_lo; /* unified starting block of this device */
+ __le32 blocks_hi; /* total blocks count MSB */
+ __le16 uniaddr_hi; /* unified starting block MSB */
+ u8 reserved[50];
};
#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
-/* erofs on-disk super block (currently 128 bytes) */
+/* erofs on-disk super block (currently 144 bytes at maximum) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
- __le32 checksum; /* crc32c(super_block) */
+ __le32 checksum; /* crc32c to avoid unexpected on-disk overlap */
__le32 feature_compat;
- __u8 blkszbits; /* support block_size == PAGE_SIZE only */
+ __u8 blkszbits; /* filesystem block size in bit shift */
__u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */
-
- __le16 root_nid; /* nid of root directory */
+ union {
+ __le16 rootnid_2b; /* nid of root directory */
+ __le16 blocks_hi; /* (48BIT on) blocks count MSB */
+ } __packed rb;
__le64 inos; /* total valid ino # (== f_files - f_favail) */
-
- __le64 build_time; /* compact inode time derivation */
- __le32 build_time_nsec; /* compact inode time derivation in ns scale */
- __le32 blocks; /* used for statfs */
+ __le64 epoch; /* base seconds used for compact inodes */
+ __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */
+ __le32 blocks_lo; /* blocks count LSB */
__le32 meta_blkaddr; /* start block address of metadata area */
__le32 xattr_blkaddr; /* start block address of shared xattr area */
__u8 uuid[16]; /* 128-bit uuid for volume */
@@ -75,121 +78,110 @@ struct erofs_super_block {
} __packed u1;
__le16 extra_devices; /* # of devices besides the primary device */
__le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */
- __u8 reserved[6];
+ __u8 dirblkbits; /* directory block size in bit shift */
+ __u8 xattr_prefix_count; /* # of long xattr name prefixes */
+ __le32 xattr_prefix_start; /* start of long xattr prefixes */
__le64 packed_nid; /* nid of the special packed inode */
- __u8 reserved2[24];
+ __u8 xattr_filter_reserved; /* reserved for xattr name filter */
+ __u8 reserved[3];
+ __le32 build_time; /* seconds added to epoch for mkfs time */
+ __le64 rootnid_8b; /* (48BIT on) nid of root directory */
+ __le64 reserved2;
+ __le64 metabox_nid; /* (METABOX on) nid of the metabox inode */
+ __le64 reserved3; /* [align to extslot 1] */
};
/*
- * erofs inode datalayout (i_format in on-disk inode):
+ * EROFS inode datalayout (i_format in on-disk inode):
* 0 - uncompressed flat inode without tail-packing inline data:
- * inode, [xattrs], ... | ... | no-holed data
* 1 - compressed inode with non-compact indexes:
- * inode, [xattrs], [map_header], extents ... | ...
* 2 - uncompressed flat inode with tail-packing inline data:
- * inode, [xattrs], tailpacking data, ... | ... | no-holed data
* 3 - compressed inode with compact indexes:
- * inode, [xattrs], map_header, extents ... | ...
* 4 - chunk-based inode with (optional) multi-device support:
- * inode, [xattrs], chunk indexes ... | ...
* 5~7 - reserved
*/
enum {
EROFS_INODE_FLAT_PLAIN = 0,
- EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1,
+ EROFS_INODE_COMPRESSED_FULL = 1,
EROFS_INODE_FLAT_INLINE = 2,
- EROFS_INODE_FLAT_COMPRESSION = 3,
+ EROFS_INODE_COMPRESSED_COMPACT = 3,
EROFS_INODE_CHUNK_BASED = 4,
EROFS_INODE_DATALAYOUT_MAX
};
static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
{
- return datamode == EROFS_INODE_FLAT_COMPRESSION ||
- datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY;
+ return datamode == EROFS_INODE_COMPRESSED_COMPACT ||
+ datamode == EROFS_INODE_COMPRESSED_FULL;
}
/* bit definitions of inode i_format */
-#define EROFS_I_VERSION_BITS 1
-#define EROFS_I_DATALAYOUT_BITS 3
-
-#define EROFS_I_VERSION_BIT 0
-#define EROFS_I_DATALAYOUT_BIT 1
+#define EROFS_I_VERSION_MASK 0x01
+#define EROFS_I_DATALAYOUT_MASK 0x07
-#define EROFS_I_ALL \
- ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1)
+#define EROFS_I_VERSION_BIT 0
+#define EROFS_I_DATALAYOUT_BIT 1
+#define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */
+#define EROFS_I_DOT_OMITTED_BIT 4 /* (directories) omit the `.` dirent */
+#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1)
/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */
#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F
-/* with chunk indexes or just a 4-byte blkaddr array */
+/* with chunk indexes or just a 4-byte block array */
#define EROFS_CHUNK_FORMAT_INDEXES 0x0020
+#define EROFS_CHUNK_FORMAT_48BIT 0x0040
-#define EROFS_CHUNK_FORMAT_ALL \
- (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES)
+#define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1)
+
+/* 32-byte on-disk inode */
+#define EROFS_INODE_LAYOUT_COMPACT 0
+/* 64-byte on-disk inode */
+#define EROFS_INODE_LAYOUT_EXTENDED 1
struct erofs_inode_chunk_info {
__le16 format; /* chunk blkbits, etc. */
__le16 reserved;
};
+union erofs_inode_i_u {
+ __le32 blocks_lo; /* total blocks count (if compressed inodes) */
+ __le32 startblk_lo; /* starting block number (if flat inodes) */
+ __le32 rdev; /* device ID (if special inodes) */
+ struct erofs_inode_chunk_info c;
+};
+
+union erofs_inode_i_nb {
+ __le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */
+ __le16 blocks_hi; /* total blocks count MSB */
+ __le16 startblk_hi; /* starting block number MSB */
+} __packed;
+
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
__le16 i_format; /* inode format hints */
-
-/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
__le16 i_xattr_icount;
__le16 i_mode;
- __le16 i_nlink;
+ union erofs_inode_i_nb i_nb;
__le32 i_size;
- __le32 i_reserved;
- union {
- /* total compressed blocks for compressed inodes */
- __le32 compressed_blocks;
- /* block address for uncompressed flat inodes */
- __le32 raw_blkaddr;
-
- /* for device files, used to indicate old/new device # */
- __le32 rdev;
-
- /* for chunk-based files, it contains the summary info */
- struct erofs_inode_chunk_info c;
- } i_u;
- __le32 i_ino; /* only used for 32-bit stat compatibility */
+ __le32 i_mtime;
+ union erofs_inode_i_u i_u;
+
+ __le32 i_ino; /* only used for 32-bit stat compatibility */
__le16 i_uid;
__le16 i_gid;
- __le32 i_reserved2;
+ __le32 i_reserved;
};
-/* 32-byte on-disk inode */
-#define EROFS_INODE_LAYOUT_COMPACT 0
-/* 64-byte on-disk inode */
-#define EROFS_INODE_LAYOUT_EXTENDED 1
-
/* 64-byte complete form of an ondisk inode */
struct erofs_inode_extended {
__le16 i_format; /* inode format hints */
-
-/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
__le16 i_xattr_icount;
__le16 i_mode;
- __le16 i_reserved;
+ union erofs_inode_i_nb i_nb;
__le64 i_size;
- union {
- /* total compressed blocks for compressed inodes */
- __le32 compressed_blocks;
- /* block address for uncompressed flat inodes */
- __le32 raw_blkaddr;
-
- /* for device files, used to indicate old/new device # */
- __le32 rdev;
-
- /* for chunk-based files, it contains the summary info */
- struct erofs_inode_chunk_info c;
- } i_u;
-
- /* only used for 32-bit stat compatibility */
- __le32 i_ino;
+ union erofs_inode_i_u i_u;
+ __le32 i_ino; /* only used for 32-bit stat compatibility */
__le32 i_uid;
__le32 i_gid;
__le64 i_mtime;
@@ -198,10 +190,6 @@ struct erofs_inode_extended {
__u8 i_reserved2[16];
};
-#define EROFS_MAX_SHARED_XATTRS (128)
-/* h_shared_count between 129 ... 255 are special # */
-#define EROFS_SHARED_XATTR_EXTENT (255)
-
/*
* inline xattrs (n == i_xattr_icount):
* erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes
@@ -214,7 +202,7 @@ struct erofs_inode_extended {
* for read-only fs, no need to introduce h_refcount
*/
struct erofs_xattr_ibody_header {
- __le32 h_reserved;
+ __le32 h_name_filter; /* bit value 1 indicates not-present */
__u8 h_shared_count;
__u8 h_reserved2[7];
__le32 h_shared_xattrs[]; /* shared xattr id array */
@@ -228,6 +216,17 @@ struct erofs_xattr_ibody_header {
#define EROFS_XATTR_INDEX_LUSTRE 5
#define EROFS_XATTR_INDEX_SECURITY 6
+/*
+ * bit 7 of e_name_index is set when it refers to a long xattr name prefix,
+ * while the remained lower bits represent the index of the prefix.
+ */
+#define EROFS_XATTR_LONG_PREFIX 0x80
+#define EROFS_XATTR_LONG_PREFIX_MASK 0x7f
+
+#define EROFS_XATTR_FILTER_BITS 32
+#define EROFS_XATTR_FILTER_DEFAULT UINT32_MAX
+#define EROFS_XATTR_FILTER_SEED 0x25BBE08F
+
/* xattr entry (for both inline & shared xattrs) */
struct erofs_xattr_entry {
__u8 e_name_len; /* length of name */
@@ -237,11 +236,18 @@ struct erofs_xattr_entry {
char e_name[]; /* attribute name */
};
+/* long xattr name prefix */
+struct erofs_xattr_long_prefix {
+ __u8 base_index; /* short xattr name prefix index */
+ char infix[]; /* infix apart from short prefix */
+};
+
static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
{
if (!i_xattr_icount)
return 0;
+ /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
return sizeof(struct erofs_xattr_ibody_header) +
sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1);
}
@@ -260,20 +266,44 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
/* 4-byte block address array */
#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32)
-/* 8-byte inode chunk indexes */
+/* 8-byte inode chunk index */
struct erofs_inode_chunk_index {
- __le16 advise; /* always 0, don't care for now */
+ __le16 startblk_hi; /* starting block number MSB */
__le16 device_id; /* back-end storage id (with bits masked) */
- __le32 blkaddr; /* start block address of this inode chunk */
+ __le32 startblk_lo; /* starting block number of this chunk */
};
-/* maximum supported size of a physical compression cluster */
+#define EROFS_DIRENT_NID_METABOX_BIT 63
+#define EROFS_DIRENT_NID_MASK (BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT) - 1)
+
+/* dirent sorts in alphabet order, thus we can do binary search */
+struct erofs_dirent {
+ __le64 nid; /* node number */
+ __le16 nameoff; /* start offset of file name */
+ __u8 file_type; /* file type */
+ __u8 reserved; /* reserved */
+} __packed;
+
+/*
+ * EROFS file types should match generic FT_* types and
+ * it seems no need to add BUILD_BUG_ONs since potential
+ * unmatchness will break other fses as well...
+ */
+
+#define EROFS_NAME_LEN 255
+
+/* maximum supported encoded size of a physical compressed cluster */
#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024)
+/* maximum supported decoded size of a physical compressed cluster */
+#define Z_EROFS_PCLUSTER_MAX_DSIZE (12 * 1024 * 1024)
+
/* available compression algorithm types (for h_algorithmtype) */
enum {
Z_EROFS_COMPRESSION_LZ4 = 0,
Z_EROFS_COMPRESSION_LZMA = 1,
+ Z_EROFS_COMPRESSION_DEFLATE = 2,
+ Z_EROFS_COMPRESSION_ZSTD = 3,
Z_EROFS_COMPRESSION_MAX
};
#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
@@ -294,22 +324,36 @@ struct z_erofs_lzma_cfgs {
#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
+/* 6 bytes (+ length field = 8 bytes) */
+struct z_erofs_deflate_cfgs {
+ u8 windowbits; /* 8..15 for DEFLATE */
+ u8 reserved[5];
+} __packed;
+
+/* 6 bytes (+ length field = 8 bytes) */
+struct z_erofs_zstd_cfgs {
+ u8 format;
+ u8 windowlog; /* windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN(10) */
+ u8 reserved[4];
+} __packed;
+
+#define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE
+
/*
- * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
- * e.g. for 4k logical cluster size, 4B if compacted 2B is off;
- * (4B) + 2B + (4B) if compacted 2B is on.
- * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
- * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
- * bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
- * bit 4 : interlaced plain pcluster (0 - off; 1 - on)
- * bit 5 : fragment pcluster (0 - off; 1 - on)
+ * Enable COMPACTED_2B for EROFS_INODE_COMPRESSED_COMPACT inodes:
+ * 4B (disabled) vs 4B+2B+4B (enabled)
*/
#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001
+/* Enable extent metadata for EROFS_INODE_COMPRESSED_FULL inodes */
+#define Z_EROFS_ADVISE_EXTENTS 0x0001
#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002
#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004
#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008
#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010
#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020
+/* Indicate the record size for each extent if extent metadata is used */
+#define Z_EROFS_ADVISE_EXTRECSZ_BIT 1
+#define Z_EROFS_ADVISE_EXTRECSZ_MASK 0x3
#define Z_EROFS_FRAGMENT_INODE_BIT 7
struct z_erofs_map_header {
@@ -321,104 +365,76 @@ struct z_erofs_map_header {
/* indicates the encoded size of tailpacking data */
__le16 h_idata_size;
};
+ __le32 h_extents_lo; /* extent count LSB */
};
__le16 h_advise;
- /*
- * bit 0-3 : algorithm type of head 1 (logical cluster type 01);
- * bit 4-7 : algorithm type of head 2 (logical cluster type 11).
- */
- __u8 h_algorithmtype;
- /*
- * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
- * bit 3-6 : reserved;
- * bit 7 : move the whole file into packed inode or not.
- */
- __u8 h_clusterbits;
+ union {
+ struct {
+ /* algorithm type (bit 0-3: HEAD1; bit 4-7: HEAD2) */
+ __u8 h_algorithmtype;
+ /*
+ * bit 0-3 : logical cluster bits - blkszbits
+ * bit 4-6 : reserved
+ * bit 7 : pack the whole file into packed inode
+ */
+ __u8 h_clusterbits;
+ } __packed;
+ __le16 h_extents_hi; /* extent count MSB */
+ } __packed;
};
-#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8
-
-/*
- * Fixed-sized output compression on-disk logical cluster type:
- * 0 - literal (uncompressed) lcluster
- * 1,3 - compressed lcluster (for HEAD lclusters)
- * 2 - compressed lcluster (for NONHEAD lclusters)
- *
- * In detail,
- * 0 - literal (uncompressed) lcluster,
- * di_advise = 0
- * di_clusterofs = the literal data offset of the lcluster
- * di_blkaddr = the blkaddr of the literal pcluster
- *
- * 1,3 - compressed lcluster (for HEAD lclusters)
- * di_advise = 1 or 3
- * di_clusterofs = the decompressed data offset of the lcluster
- * di_blkaddr = the blkaddr of the compressed pcluster
- *
- * 2 - compressed lcluster (for NONHEAD lclusters)
- * di_advise = 2
- * di_clusterofs =
- * the decompressed data offset in its own HEAD lcluster
- * di_u.delta[0] = distance to this HEAD lcluster
- * di_u.delta[1] = distance to the next HEAD lcluster
- */
enum {
- Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0,
- Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1,
- Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2,
- Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3,
- Z_EROFS_VLE_CLUSTER_TYPE_MAX
+ Z_EROFS_LCLUSTER_TYPE_PLAIN = 0,
+ Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1,
+ Z_EROFS_LCLUSTER_TYPE_NONHEAD = 2,
+ Z_EROFS_LCLUSTER_TYPE_HEAD2 = 3,
+ Z_EROFS_LCLUSTER_TYPE_MAX
};
-#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2
-#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0
+#define Z_EROFS_LI_LCLUSTER_TYPE_MASK (Z_EROFS_LCLUSTER_TYPE_MAX - 1)
/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */
-#define Z_EROFS_VLE_DI_PARTIAL_REF (1 << 15)
+#define Z_EROFS_LI_PARTIAL_REF (1 << 15)
-/*
- * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
- * compressed block count of a compressed extent (in logical clusters, aka.
- * block count of a pcluster).
- */
-#define Z_EROFS_VLE_DI_D0_CBLKCNT (1 << 11)
+/* Set on 1st non-head lcluster to store compressed block counti (in blocks) */
+#define Z_EROFS_LI_D0_CBLKCNT (1 << 11)
-struct z_erofs_vle_decompressed_index {
+struct z_erofs_lcluster_index {
__le16 di_advise;
/* where to decompress in the head lcluster */
__le16 di_clusterofs;
union {
- /* for the HEAD lclusters */
- __le32 blkaddr;
+ __le32 blkaddr; /* for the HEAD lclusters */
/*
- * for the NONHEAD lclusters
* [0] - distance to its HEAD lcluster
* [1] - distance to the next HEAD lcluster
*/
- __le16 delta[2];
+ __le16 delta[2]; /* for the NONHEAD lclusters */
} di_u;
};
-#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \
- (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \
- sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING)
-
-/* dirent sorts in alphabet order, thus we can do binary search */
-struct erofs_dirent {
- __le64 nid; /* node number */
- __le16 nameoff; /* start offset of file name */
- __u8 file_type; /* file type */
- __u8 reserved; /* reserved */
-} __packed;
-
-/*
- * EROFS file types should match generic FT_* types and
- * it seems no need to add BUILD_BUG_ONs since potential
- * unmatchness will break other fses as well...
- */
+#define Z_EROFS_MAP_HEADER_END(end) \
+ (ALIGN(end, 8) + sizeof(struct z_erofs_map_header))
+#define Z_EROFS_FULL_INDEX_START(end) (Z_EROFS_MAP_HEADER_END(end) + 8)
+
+#define Z_EROFS_EXTENT_PLEN_PARTIAL BIT(27)
+#define Z_EROFS_EXTENT_PLEN_FMT_BIT 28
+#define Z_EROFS_EXTENT_PLEN_MASK ((Z_EROFS_PCLUSTER_MAX_SIZE << 1) - 1)
+struct z_erofs_extent {
+ __le32 plen; /* encoded length */
+ __le32 pstart_lo; /* physical offset */
+ __le32 pstart_hi; /* physical offset MSB */
+ __le32 lstart_lo; /* logical offset */
+ __le32 lstart_hi; /* logical offset MSB (>= 4GiB inodes) */
+ __u8 reserved[12]; /* for future use */
+};
-#define EROFS_NAME_LEN 255
+static inline int z_erofs_extent_recsize(unsigned int advise)
+{
+ return 4 << ((advise >> Z_EROFS_ADVISE_EXTRECSZ_BIT) &
+ Z_EROFS_ADVISE_EXTRECSZ_MASK);
+}
/* check the EROFS on-disk layout strictly at compile time */
static inline void erofs_check_ondisk_layout_definitions(void)
@@ -427,7 +443,7 @@ static inline void erofs_check_ondisk_layout_definitions(void)
.h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT
};
- BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
+ BUILD_BUG_ON(sizeof(struct erofs_super_block) != 144);
BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32);
BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
@@ -435,15 +451,13 @@ static inline void erofs_check_ondisk_layout_definitions(void)
BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4);
BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8);
BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8);
- BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8);
+ BUILD_BUG_ON(sizeof(struct z_erofs_lcluster_index) != 8);
BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12);
/* keep in sync between 2 index structures for better extendibility */
BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
- sizeof(struct z_erofs_vle_decompressed_index));
+ sizeof(struct z_erofs_lcluster_index));
BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
- BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
- Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
/* exclude old compiler versions like gcc 7.5.0 */
BUILD_BUG_ON(__builtin_constant_p(fmh) ?
fmh != cpu_to_le64(1ULL << 63) : 0);
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
new file mode 100644
index 000000000000..932e8b353ba1
--- /dev/null
+++ b/fs/erofs/fileio.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#include "internal.h"
+#include <trace/events/erofs.h>
+
+struct erofs_fileio_rq {
+ struct bio_vec bvecs[16];
+ struct bio bio;
+ struct kiocb iocb;
+ struct super_block *sb;
+};
+
+struct erofs_fileio {
+ struct erofs_map_blocks map;
+ struct erofs_map_dev dev;
+ struct erofs_fileio_rq *rq;
+};
+
+static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
+{
+ struct erofs_fileio_rq *rq =
+ container_of(iocb, struct erofs_fileio_rq, iocb);
+ struct folio_iter fi;
+
+ if (ret > 0) {
+ if (ret != rq->bio.bi_iter.bi_size) {
+ bio_advance(&rq->bio, ret);
+ zero_fill_bio(&rq->bio);
+ }
+ ret = 0;
+ }
+ if (rq->bio.bi_end_io) {
+ if (ret < 0 && !rq->bio.bi_status)
+ rq->bio.bi_status = errno_to_blk_status(ret);
+ } else {
+ bio_for_each_folio_all(fi, &rq->bio) {
+ DBG_BUGON(folio_test_uptodate(fi.folio));
+ erofs_onlinefolio_end(fi.folio, ret, false);
+ }
+ }
+ bio_endio(&rq->bio);
+ bio_uninit(&rq->bio);
+ kfree(rq);
+}
+
+static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
+{
+ struct iov_iter iter;
+ int ret;
+
+ if (!rq)
+ return;
+ rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ rq->iocb.ki_ioprio = get_current_ioprio();
+ rq->iocb.ki_complete = erofs_fileio_ki_complete;
+ if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) &&
+ rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT)
+ rq->iocb.ki_flags = IOCB_DIRECT;
+ iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt,
+ rq->bio.bi_iter.bi_size);
+ scoped_with_creds(rq->iocb.ki_filp->f_cred)
+ ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
+ if (ret != -EIOCBQUEUED)
+ erofs_fileio_ki_complete(&rq->iocb, ret);
+}
+
+static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev)
+{
+ struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq),
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ bio_init(&rq->bio, NULL, rq->bvecs, ARRAY_SIZE(rq->bvecs), REQ_OP_READ);
+ rq->iocb.ki_filp = mdev->m_dif->file;
+ rq->sb = mdev->m_sb;
+ return rq;
+}
+
+struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev)
+{
+ return &erofs_fileio_rq_alloc(mdev)->bio;
+}
+
+void erofs_fileio_submit_bio(struct bio *bio)
+{
+ return erofs_fileio_rq_submit(container_of(bio, struct erofs_fileio_rq,
+ bio));
+}
+
+static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
+{
+ struct inode *inode = folio_inode(folio);
+ struct erofs_map_blocks *map = &io->map;
+ unsigned int cur = 0, end = folio_size(folio), len, attached = 0;
+ loff_t pos = folio_pos(folio), ofs;
+ int err = 0;
+
+ erofs_onlinefolio_init(folio);
+ while (cur < end) {
+ if (!in_range(pos + cur, map->m_la, map->m_llen)) {
+ map->m_la = pos + cur;
+ map->m_llen = end - cur;
+ err = erofs_map_blocks(inode, map);
+ if (err)
+ break;
+ }
+
+ ofs = folio_pos(folio) + cur - map->m_la;
+ len = min_t(loff_t, map->m_llen - ofs, end - cur);
+ if (map->m_flags & EROFS_MAP_META) {
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ void *src;
+
+ src = erofs_read_metabuf(&buf, inode->i_sb,
+ map->m_pa + ofs, erofs_inode_in_metabox(inode));
+ if (IS_ERR(src)) {
+ err = PTR_ERR(src);
+ break;
+ }
+ memcpy_to_folio(folio, cur, src, len);
+ erofs_put_metabuf(&buf);
+ } else if (!(map->m_flags & EROFS_MAP_MAPPED)) {
+ folio_zero_segment(folio, cur, cur + len);
+ attached = 0;
+ } else {
+ if (io->rq && (map->m_pa + ofs != io->dev.m_pa ||
+ map->m_deviceid != io->dev.m_deviceid)) {
+io_retry:
+ erofs_fileio_rq_submit(io->rq);
+ io->rq = NULL;
+ }
+
+ if (!io->rq) {
+ io->dev = (struct erofs_map_dev) {
+ .m_pa = io->map.m_pa + ofs,
+ .m_deviceid = io->map.m_deviceid,
+ };
+ err = erofs_map_dev(inode->i_sb, &io->dev);
+ if (err)
+ break;
+ io->rq = erofs_fileio_rq_alloc(&io->dev);
+ io->rq->bio.bi_iter.bi_sector =
+ (io->dev.m_dif->fsoff + io->dev.m_pa) >> 9;
+ attached = 0;
+ }
+ if (!bio_add_folio(&io->rq->bio, folio, len, cur))
+ goto io_retry;
+ if (!attached++)
+ erofs_onlinefolio_split(folio);
+ io->dev.m_pa += len;
+ }
+ cur += len;
+ }
+ erofs_onlinefolio_end(folio, err, false);
+ return err;
+}
+
+static int erofs_fileio_read_folio(struct file *file, struct folio *folio)
+{
+ struct erofs_fileio io = {};
+ int err;
+
+ trace_erofs_read_folio(folio, true);
+ err = erofs_fileio_scan_folio(&io, folio);
+ erofs_fileio_rq_submit(io.rq);
+ return err;
+}
+
+static void erofs_fileio_readahead(struct readahead_control *rac)
+{
+ struct inode *inode = rac->mapping->host;
+ struct erofs_fileio io = {};
+ struct folio *folio;
+ int err;
+
+ trace_erofs_readahead(inode, readahead_index(rac),
+ readahead_count(rac), true);
+ while ((folio = readahead_folio(rac))) {
+ err = erofs_fileio_scan_folio(&io, folio);
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
+ folio->index, EROFS_I(inode)->nid);
+ }
+ erofs_fileio_rq_submit(io.rq);
+}
+
+const struct address_space_operations erofs_fileio_aops = {
+ .read_folio = erofs_fileio_read_folio,
+ .readahead = erofs_fileio_readahead,
+};
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 014e20962376..7a346e20f7b7 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -3,17 +3,37 @@
* Copyright (C) 2022, Alibaba Cloud
* Copyright (C) 2022, Bytedance Inc. All rights reserved.
*/
+#include <linux/pseudo_fs.h>
#include <linux/fscache.h>
#include "internal.h"
static DEFINE_MUTEX(erofs_domain_list_lock);
static DEFINE_MUTEX(erofs_domain_cookies_lock);
static LIST_HEAD(erofs_domain_list);
+static LIST_HEAD(erofs_domain_cookies_list);
static struct vfsmount *erofs_pseudo_mnt;
-struct erofs_fscache_request {
- struct erofs_fscache_request *primary;
- struct netfs_cache_resources cache_resources;
+static int erofs_anon_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type erofs_anon_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "pseudo_erofs",
+ .init_fs_context = erofs_anon_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+struct erofs_fscache_io {
+ struct netfs_cache_resources cres;
+ struct iov_iter iter;
+ netfs_io_terminated_t end_io;
+ void *private;
+ refcount_t ref;
+};
+
+struct erofs_fscache_rq {
struct address_space *mapping; /* The mapping being accessed */
loff_t start; /* Start position */
size_t len; /* Length of the request */
@@ -22,44 +42,17 @@ struct erofs_fscache_request {
refcount_t ref;
};
-static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping,
- loff_t start, size_t len)
+static bool erofs_fscache_io_put(struct erofs_fscache_io *io)
{
- struct erofs_fscache_request *req;
-
- req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL);
- if (!req)
- return ERR_PTR(-ENOMEM);
-
- req->mapping = mapping;
- req->start = start;
- req->len = len;
- refcount_set(&req->ref, 1);
-
- return req;
-}
-
-static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
- size_t len)
-{
- struct erofs_fscache_request *req;
-
- /* use primary request for the first submission */
- if (!primary->submitted) {
- refcount_inc(&primary->ref);
- return primary;
- }
-
- req = erofs_fscache_req_alloc(primary->mapping,
- primary->start + primary->submitted, len);
- if (!IS_ERR(req)) {
- req->primary = primary;
- refcount_inc(&primary->ref);
- }
- return req;
+ if (!refcount_dec_and_test(&io->ref))
+ return false;
+ if (io->cres.ops)
+ io->cres.ops->end_operation(&io->cres);
+ kfree(io);
+ return true;
}
-static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
+static void erofs_fscache_req_complete(struct erofs_fscache_rq *req)
{
struct folio *folio;
bool failed = req->error;
@@ -79,169 +72,231 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
rcu_read_unlock();
}
-static void erofs_fscache_req_put(struct erofs_fscache_request *req)
+static void erofs_fscache_req_put(struct erofs_fscache_rq *req)
{
- if (refcount_dec_and_test(&req->ref)) {
- if (req->cache_resources.ops)
- req->cache_resources.ops->end_operation(&req->cache_resources);
- if (!req->primary)
- erofs_fscache_req_complete(req);
- else
- erofs_fscache_req_put(req->primary);
- kfree(req);
- }
+ if (!refcount_dec_and_test(&req->ref))
+ return;
+ erofs_fscache_req_complete(req);
+ kfree(req);
}
-static void erofs_fscache_subreq_complete(void *priv,
- ssize_t transferred_or_error, bool was_async)
+static struct erofs_fscache_rq *erofs_fscache_req_alloc(struct address_space *mapping,
+ loff_t start, size_t len)
{
- struct erofs_fscache_request *req = priv;
+ struct erofs_fscache_rq *req = kzalloc(sizeof(*req), GFP_KERNEL);
- if (IS_ERR_VALUE(transferred_or_error)) {
- if (req->primary)
- req->primary->error = transferred_or_error;
- else
- req->error = transferred_or_error;
- }
- erofs_fscache_req_put(req);
+ if (!req)
+ return NULL;
+ req->mapping = mapping;
+ req->start = start;
+ req->len = len;
+ refcount_set(&req->ref, 1);
+ return req;
+}
+
+static void erofs_fscache_req_io_put(struct erofs_fscache_io *io)
+{
+ struct erofs_fscache_rq *req = io->private;
+
+ if (erofs_fscache_io_put(io))
+ erofs_fscache_req_put(req);
+}
+
+static void erofs_fscache_req_end_io(void *priv, ssize_t transferred_or_error)
+{
+ struct erofs_fscache_io *io = priv;
+ struct erofs_fscache_rq *req = io->private;
+
+ if (IS_ERR_VALUE(transferred_or_error))
+ req->error = transferred_or_error;
+ erofs_fscache_req_io_put(io);
+}
+
+static struct erofs_fscache_io *erofs_fscache_req_io_alloc(struct erofs_fscache_rq *req)
+{
+ struct erofs_fscache_io *io = kzalloc(sizeof(*io), GFP_KERNEL);
+
+ if (!io)
+ return NULL;
+ io->end_io = erofs_fscache_req_end_io;
+ io->private = req;
+ refcount_inc(&req->ref);
+ refcount_set(&io->ref, 1);
+ return io;
}
/*
- * Read data from fscache (cookie, pstart, len), and fill the read data into
- * page cache described by (req->mapping, lstart, len). @pstart describeis the
- * start physical address in the cache file.
+ * Read data from fscache described by cookie at pstart physical address
+ * offset, and fill the read data into buffer described by io->iter.
*/
-static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
- struct erofs_fscache_request *req, loff_t pstart, size_t len)
+static int erofs_fscache_read_io_async(struct fscache_cookie *cookie,
+ loff_t pstart, struct erofs_fscache_io *io)
{
enum netfs_io_source source;
- struct super_block *sb = req->mapping->host->i_sb;
- struct netfs_cache_resources *cres = &req->cache_resources;
- struct iov_iter iter;
- loff_t lstart = req->start + req->submitted;
- size_t done = 0;
+ struct netfs_cache_resources *cres = &io->cres;
+ struct iov_iter *iter = &io->iter;
int ret;
- DBG_BUGON(len > req->len - req->submitted);
-
ret = fscache_begin_read_operation(cres, cookie);
if (ret)
return ret;
- while (done < len) {
- loff_t sstart = pstart + done;
- size_t slen = len - done;
+ while (iov_iter_count(iter)) {
+ size_t orig_count = iov_iter_count(iter), len = orig_count;
unsigned long flags = 1 << NETFS_SREQ_ONDEMAND;
source = cres->ops->prepare_ondemand_read(cres,
- sstart, &slen, LLONG_MAX, &flags, 0);
- if (WARN_ON(slen == 0))
+ pstart, &len, LLONG_MAX, &flags, 0);
+ if (WARN_ON(len == 0))
source = NETFS_INVALID_READ;
if (source != NETFS_READ_FROM_CACHE) {
- erofs_err(sb, "failed to fscache prepare_read (source %d)", source);
+ erofs_err(NULL, "prepare_ondemand_read failed (source %d)", source);
return -EIO;
}
- refcount_inc(&req->ref);
- iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages,
- lstart + done, slen);
-
- ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL,
- erofs_fscache_subreq_complete, req);
+ iov_iter_truncate(iter, len);
+ refcount_inc(&io->ref);
+ ret = fscache_read(cres, pstart, iter, NETFS_READ_HOLE_FAIL,
+ io->end_io, io);
if (ret == -EIOCBQUEUED)
ret = 0;
if (ret) {
- erofs_err(sb, "failed to fscache_read (ret %d)", ret);
+ erofs_err(NULL, "fscache_read failed (ret %d)", ret);
return ret;
}
+ if (WARN_ON(iov_iter_count(iter)))
+ return -EIO;
- done += slen;
+ iov_iter_reexpand(iter, orig_count - len);
+ pstart += len;
}
- DBG_BUGON(done != len);
return 0;
}
-static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+struct erofs_fscache_bio {
+ struct erofs_fscache_io io;
+ struct bio bio; /* w/o bdev to share bio_add_page/endio() */
+ struct bio_vec bvecs[BIO_MAX_VECS];
+};
+
+static void erofs_fscache_bio_endio(void *priv, ssize_t transferred_or_error)
+{
+ struct erofs_fscache_bio *io = priv;
+
+ if (IS_ERR_VALUE(transferred_or_error))
+ io->bio.bi_status = errno_to_blk_status(transferred_or_error);
+ bio_endio(&io->bio);
+ BUILD_BUG_ON(offsetof(struct erofs_fscache_bio, io) != 0);
+ erofs_fscache_io_put(&io->io);
+}
+
+struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev)
{
+ struct erofs_fscache_bio *io;
+
+ io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL);
+ bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ);
+ io->io.private = mdev->m_dif->fscache->cookie;
+ io->io.end_io = erofs_fscache_bio_endio;
+ refcount_set(&io->io.ref, 1);
+ return &io->bio;
+}
+
+void erofs_fscache_submit_bio(struct bio *bio)
+{
+ struct erofs_fscache_bio *io = container_of(bio,
+ struct erofs_fscache_bio, bio);
int ret;
- struct super_block *sb = folio_mapping(folio)->host->i_sb;
- struct erofs_fscache_request *req;
- struct erofs_map_dev mdev = {
- .m_deviceid = 0,
- .m_pa = folio_pos(folio),
- };
- ret = erofs_map_dev(sb, &mdev);
- if (ret) {
+ iov_iter_bvec(&io->io.iter, ITER_DEST, io->bvecs, bio->bi_vcnt,
+ bio->bi_iter.bi_size);
+ ret = erofs_fscache_read_io_async(io->io.private,
+ bio->bi_iter.bi_sector << 9, &io->io);
+ erofs_fscache_io_put(&io->io);
+ if (!ret)
+ return;
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
+}
+
+static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+{
+ struct erofs_fscache *ctx = folio->mapping->host->i_private;
+ int ret = -ENOMEM;
+ struct erofs_fscache_rq *req;
+ struct erofs_fscache_io *io;
+
+ req = erofs_fscache_req_alloc(folio->mapping,
+ folio_pos(folio), folio_size(folio));
+ if (!req) {
folio_unlock(folio);
return ret;
}
- req = erofs_fscache_req_alloc(folio_mapping(folio),
- folio_pos(folio), folio_size(folio));
- if (IS_ERR(req)) {
- folio_unlock(folio);
- return PTR_ERR(req);
+ io = erofs_fscache_req_io_alloc(req);
+ if (!io) {
+ req->error = ret;
+ goto out;
}
+ iov_iter_xarray(&io->iter, ITER_DEST, &folio->mapping->i_pages,
+ folio_pos(folio), folio_size(folio));
- ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- req, mdev.m_pa, folio_size(folio));
+ ret = erofs_fscache_read_io_async(ctx->cookie, folio_pos(folio), io);
if (ret)
req->error = ret;
+ erofs_fscache_req_io_put(io);
+out:
erofs_fscache_req_put(req);
return ret;
}
-static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
+static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
{
- struct address_space *mapping = primary->mapping;
+ struct address_space *mapping = req->mapping;
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
- struct erofs_fscache_request *req;
+ struct erofs_fscache_io *io;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
- struct iov_iter iter;
- loff_t pos = primary->start + primary->submitted;
+ loff_t pos = req->start + req->submitted;
size_t count;
int ret;
map.m_la = pos;
- ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ ret = erofs_map_blocks(inode, &map);
if (ret)
return ret;
if (map.m_flags & EROFS_MAP_META) {
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- erofs_blk_t blknr;
- size_t offset, size;
+ struct iov_iter iter;
+ size_t size = map.m_llen;
void *src;
- /* For tail packing layout, the offset may be non-zero. */
- offset = erofs_blkoff(map.m_pa);
- blknr = erofs_blknr(map.m_pa);
- size = map.m_llen;
-
- src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
+ src = erofs_read_metabuf(&buf, sb, map.m_pa,
+ erofs_inode_in_metabox(inode));
if (IS_ERR(src))
return PTR_ERR(src);
iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE);
- if (copy_to_iter(src + offset, size, &iter) != size) {
+ if (copy_to_iter(src, size, &iter) != size) {
erofs_put_metabuf(&buf);
return -EFAULT;
}
iov_iter_zero(PAGE_SIZE - size, &iter);
erofs_put_metabuf(&buf);
- primary->submitted += PAGE_SIZE;
+ req->submitted += PAGE_SIZE;
return 0;
}
- count = primary->len - primary->submitted;
+ count = req->len - req->submitted;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ struct iov_iter iter;
+
iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
iov_iter_zero(count, &iter);
- primary->submitted += count;
+ req->submitted += count;
return 0;
}
@@ -256,18 +311,19 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
if (ret)
return ret;
- req = erofs_fscache_req_chain(primary, count);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ io = erofs_fscache_req_io_alloc(req);
+ if (!io)
+ return -ENOMEM;
+ iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count);
+ ret = erofs_fscache_read_io_async(mdev.m_dif->fscache->cookie,
+ mdev.m_pa + (pos - map.m_la), io);
+ erofs_fscache_req_io_put(io);
- ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- req, mdev.m_pa + (pos - map.m_la), count);
- erofs_fscache_req_put(req);
- primary->submitted += count;
+ req->submitted += count;
return ret;
}
-static int erofs_fscache_data_read(struct erofs_fscache_request *req)
+static int erofs_fscache_data_read(struct erofs_fscache_rq *req)
{
int ret;
@@ -276,20 +332,19 @@ static int erofs_fscache_data_read(struct erofs_fscache_request *req)
if (ret)
req->error = ret;
} while (!ret && req->submitted < req->len);
-
return ret;
}
static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
{
- struct erofs_fscache_request *req;
+ struct erofs_fscache_rq *req;
int ret;
- req = erofs_fscache_req_alloc(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
- if (IS_ERR(req)) {
+ if (!req) {
folio_unlock(folio);
- return PTR_ERR(req);
+ return -ENOMEM;
}
ret = erofs_fscache_data_read(req);
@@ -299,14 +354,14 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
static void erofs_fscache_readahead(struct readahead_control *rac)
{
- struct erofs_fscache_request *req;
+ struct erofs_fscache_rq *req;
if (!readahead_count(rac))
return;
req = erofs_fscache_req_alloc(rac->mapping,
readahead_pos(rac), readahead_length(rac));
- if (IS_ERR(req))
+ if (!req)
return;
/* The request completion will drop refs on the folios. */
@@ -328,8 +383,6 @@ const struct address_space_operations erofs_fscache_access_aops = {
static void erofs_fscache_domain_put(struct erofs_domain *domain)
{
- if (!domain)
- return;
mutex_lock(&erofs_domain_list_lock);
if (refcount_dec_and_test(&domain->ref)) {
list_del(&domain->list);
@@ -337,8 +390,8 @@ static void erofs_fscache_domain_put(struct erofs_domain *domain)
kern_unmount(erofs_pseudo_mnt);
erofs_pseudo_mnt = NULL;
}
- mutex_unlock(&erofs_domain_list_lock);
fscache_relinquish_volume(domain->volume, NULL, false);
+ mutex_unlock(&erofs_domain_list_lock);
kfree(domain->domain_id);
kfree(domain);
return;
@@ -392,11 +445,12 @@ static int erofs_fscache_init_domain(struct super_block *sb)
goto out;
if (!erofs_pseudo_mnt) {
- erofs_pseudo_mnt = kern_mount(&erofs_fs_type);
- if (IS_ERR(erofs_pseudo_mnt)) {
- err = PTR_ERR(erofs_pseudo_mnt);
+ struct vfsmount *mnt = kern_mount(&erofs_anon_fs_type);
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
goto out;
}
+ erofs_pseudo_mnt = mnt;
}
domain->volume = sbi->volume;
@@ -431,19 +485,21 @@ static int erofs_fscache_register_domain(struct super_block *sb)
return err;
}
-static
-struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
- char *name,
- unsigned int flags)
+static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
+ char *name, unsigned int flags)
{
struct fscache_volume *volume = EROFS_SB(sb)->volume;
struct erofs_fscache *ctx;
struct fscache_cookie *cookie;
+ struct super_block *isb;
+ struct inode *inode;
int ret;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&ctx->node);
+ refcount_set(&ctx->ref, 1);
cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE,
name, strlen(name), NULL, 0, 0);
@@ -452,32 +508,33 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
ret = -EINVAL;
goto err;
}
-
fscache_use_cookie(cookie, false);
- ctx->cookie = cookie;
-
- if (flags & EROFS_REG_COOKIE_NEED_INODE) {
- struct inode *const inode = new_inode(sb);
-
- if (!inode) {
- erofs_err(sb, "failed to get anon inode for %s", name);
- ret = -ENOMEM;
- goto err_cookie;
- }
-
- set_nlink(inode, 1);
- inode->i_size = OFFSET_MAX;
- inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- ctx->inode = inode;
+ /*
+ * Allocate anonymous inode in global pseudo mount for shareable blobs,
+ * so that they are accessible among erofs fs instances.
+ */
+ isb = flags & EROFS_REG_COOKIE_SHARE ? erofs_pseudo_mnt->mnt_sb : sb;
+ inode = new_inode(isb);
+ if (!inode) {
+ erofs_err(sb, "failed to get anon inode for %s", name);
+ ret = -ENOMEM;
+ goto err_cookie;
}
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
+ inode->i_blkbits = EROFS_SB(sb)->blkszbits;
+ inode->i_private = ctx;
+
+ ctx->cookie = cookie;
+ ctx->inode = inode;
return ctx;
err_cookie:
- fscache_unuse_cookie(ctx->cookie, NULL, NULL);
- fscache_relinquish_cookie(ctx->cookie, false);
+ fscache_unuse_cookie(cookie, NULL, NULL);
+ fscache_relinquish_cookie(cookie, false);
err:
kfree(ctx);
return ERR_PTR(ret);
@@ -492,13 +549,9 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx)
kfree(ctx);
}
-static
-struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb,
- char *name,
- unsigned int flags)
+static struct erofs_fscache *erofs_domain_init_cookie(struct super_block *sb,
+ char *name, unsigned int flags)
{
- int err;
- struct inode *inode;
struct erofs_fscache *ctx;
struct erofs_domain *domain = EROFS_SB(sb)->domain;
@@ -508,55 +561,38 @@ struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb,
ctx->name = kstrdup(name, GFP_KERNEL);
if (!ctx->name) {
- err = -ENOMEM;
- goto out;
- }
-
- inode = new_inode(erofs_pseudo_mnt->mnt_sb);
- if (!inode) {
- err = -ENOMEM;
- goto out;
+ erofs_fscache_relinquish_cookie(ctx);
+ return ERR_PTR(-ENOMEM);
}
- ctx->domain = domain;
- ctx->anon_inode = inode;
- inode->i_private = ctx;
refcount_inc(&domain->ref);
+ ctx->domain = domain;
+ list_add(&ctx->node, &erofs_domain_cookies_list);
return ctx;
-out:
- erofs_fscache_relinquish_cookie(ctx);
- return ERR_PTR(err);
}
-static
-struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
- char *name,
- unsigned int flags)
+static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
+ char *name, unsigned int flags)
{
- struct inode *inode;
struct erofs_fscache *ctx;
struct erofs_domain *domain = EROFS_SB(sb)->domain;
- struct super_block *psb = erofs_pseudo_mnt->mnt_sb;
+ flags |= EROFS_REG_COOKIE_SHARE;
mutex_lock(&erofs_domain_cookies_lock);
- spin_lock(&psb->s_inode_list_lock);
- list_for_each_entry(inode, &psb->s_inodes, i_sb_list) {
- ctx = inode->i_private;
- if (!ctx || ctx->domain != domain || strcmp(ctx->name, name))
+ list_for_each_entry(ctx, &erofs_domain_cookies_list, node) {
+ if (ctx->domain != domain || strcmp(ctx->name, name))
continue;
if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) {
- igrab(inode);
+ refcount_inc(&ctx->ref);
} else {
erofs_err(sb, "%s already exists in domain %s", name,
domain->domain_id);
ctx = ERR_PTR(-EEXIST);
}
- spin_unlock(&psb->s_inode_list_lock);
mutex_unlock(&erofs_domain_cookies_lock);
return ctx;
}
- spin_unlock(&psb->s_inode_list_lock);
- ctx = erofs_fscache_domain_init_cookie(sb, name, flags);
+ ctx = erofs_domain_init_cookie(sb, name, flags);
mutex_unlock(&erofs_domain_cookies_lock);
return ctx;
}
@@ -572,23 +608,22 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
{
- bool drop;
- struct erofs_domain *domain;
+ struct erofs_domain *domain = NULL;
if (!ctx)
return;
- domain = ctx->domain;
- if (domain) {
- mutex_lock(&erofs_domain_cookies_lock);
- drop = atomic_read(&ctx->anon_inode->i_count) == 1;
- iput(ctx->anon_inode);
- mutex_unlock(&erofs_domain_cookies_lock);
- if (!drop)
- return;
- }
+ if (!ctx->domain)
+ return erofs_fscache_relinquish_cookie(ctx);
- erofs_fscache_relinquish_cookie(ctx);
- erofs_fscache_domain_put(domain);
+ mutex_lock(&erofs_domain_cookies_lock);
+ if (refcount_dec_and_test(&ctx->ref)) {
+ domain = ctx->domain;
+ list_del(&ctx->node);
+ erofs_fscache_relinquish_cookie(ctx);
+ }
+ mutex_unlock(&erofs_domain_cookies_lock);
+ if (domain)
+ erofs_fscache_domain_put(domain);
}
int erofs_fscache_register_fs(struct super_block *sb)
@@ -596,7 +631,7 @@ int erofs_fscache_register_fs(struct super_block *sb)
int ret;
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
- unsigned int flags;
+ unsigned int flags = 0;
if (sbi->domain_id)
ret = erofs_fscache_register_domain(sb);
@@ -615,14 +650,13 @@ int erofs_fscache_register_fs(struct super_block *sb)
*
* Acquired domain/volume will be relinquished in kill_sb() on error.
*/
- flags = EROFS_REG_COOKIE_NEED_INODE;
if (sbi->domain_id)
flags |= EROFS_REG_COOKIE_NEED_NOEXIST;
fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags);
if (IS_ERR(fscache))
return PTR_ERR(fscache);
- sbi->s_fscache = fscache;
+ sbi->dif0.fscache = fscache;
return 0;
}
@@ -630,14 +664,14 @@ void erofs_fscache_unregister_fs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- erofs_fscache_unregister_cookie(sbi->s_fscache);
+ erofs_fscache_unregister_cookie(sbi->dif0.fscache);
if (sbi->domain)
erofs_fscache_domain_put(sbi->domain);
else
fscache_relinquish_volume(sbi->volume, NULL, false);
- sbi->s_fscache = NULL;
+ sbi->dif0.fscache = NULL;
sbi->volume = NULL;
sbi->domain = NULL;
}
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index d3b8736fa124..bce98c845a18 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -5,42 +5,54 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
-
+#include <linux/compat.h>
#include <trace/events/erofs.h>
-static void *erofs_read_inode(struct erofs_buf *buf,
- struct inode *inode, unsigned int *ofs)
+static int erofs_fill_symlink(struct inode *inode, void *kaddr,
+ unsigned int m_pofs)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ loff_t off;
+
+ m_pofs += vi->xattr_isize;
+ /* check if it cannot be handled with fast symlink scheme */
+ if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
+ check_add_overflow(m_pofs, inode->i_size, &off) ||
+ off > i_blocksize(inode))
+ return 0;
+
+ inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL);
+ return inode->i_link ? 0 : -ENOMEM;
+}
+
+static int erofs_read_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
+ erofs_blk_t blkaddr = erofs_blknr(sb, erofs_iloc(inode));
+ unsigned int ofs = erofs_blkoff(sb, erofs_iloc(inode));
+ bool in_mbox = erofs_inode_in_metabox(inode);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_sb_info *sbi = EROFS_SB(sb);
+ erofs_blk_t addrmask = BIT_ULL(48) - 1;
struct erofs_inode *vi = EROFS_I(inode);
- const erofs_off_t inode_loc = iloc(sbi, vi->nid);
-
- erofs_blk_t blkaddr, nblks = 0;
- void *kaddr;
+ struct erofs_inode_extended *die, copied;
struct erofs_inode_compact *dic;
- struct erofs_inode_extended *die, *copied = NULL;
unsigned int ifmt;
- int err;
-
- blkaddr = erofs_blknr(inode_loc);
- *ofs = erofs_blkoff(inode_loc);
-
- erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u",
- __func__, vi->nid, *ofs, blkaddr);
+ void *ptr;
+ int err = 0;
- kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
- vi->nid, PTR_ERR(kaddr));
- return kaddr;
+ ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), in_mbox);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ erofs_err(sb, "failed to read inode meta block (nid: %llu): %d",
+ vi->nid, err);
+ goto err_out;
}
- dic = kaddr + *ofs;
+ dic = ptr + ofs;
ifmt = le16_to_cpu(dic->i_format);
-
if (ifmt & ~EROFS_I_ALL) {
- erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu",
+ erofs_err(sb, "unsupported i_format %u of nid %llu",
ifmt, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -48,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->datalayout = erofs_inode_datalayout(ifmt);
if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
- erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
+ erofs_err(sb, "unsupported datalayout %u of nid %llu",
vi->datalayout, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -58,212 +70,151 @@ static void *erofs_read_inode(struct erofs_buf *buf,
case EROFS_INODE_LAYOUT_EXTENDED:
vi->inode_isize = sizeof(struct erofs_inode_extended);
/* check if the extended inode acrosses block boundary */
- if (*ofs + vi->inode_isize <= EROFS_BLKSIZ) {
- *ofs += vi->inode_isize;
+ if (ofs + vi->inode_isize <= sb->s_blocksize) {
+ ofs += vi->inode_isize;
die = (struct erofs_inode_extended *)dic;
+ copied.i_u = die->i_u;
+ copied.i_nb = die->i_nb;
} else {
- const unsigned int gotten = EROFS_BLKSIZ - *ofs;
-
- copied = kmalloc(vi->inode_isize, GFP_NOFS);
- if (!copied) {
- err = -ENOMEM;
+ const unsigned int gotten = sb->s_blocksize - ofs;
+
+ memcpy(&copied, dic, gotten);
+ ptr = erofs_read_metabuf(&buf, sb,
+ erofs_pos(sb, blkaddr + 1), in_mbox);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ erofs_err(sb, "failed to read inode payload block (nid: %llu): %d",
+ vi->nid, err);
goto err_out;
}
- memcpy(copied, dic, gotten);
- kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1,
- EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
- vi->nid, PTR_ERR(kaddr));
- kfree(copied);
- return kaddr;
- }
- *ofs = vi->inode_isize - gotten;
- memcpy((u8 *)copied + gotten, kaddr, *ofs);
- die = copied;
+ ofs = vi->inode_isize - gotten;
+ memcpy((u8 *)&copied + gotten, ptr, ofs);
+ die = &copied;
}
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
inode->i_mode = le16_to_cpu(die->i_mode);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr);
- break;
- case S_IFCHR:
- case S_IFBLK:
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(die->i_u.rdev));
- break;
- case S_IFIFO:
- case S_IFSOCK:
- inode->i_rdev = 0;
- break;
- default:
- goto bogusimode;
- }
i_uid_write(inode, le32_to_cpu(die->i_uid));
i_gid_write(inode, le32_to_cpu(die->i_gid));
set_nlink(inode, le32_to_cpu(die->i_nlink));
-
- /* extended inode has its own timestamp */
- inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime);
- inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec);
+ inode_set_mtime(inode, le64_to_cpu(die->i_mtime),
+ le32_to_cpu(die->i_mtime_nsec));
inode->i_size = le64_to_cpu(die->i_size);
-
- /* total blocks for compressed files */
- if (erofs_inode_is_data_compressed(vi->datalayout))
- nblks = le32_to_cpu(die->i_u.compressed_blocks);
- else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
- /* fill chunked inode summary info */
- vi->chunkformat = le16_to_cpu(die->i_u.c.format);
- kfree(copied);
- copied = NULL;
break;
case EROFS_INODE_LAYOUT_COMPACT:
vi->inode_isize = sizeof(struct erofs_inode_compact);
- *ofs += vi->inode_isize;
+ ofs += vi->inode_isize;
vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
inode->i_mode = le16_to_cpu(dic->i_mode);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr);
- break;
- case S_IFCHR:
- case S_IFBLK:
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(dic->i_u.rdev));
- break;
- case S_IFIFO:
- case S_IFSOCK:
- inode->i_rdev = 0;
- break;
- default:
- goto bogusimode;
- }
+ copied.i_u = dic->i_u;
i_uid_write(inode, le16_to_cpu(dic->i_uid));
i_gid_write(inode, le16_to_cpu(dic->i_gid));
- set_nlink(inode, le16_to_cpu(dic->i_nlink));
-
- /* use build time for compact inodes */
- inode->i_ctime.tv_sec = sbi->build_time;
- inode->i_ctime.tv_nsec = sbi->build_time_nsec;
+ if (!S_ISDIR(inode->i_mode) &&
+ ((ifmt >> EROFS_I_NLINK_1_BIT) & 1)) {
+ set_nlink(inode, 1);
+ copied.i_nb = dic->i_nb;
+ } else {
+ set_nlink(inode, le16_to_cpu(dic->i_nb.nlink));
+ copied.i_nb.startblk_hi = 0;
+ addrmask = BIT_ULL(32) - 1;
+ }
+ inode_set_mtime(inode, sbi->epoch + le32_to_cpu(dic->i_mtime),
+ sbi->fixed_nsec);
inode->i_size = le32_to_cpu(dic->i_size);
- if (erofs_inode_is_data_compressed(vi->datalayout))
- nblks = le32_to_cpu(dic->i_u.compressed_blocks);
- else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
- vi->chunkformat = le16_to_cpu(dic->i_u.c.format);
break;
default:
- erofs_err(inode->i_sb,
- "unsupported on-disk inode version %u of nid %llu",
+ erofs_err(sb, "unsupported on-disk inode version %u of nid %llu",
erofs_inode_version(ifmt), vi->nid);
err = -EOPNOTSUPP;
goto err_out;
}
+ if (unlikely(inode->i_size < 0)) {
+ erofs_err(sb, "negative i_size @ nid %llu", vi->nid);
+ err = -EFSCORRUPTED;
+ goto err_out;
+ }
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFDIR:
+ vi->dot_omitted = (ifmt >> EROFS_I_DOT_OMITTED_BIT) & 1;
+ fallthrough;
+ case S_IFREG:
+ case S_IFLNK:
+ vi->startblk = le32_to_cpu(copied.i_u.startblk_lo) |
+ ((u64)le16_to_cpu(copied.i_nb.startblk_hi) << 32);
+ if (vi->datalayout == EROFS_INODE_FLAT_PLAIN &&
+ !((vi->startblk ^ EROFS_NULL_ADDR) & addrmask))
+ vi->startblk = EROFS_NULL_ADDR;
+
+ if(S_ISLNK(inode->i_mode)) {
+ err = erofs_fill_symlink(inode, ptr, ofs);
+ if (err)
+ goto err_out;
+ }
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ inode->i_rdev = new_decode_dev(le32_to_cpu(copied.i_u.rdev));
+ break;
+ case S_IFIFO:
+ case S_IFSOCK:
+ inode->i_rdev = 0;
+ break;
+ default:
+ erofs_err(sb, "bogus i_mode (%o) @ nid %llu", inode->i_mode,
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto err_out;
+ }
+
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ inode->i_blocks = le32_to_cpu(copied.i_u.blocks_lo) <<
+ (sb->s_blocksize_bits - 9);
+ else
+ inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
+
if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ /* fill chunked inode summary info */
+ vi->chunkformat = le16_to_cpu(copied.i_u.c.format);
if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
- erofs_err(inode->i_sb,
- "unsupported chunk format %x of nid %llu",
+ erofs_err(sb, "unsupported chunk format %x of nid %llu",
vi->chunkformat, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
}
- vi->chunkbits = LOG_BLOCK_SIZE +
+ vi->chunkbits = sb->s_blocksize_bits +
(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
}
- inode->i_mtime.tv_sec = inode->i_ctime.tv_sec;
- inode->i_atime.tv_sec = inode->i_ctime.tv_sec;
- inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
- inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
+ inode_set_atime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, inode_get_mtime(inode)));
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
- vi->datalayout == EROFS_INODE_FLAT_PLAIN)
+ (vi->datalayout == EROFS_INODE_FLAT_PLAIN ||
+ vi->datalayout == EROFS_INODE_CHUNK_BASED))
inode->i_flags |= S_DAX;
- if (!nblks)
- /* measure inode.i_blocks as generic filesystems */
- inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
- else
- inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK;
- return kaddr;
-
-bogusimode:
- erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu",
- inode->i_mode, vi->nid);
- err = -EFSCORRUPTED;
err_out:
- DBG_BUGON(1);
- kfree(copied);
- erofs_put_metabuf(buf);
- return ERR_PTR(err);
-}
-
-static int erofs_fill_symlink(struct inode *inode, void *kaddr,
- unsigned int m_pofs)
-{
- struct erofs_inode *vi = EROFS_I(inode);
- char *lnk;
-
- /* if it cannot be handled with fast symlink scheme */
- if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
- inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) {
- inode->i_op = &erofs_symlink_iops;
- return 0;
- }
-
- lnk = kmalloc(inode->i_size + 1, GFP_KERNEL);
- if (!lnk)
- return -ENOMEM;
-
- m_pofs += vi->xattr_isize;
- /* inline symlink data shouldn't cross block boundary */
- if (m_pofs + inode->i_size > EROFS_BLKSIZ) {
- kfree(lnk);
- erofs_err(inode->i_sb,
- "inline data cross block boundary @ nid %llu",
- vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- memcpy(lnk, kaddr + m_pofs, inode->i_size);
- lnk[inode->i_size] = '\0';
-
- inode->i_link = lnk;
- inode->i_op = &erofs_fast_symlink_iops;
- return 0;
+ erofs_put_metabuf(&buf);
+ return err;
}
static int erofs_fill_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- void *kaddr;
- unsigned int ofs;
- int err = 0;
+ int err;
trace_erofs_fill_inode(inode);
+ err = erofs_read_inode(inode);
+ if (err)
+ return err;
- /* read inode base data from disk */
- kaddr = erofs_read_inode(&buf, inode, &ofs);
- if (IS_ERR(kaddr))
- return PTR_ERR(kaddr);
-
- /* setup the new inode */
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
- if (erofs_inode_is_data_compressed(vi->datalayout))
- inode->i_fop = &generic_ro_fops;
- else
- inode->i_fop = &erofs_file_fops;
+ inode->i_fop = &erofs_file_fops;
break;
case S_IFDIR:
inode->i_op = &erofs_dir_iops;
@@ -271,105 +222,157 @@ static int erofs_fill_inode(struct inode *inode)
inode_nohighmem(inode);
break;
case S_IFLNK:
- err = erofs_fill_symlink(inode, kaddr, ofs);
- if (err)
- goto out_unlock;
+ if (inode->i_link)
+ inode->i_op = &erofs_fast_symlink_iops;
+ else
+ inode->i_op = &erofs_symlink_iops;
inode_nohighmem(inode);
break;
- case S_IFCHR:
- case S_IFBLK:
- case S_IFIFO:
- case S_IFSOCK:
+ default:
inode->i_op = &erofs_generic_iops;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
- goto out_unlock;
- default:
- err = -EFSCORRUPTED;
- goto out_unlock;
+ return 0;
}
- if (erofs_inode_is_data_compressed(vi->datalayout)) {
- if (!erofs_is_fscache_mode(inode->i_sb))
- err = z_erofs_fill_inode(inode);
- else
- err = -EOPNOTSUPP;
- goto out_unlock;
- }
- inode->i_mapping->a_ops = &erofs_raw_access_aops;
mapping_set_large_folios(inode->i_mapping);
+ if (erofs_inode_is_data_compressed(vi->datalayout)) {
+#ifdef CONFIG_EROFS_FS_ZIP
+ DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT,
+ erofs_info, inode->i_sb,
+ "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
+ inode->i_mapping->a_ops = &z_erofs_aops;
+#else
+ err = -EOPNOTSUPP;
+#endif
+ } else {
+ inode->i_mapping->a_ops = &erofs_aops;
#ifdef CONFIG_EROFS_FS_ONDEMAND
- if (erofs_is_fscache_mode(inode->i_sb))
- inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+ if (erofs_is_fscache_mode(inode->i_sb))
+ inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+#endif
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb)))
+ inode->i_mapping->a_ops = &erofs_fileio_aops;
#endif
+ }
-out_unlock:
- erofs_put_metabuf(&buf);
return err;
}
/*
- * erofs nid is 64bits, but i_ino is 'unsigned long', therefore
- * we should do more for 32-bit platform to find the right inode.
+ * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
+ * so that it will fit.
*/
-static int erofs_ilookup_test_actor(struct inode *inode, void *opaque)
+static ino_t erofs_squash_ino(struct super_block *sb, erofs_nid_t nid)
{
- const erofs_nid_t nid = *(erofs_nid_t *)opaque;
+ u64 ino64 = erofs_nid_to_ino64(EROFS_SB(sb), nid);
- return EROFS_I(inode)->nid == nid;
+ if (sizeof(ino_t) < sizeof(erofs_nid_t))
+ ino64 ^= ino64 >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8;
+ return (ino_t)ino64;
}
-static int erofs_iget_set_actor(struct inode *inode, void *opaque)
+static int erofs_iget5_eq(struct inode *inode, void *opaque)
+{
+ return EROFS_I(inode)->nid == *(erofs_nid_t *)opaque;
+}
+
+static int erofs_iget5_set(struct inode *inode, void *opaque)
{
const erofs_nid_t nid = *(erofs_nid_t *)opaque;
- inode->i_ino = erofs_inode_hash(nid);
+ inode->i_ino = erofs_squash_ino(inode->i_sb, nid);
+ EROFS_I(inode)->nid = nid;
return 0;
}
struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid)
{
- const unsigned long hashval = erofs_inode_hash(nid);
struct inode *inode;
- inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor,
- erofs_iget_set_actor, &nid);
+ inode = iget5_locked(sb, erofs_squash_ino(sb, nid), erofs_iget5_eq,
+ erofs_iget5_set, &nid);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
- int err;
- struct erofs_inode *vi = EROFS_I(inode);
+ if (inode_state_read_once(inode) & I_NEW) {
+ int err = erofs_fill_inode(inode);
- vi->nid = nid;
-
- err = erofs_fill_inode(inode);
- if (!err) {
- unlock_new_inode(inode);
- } else {
+ if (err) {
iget_failed(inode);
- inode = ERR_PTR(err);
+ return ERR_PTR(err);
}
+ unlock_new_inode(inode);
}
return inode;
}
-int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags)
{
struct inode *const inode = d_inode(path->dentry);
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ bool compressed =
+ erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout);
- if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+ if (compressed)
stat->attributes |= STATX_ATTR_COMPRESSED;
-
stat->attributes |= STATX_ATTR_IMMUTABLE;
stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE);
- generic_fillattr(mnt_userns, inode, stat);
+ /*
+ * Return the DIO alignment restrictions if requested.
+ *
+ * In EROFS, STATX_DIOALIGN is only supported in bdev-based mode
+ * and uncompressed inodes, otherwise we report no DIO support.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ stat->result_mask |= STATX_DIOALIGN;
+ if (bdev && !compressed) {
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ }
+ }
+ generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
+static int erofs_ioctl_get_volume_label(struct inode *inode, void __user *arg)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ int ret;
+
+ if (!sbi->volume_name)
+ ret = clear_user(arg, 1);
+ else
+ ret = copy_to_user(arg, sbi->volume_name,
+ strlen(sbi->volume_name));
+ return ret ? -EFAULT : 0;
+}
+
+long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case FS_IOC_GETFSLABEL:
+ return erofs_ioctl_get_volume_label(inode, argp);
+ default:
+ return -ENOTTY;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ return erofs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
const struct inode_operations erofs_generic_iops = {
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index bb8501c0ff5b..f7f622836198 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -8,34 +8,27 @@
#define __EROFS_INTERNAL_H
#include <linux/fs.h>
+#include <linux/dax.h>
#include <linux/dcache.h>
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
-#include <linux/buffer_head.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/iomap.h>
#include "erofs_fs.h"
-/* redefine pr_fmt "erofs: " */
-#undef pr_fmt
-#define pr_fmt(fmt) "erofs: " fmt
-
-__printf(3, 4) void _erofs_err(struct super_block *sb,
- const char *function, const char *fmt, ...);
+__printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...);
#define erofs_err(sb, fmt, ...) \
- _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__)
-__printf(3, 4) void _erofs_info(struct super_block *sb,
- const char *function, const char *fmt, ...);
+ _erofs_printk(sb, KERN_ERR fmt "\n", ##__VA_ARGS__)
#define erofs_info(sb, fmt, ...) \
- _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__)
+ _erofs_printk(sb, KERN_INFO fmt "\n", ##__VA_ARGS__)
+
#ifdef CONFIG_EROFS_FS_DEBUG
-#define erofs_dbg(x, ...) pr_debug(x "\n", ##__VA_ARGS__)
#define DBG_BUGON BUG_ON
#else
-#define erofs_dbg(x, ...) ((void)0)
#define DBG_BUGON(x) ((void)(x))
#endif /* !CONFIG_EROFS_FS_DEBUG */
@@ -44,18 +37,17 @@ __printf(3, 4) void _erofs_info(struct super_block *sb,
typedef u64 erofs_nid_t;
typedef u64 erofs_off_t;
-/* data type for filesystem-wide blocks number */
-typedef u32 erofs_blk_t;
+typedef u64 erofs_blk_t;
struct erofs_device_info {
char *path;
struct erofs_fscache *fscache;
- struct block_device *bdev;
+ struct file *file;
struct dax_device *dax_dev;
- u64 dax_part_off;
+ u64 fsoff, dax_part_off;
- u32 blocks;
- u32 mapped_blkaddr;
+ erofs_blk_t blocks;
+ erofs_blk_t uniaddr;
};
enum {
@@ -65,15 +57,12 @@ enum {
};
struct erofs_mount_opts {
-#ifdef CONFIG_EROFS_FS_ZIP
/* current strategy of how to use managed cache */
unsigned char cache_strategy;
/* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */
unsigned int sync_decompress;
-
/* threshold for decompression synchronously */
unsigned int max_sync_decompress_pages;
-#endif
unsigned int mount_opt;
};
@@ -82,13 +71,7 @@ struct erofs_dev_context {
struct rw_semaphore rwsem;
unsigned int extra_devices;
-};
-
-struct erofs_fs_context {
- struct erofs_mount_opts opt;
- struct erofs_dev_context *devs;
- char *fsid;
- char *domain_id;
+ bool flatdev;
};
/* all filesystem-wide lz4 configurations */
@@ -108,13 +91,22 @@ struct erofs_domain {
struct erofs_fscache {
struct fscache_cookie *cookie;
- struct inode *inode;
- struct inode *anon_inode;
+ struct inode *inode; /* anonymous inode for the blob */
+
+ /* used for share domain mode */
struct erofs_domain *domain;
+ struct list_head node;
+ refcount_t ref;
char *name;
};
+struct erofs_xattr_prefix_item {
+ struct erofs_xattr_long_prefix *prefix;
+ u8 infix_len;
+};
+
struct erofs_sb_info {
+ struct erofs_device_info dif0;
struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
/* list for all registered superblocks, mainly for shrinker */
@@ -131,44 +123,47 @@ struct erofs_sb_info {
struct inode *managed_cache;
struct erofs_sb_lz4_info lz4;
- struct inode *packed_inode;
#endif /* CONFIG_EROFS_FS_ZIP */
+ struct inode *packed_inode;
+ struct inode *metabox_inode;
struct erofs_dev_context *devs;
- struct dax_device *dax_dev;
- u64 dax_part_off;
u64 total_blocks;
- u32 primarydevice_blocks;
u32 meta_blkaddr;
#ifdef CONFIG_EROFS_FS_XATTR
u32 xattr_blkaddr;
+ u32 xattr_prefix_start;
+ u8 xattr_prefix_count;
+ struct erofs_xattr_prefix_item *xattr_prefixes;
+ unsigned int xattr_filter_reserved;
#endif
u16 device_id_mask; /* valid bits of device id to be used */
- /* inode slot unit size in bit shift */
- unsigned char islotbits;
+ unsigned char islotbits; /* inode slot unit size in bit shift */
+ unsigned char blkszbits; /* filesystem block size in bit shift */
u32 sb_size; /* total superblock size */
- u32 build_time_nsec;
- u64 build_time;
+ u32 fixed_nsec;
+ s64 epoch;
/* what we really care is nid, rather than ino.. */
erofs_nid_t root_nid;
+ erofs_nid_t packed_nid;
+ erofs_nid_t metabox_nid;
/* used for statfs, f_files - f_favail */
u64 inos;
- u8 uuid[16]; /* 128-bit uuid for volume */
- u8 volume_name[16]; /* volume name */
+ char *volume_name;
u32 feature_compat;
u32 feature_incompat;
/* sysfs support */
struct kobject s_kobj; /* /sys/fs/erofs/<devname> */
struct completion s_kobj_unregister;
+ erofs_off_t dir_ra_bytes;
/* fscache support */
struct fscache_volume *volume;
- struct erofs_fscache *s_fscache;
struct erofs_domain *domain;
char *fsid;
char *domain_id;
@@ -182,14 +177,21 @@ struct erofs_sb_info {
#define EROFS_MOUNT_POSIX_ACL 0x00000020
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
#define EROFS_MOUNT_DAX_NEVER 0x00000080
+#define EROFS_MOUNT_DIRECT_IO 0x00000100
#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
+static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi)
+{
+ return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file;
+}
+
static inline bool erofs_is_fscache_mode(struct super_block *sb)
{
- return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev;
+ return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) &&
+ !erofs_is_fileio_mode(EROFS_SB(sb)) && !sb->s_bdev;
}
enum {
@@ -198,83 +200,19 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
-
-/* basic unit of the workstation of a super_block */
-struct erofs_workgroup {
- /* the workgroup index in the workstation */
- pgoff_t index;
-
- /* overall workgroup reference count */
- atomic_t refcount;
-};
-
-static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
- int val)
-{
- preempt_disable();
- if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
- preempt_enable();
- return false;
- }
- return true;
-}
-
-static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
- int orig_val)
-{
- /*
- * other observers should notice all modifications
- * in the freezing period.
- */
- smp_mb();
- atomic_set(&grp->refcount, orig_val);
- preempt_enable();
-}
-
-static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
-{
- return atomic_cond_read_relaxed(&grp->refcount,
- VAL != EROFS_LOCKED_MAGIC);
-}
-
-/* we strictly follow PAGE_SIZE and no buffer head yet */
-#define LOG_BLOCK_SIZE PAGE_SHIFT
-
-#undef LOG_SECTORS_PER_BLOCK
-#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9)
-
-#undef SECTORS_PER_BLOCK
-#define SECTORS_PER_BLOCK (1 << SECTORS_PER_BLOCK)
-
-#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE)
-
-#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ)
-#error erofs cannot be used in this platform
-#endif
-
-enum erofs_kmap_type {
- EROFS_NO_KMAP, /* don't map the buffer */
- EROFS_KMAP, /* use kmap_local_page() to map the buffer */
-};
-
struct erofs_buf {
+ struct address_space *mapping;
+ struct file *file;
+ u64 off;
struct page *page;
void *base;
- enum erofs_kmap_type kmap_type;
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
-#define ROOT_NID(sb) ((sb)->root_nid)
-
-#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ)
-#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ)
-#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ)
-
-static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid)
-{
- return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits);
-}
+#define erofs_blknr(sb, pos) ((erofs_blk_t)((pos) >> (sb)->s_blocksize_bits))
+#define erofs_blkoff(sb, pos) ((pos) & ((sb)->s_blocksize - 1))
+#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits)
+#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits)
#define EROFS_FEATURE_FUNCS(name, compat, feature) \
static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
@@ -291,7 +229,30 @@ EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2)
EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
+EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES)
+EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT)
+EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
+EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
+EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX)
+EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX)
+
+static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid)
+{
+ if (!erofs_sb_has_metabox(sbi))
+ return nid;
+
+ /*
+ * When metadata compression is enabled, avoid generating excessively
+ * large inode numbers for metadata-compressed inodes. Shift NIDs in
+ * the 31-62 bit range left by one and move the metabox flag to bit 31.
+ *
+ * Note: on-disk NIDs remain unchanged as they are primarily used for
+ * compatibility with non-LFS 32-bit applications.
+ */
+ return ((nid << 1) & GENMASK_ULL(63, 32)) | (nid & GENMASK(30, 0)) |
+ ((nid >> EROFS_DIRENT_NID_METABOX_BIT) << 31);
+}
/* atomic flag definitions */
#define EROFS_I_EA_INITED_BIT 0
@@ -301,6 +262,9 @@ EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
#define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1)
#define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2)
+/* default readahead size of directories */
+#define EROFS_DIR_RA_BYTES 16384
+
struct erofs_inode {
erofs_nid_t nid;
@@ -309,13 +273,15 @@ struct erofs_inode {
unsigned char datalayout;
unsigned char inode_isize;
- unsigned short xattr_isize;
+ bool dot_omitted;
+ unsigned int xattr_isize;
+ unsigned int xattr_name_filter;
unsigned int xattr_shared_count;
unsigned int *xattr_shared_xattrs;
union {
- erofs_blk_t raw_blkaddr;
+ erofs_blk_t startblk;
struct {
unsigned short chunkformat;
unsigned char chunkbits;
@@ -324,15 +290,13 @@ struct erofs_inode {
struct {
unsigned short z_advise;
unsigned char z_algorithmtype[2];
- unsigned char z_logical_clusterbits;
- unsigned long z_tailextent_headlcn;
+ unsigned char z_lclusterbits;
union {
- struct {
- erofs_off_t z_idataoff;
- unsigned short z_idata_size;
- };
- erofs_off_t z_fragmentoff;
+ u64 z_tailextent_headlcn;
+ u64 z_extents;
};
+ erofs_off_t z_fragmentoff;
+ unsigned short z_idata_size;
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
@@ -340,73 +304,57 @@ struct erofs_inode {
struct inode vfs_inode;
};
-#define EROFS_I(ptr) \
- container_of(ptr, struct erofs_inode, vfs_inode)
+#define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode)
-static inline unsigned long erofs_inode_datablocks(struct inode *inode)
+static inline bool erofs_inode_in_metabox(struct inode *inode)
{
- /* since i_size cannot be changed */
- return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
+ return EROFS_I(inode)->nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT);
}
-static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit,
- unsigned int bits)
+static inline erofs_off_t erofs_iloc(struct inode *inode)
{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ erofs_nid_t nid_lo = EROFS_I(inode)->nid & EROFS_DIRENT_NID_MASK;
- return (value >> bit) & ((1 << bits) - 1);
+ if (erofs_inode_in_metabox(inode))
+ return nid_lo << sbi->islotbits;
+ return erofs_pos(inode->i_sb, sbi->meta_blkaddr) +
+ (nid_lo << sbi->islotbits);
}
-
-static inline unsigned int erofs_inode_version(unsigned int value)
+static inline unsigned int erofs_inode_version(unsigned int ifmt)
{
- return erofs_bitrange(value, EROFS_I_VERSION_BIT,
- EROFS_I_VERSION_BITS);
+ return (ifmt >> EROFS_I_VERSION_BIT) & EROFS_I_VERSION_MASK;
}
-static inline unsigned int erofs_inode_datalayout(unsigned int value)
+static inline unsigned int erofs_inode_datalayout(unsigned int ifmt)
{
- return erofs_bitrange(value, EROFS_I_DATALAYOUT_BIT,
- EROFS_I_DATALAYOUT_BITS);
+ return (ifmt >> EROFS_I_DATALAYOUT_BIT) & EROFS_I_DATALAYOUT_MASK;
}
-/*
- * Different from grab_cache_page_nowait(), reclaiming is never triggered
- * when allocating new pages.
- */
-static inline
-struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
- pgoff_t index)
+/* reclaiming is never triggered when allocating new folios. */
+static inline struct folio *erofs_grab_folio_nowait(struct address_space *as,
+ pgoff_t index)
{
- return pagecache_get_page(mapping, index,
+ return __filemap_get_folio(as, index,
FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
- readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
+ readahead_gfp_mask(as) & ~__GFP_RECLAIM);
}
-extern const struct super_operations erofs_sops;
-extern struct file_system_type erofs_fs_type;
-
-extern const struct address_space_operations erofs_raw_access_aops;
-extern const struct address_space_operations z_erofs_aops;
-
-enum {
- BH_Encoded = BH_PrivateStart,
- BH_FullMapped,
- BH_Fragment,
- BH_Partialref,
-};
-
/* Has a disk mapping */
-#define EROFS_MAP_MAPPED (1 << BH_Mapped)
+#define EROFS_MAP_MAPPED 0x0001
/* Located in metadata (could be copied from bd_inode) */
-#define EROFS_MAP_META (1 << BH_Meta)
+#define EROFS_MAP_META 0x0002
/* The extent is encoded */
-#define EROFS_MAP_ENCODED (1 << BH_Encoded)
+#define EROFS_MAP_ENCODED 0x0004
/* The length of extent is full */
-#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
+#define EROFS_MAP_FULL_MAPPED 0x0008
/* Located in the special packed inode */
-#define EROFS_MAP_FRAGMENT (1 << BH_Fragment)
+#define __EROFS_MAP_FRAGMENT 0x0010
/* The extent refers to partial decompressed data */
-#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref)
+#define EROFS_MAP_PARTIAL_REF 0x0020
+
+#define EROFS_MAP_FRAGMENT (EROFS_MAP_MAPPED | __EROFS_MAP_FRAGMENT)
struct erofs_map_blocks {
struct erofs_buf buf;
@@ -419,17 +367,15 @@ struct erofs_map_blocks {
unsigned int m_flags;
};
-/* Flags used by erofs_map_blocks_flatmode() */
-#define EROFS_GET_BLOCKS_RAW 0x0001
/*
* Used to get the exact decompressed length, e.g. fiemap (consider lookback
* approach instead if possible since it's more metadata lightweight.)
*/
-#define EROFS_GET_BLOCKS_FIEMAP 0x0002
+#define EROFS_GET_BLOCKS_FIEMAP 0x0001
/* Used to map the whole extent if non-negligible data is requested for LZMA */
-#define EROFS_GET_BLOCKS_READMORE 0x0004
+#define EROFS_GET_BLOCKS_READMORE 0x0002
/* Used to map tail extent for tailpacking inline or fragment pcluster */
-#define EROFS_GET_BLOCKS_FINDTAIL 0x0008
+#define EROFS_GET_BLOCKS_FINDTAIL 0x0004
enum {
Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
@@ -437,76 +383,59 @@ enum {
Z_EROFS_COMPRESSION_RUNTIME_MAX
};
-/* zmap.c */
-extern const struct iomap_ops z_erofs_iomap_report_ops;
-
-#ifdef CONFIG_EROFS_FS_ZIP
-int z_erofs_fill_inode(struct inode *inode);
-int z_erofs_map_blocks_iter(struct inode *inode,
- struct erofs_map_blocks *map,
- int flags);
-#else
-static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; }
-static inline int z_erofs_map_blocks_iter(struct inode *inode,
- struct erofs_map_blocks *map,
- int flags)
-{
- return -EOPNOTSUPP;
-}
-#endif /* !CONFIG_EROFS_FS_ZIP */
-
struct erofs_map_dev {
- struct erofs_fscache *m_fscache;
+ struct super_block *m_sb;
+ struct erofs_device_info *m_dif;
struct block_device *m_bdev;
- struct dax_device *m_daxdev;
- u64 m_dax_part_off;
erofs_off_t m_pa;
unsigned int m_deviceid;
};
-/* data.c */
+extern const struct super_operations erofs_sops;
+
+extern const struct address_space_operations erofs_aops;
+extern const struct address_space_operations erofs_fileio_aops;
+extern const struct address_space_operations z_erofs_aops;
+extern const struct address_space_operations erofs_fscache_access_aops;
+
+extern const struct inode_operations erofs_generic_iops;
+extern const struct inode_operations erofs_symlink_iops;
+extern const struct inode_operations erofs_fast_symlink_iops;
+extern const struct inode_operations erofs_dir_iops;
+
extern const struct file_operations erofs_file_fops;
+extern const struct file_operations erofs_dir_fops;
+
+extern const struct iomap_ops z_erofs_iomap_report_ops;
+
+/* flags for erofs_fscache_register_cookie() */
+#define EROFS_REG_COOKIE_SHARE 0x0001
+#define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002
+
+void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
+ erofs_off_t *offset, int *lengthp);
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
-void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
- erofs_blk_t blkaddr, enum erofs_kmap_type type);
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap);
+int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ bool in_metabox);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_blk_t blkaddr, enum erofs_kmap_type type);
+ erofs_off_t offset, bool in_metabox);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
-int erofs_map_blocks(struct inode *inode,
- struct erofs_map_blocks *map, int flags);
-
-/* inode.c */
-static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
-{
-#if BITS_PER_LONG == 32
- return (nid >> 32) ^ (nid & 0xffffffff);
-#else
- return nid;
-#endif
-}
-
-extern const struct inode_operations erofs_generic_iops;
-extern const struct inode_operations erofs_symlink_iops;
-extern const struct inode_operations erofs_fast_symlink_iops;
-
+int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map);
+void erofs_onlinefolio_init(struct folio *folio);
+void erofs_onlinefolio_split(struct folio *folio);
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty);
struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
-int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags);
-
-/* namei.c */
-extern const struct inode_operations erofs_dir_iops;
-
int erofs_namei(struct inode *dir, const struct qstr *name,
erofs_nid_t *nid, unsigned int *d_type);
-/* dir.c */
-extern const struct file_operations erofs_dir_fops;
-
static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count)
{
int retried = 0;
@@ -522,23 +451,17 @@ static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count)
return NULL;
}
-/* pcpubuf.c */
-void *erofs_get_pcpubuf(unsigned int requiredpages);
-void erofs_put_pcpubuf(void *ptr);
-int erofs_pcpubuf_growsize(unsigned int nrpages);
-void erofs_pcpubuf_init(void);
-void erofs_pcpubuf_exit(void);
-
-/* sysfs.c */
int erofs_register_sysfs(struct super_block *sb);
void erofs_unregister_sysfs(struct super_block *sb);
int __init erofs_init_sysfs(void);
void erofs_exit_sysfs(void);
-/* utils.c / zdata.c */
-struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
-static inline void erofs_pagepool_add(struct page **pagepool,
- struct page *page)
+struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv);
+static inline struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
+{
+ return __erofs_allocpage(pagepool, gfp, false);
+}
+static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
{
set_page_private(page, (unsigned long)*pagepool);
*pagepool = page;
@@ -546,78 +469,53 @@ static inline void erofs_pagepool_add(struct page **pagepool,
void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
-int erofs_workgroup_put(struct erofs_workgroup *grp);
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index);
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp);
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+
+extern atomic_long_t erofs_global_shrink_cnt;
void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
int __init erofs_init_shrinker(void);
void erofs_exit_shrinker(void);
-int __init z_erofs_init_zip_subsystem(void);
-void z_erofs_exit_zip_subsystem(void);
-int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
- struct erofs_workgroup *egrp);
-int erofs_try_to_free_cached_page(struct page *page);
-int z_erofs_load_lz4_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lz4_cfgs *lz4, int len);
+int __init z_erofs_init_subsystem(void);
+void z_erofs_exit_subsystem(void);
+int z_erofs_init_super(struct super_block *sb);
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
+ unsigned long nr_shrink);
+int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
+ int flags);
+void *z_erofs_get_gbuf(unsigned int requiredpages);
+void z_erofs_put_gbuf(void *ptr);
+int z_erofs_gbuf_growsize(unsigned int nrpages);
+int __init z_erofs_gbuf_init(void);
+void z_erofs_gbuf_exit(void);
+int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
#else
static inline void erofs_shrinker_register(struct super_block *sb) {}
static inline void erofs_shrinker_unregister(struct super_block *sb) {}
static inline int erofs_init_shrinker(void) { return 0; }
static inline void erofs_exit_shrinker(void) {}
-static inline int z_erofs_init_zip_subsystem(void) { return 0; }
-static inline void z_erofs_exit_zip_subsystem(void) {}
-static inline int z_erofs_load_lz4_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lz4_cfgs *lz4, int len)
-{
- if (lz4 || dsb->u1.lz4_max_distance) {
- erofs_err(sb, "lz4 algorithm isn't enabled");
- return -EINVAL;
- }
- return 0;
-}
+static inline int z_erofs_init_subsystem(void) { return 0; }
+static inline void z_erofs_exit_subsystem(void) {}
+static inline int z_erofs_init_super(struct super_block *sb) { return 0; }
#endif /* !CONFIG_EROFS_FS_ZIP */
-#ifdef CONFIG_EROFS_FS_ZIP_LZMA
-int z_erofs_lzma_init(void);
-void z_erofs_lzma_exit(void);
-int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lzma_cfgs *lzma, int size);
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev);
+void erofs_fileio_submit_bio(struct bio *bio);
#else
-static inline int z_erofs_lzma_init(void) { return 0; }
-static inline int z_erofs_lzma_exit(void) { return 0; }
-static inline int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lzma_cfgs *lzma, int size) {
- if (lzma) {
- erofs_err(sb, "lzma algorithm isn't enabled");
- return -EINVAL;
- }
- return 0;
-}
-#endif /* !CONFIG_EROFS_FS_ZIP */
-
-/* flags for erofs_fscache_register_cookie() */
-#define EROFS_REG_COOKIE_NEED_INODE 1
-#define EROFS_REG_COOKIE_NEED_NOEXIST 2
+static inline struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) { return NULL; }
+static inline void erofs_fileio_submit_bio(struct bio *bio) {}
+#endif
-/* fscache.c */
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
void erofs_fscache_unregister_fs(struct super_block *sb);
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
- char *name,
- unsigned int flags);
+ char *name, unsigned int flags);
void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
-
-extern const struct address_space_operations erofs_fscache_access_aops;
+struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev);
+void erofs_fscache_submit_bio(struct bio *bio);
#else
static inline int erofs_fscache_register_fs(struct super_block *sb)
{
@@ -627,8 +525,7 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {}
static inline
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
- char *name,
- unsigned int flags)
+ char *name, unsigned int flags)
{
return ERR_PTR(-EOPNOTSUPP);
}
@@ -636,8 +533,14 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache)
{
}
+static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { return NULL; }
+static inline void erofs_fscache_submit_bio(struct bio *bio) {}
#endif
+long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg);
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index b64a108fac92..f7cf4f41af28 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -5,7 +5,6 @@
* Copyright (C) 2022, Alibaba Cloud
*/
#include "xattr.h"
-
#include <trace/events/erofs.h>
struct erofs_qstr {
@@ -87,28 +86,23 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
return ERR_PTR(-ENOENT);
}
-static void *find_target_block_classic(struct erofs_buf *target,
- struct inode *dir,
- struct erofs_qstr *name,
- int *_ndirents)
+static void *erofs_find_target_block(struct erofs_buf *target,
+ struct inode *dir, struct erofs_qstr *name, int *_ndirents)
{
- unsigned int startprfx, endprfx;
- int head, back;
+ unsigned int bsz = i_blocksize(dir);
+ int head = 0, back = erofs_iblks(dir) - 1;
+ unsigned int startprfx = 0, endprfx = 0;
void *candidate = ERR_PTR(-ENOENT);
- startprfx = endprfx = 0;
- head = 0;
- back = erofs_inode_datablocks(dir) - 1;
-
while (head <= back) {
const int mid = head + (back - head) / 2;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_dirent *de;
- de = erofs_bread(&buf, dir, mid, EROFS_KMAP);
+ buf.mapping = dir->i_mapping;
+ de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), true);
if (!IS_ERR(de)) {
- const int nameoff = nameoff_from_disk(de->nameoff,
- EROFS_BLKSIZ);
+ const int nameoff = nameoff_from_disk(de->nameoff, bsz);
const int ndirents = nameoff / sizeof(*de);
int diff;
unsigned int matched;
@@ -128,33 +122,32 @@ static void *find_target_block_classic(struct erofs_buf *target,
dname.name = (u8 *)de + nameoff;
if (ndirents == 1)
- dname.end = (u8 *)de + EROFS_BLKSIZ;
+ dname.end = (u8 *)de + bsz;
else
dname.end = (u8 *)de +
- nameoff_from_disk(de[1].nameoff,
- EROFS_BLKSIZ);
+ nameoff_from_disk(de[1].nameoff, bsz);
/* string comparison without already matched prefix */
diff = erofs_dirnamecmp(name, &dname, &matched);
- if (!diff) {
- *_ndirents = 0;
- goto out;
- } else if (diff > 0) {
- head = mid + 1;
- startprfx = matched;
-
- if (!IS_ERR(candidate))
- erofs_put_metabuf(target);
- *target = buf;
- candidate = de;
- *_ndirents = ndirents;
- } else {
+ if (diff < 0) {
erofs_put_metabuf(&buf);
-
back = mid - 1;
endprfx = matched;
+ continue;
}
+
+ if (!IS_ERR(candidate))
+ erofs_put_metabuf(target);
+ *target = buf;
+ if (!diff) {
+ *_ndirents = 0;
+ return de;
+ }
+ head = mid + 1;
+ startprfx = matched;
+ candidate = de;
+ *_ndirents = ndirents;
continue;
}
out: /* free if the candidate is valid */
@@ -178,15 +171,16 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
qn.name = name->name;
qn.end = name->name + name->len;
+ buf.mapping = dir->i_mapping;
ndirents = 0;
-
- de = find_target_block_classic(&buf, dir, &qn, &ndirents);
+ de = erofs_find_target_block(&buf, dir, &qn, &ndirents);
if (IS_ERR(de))
return PTR_ERR(de);
if (ndirents)
- de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents);
+ de = find_target_dirent(&qn, (u8 *)de, i_blocksize(dir),
+ ndirents);
if (!IS_ERR(de)) {
*nid = le64_to_cpu(de->nid);
@@ -211,16 +205,13 @@ static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry,
err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
- if (err == -ENOENT) {
+ if (err == -ENOENT)
/* negative dentry */
inode = NULL;
- } else if (err) {
+ else if (err)
inode = ERR_PTR(err);
- } else {
- erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__,
- dentry, nid, d_type);
+ else
inode = erofs_iget(dir->i_sb, nid);
- }
return d_splice_alias(inode, dentry);
}
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
deleted file mode 100644
index a2efd833d1b6..000000000000
--- a/fs/erofs/pcpubuf.c
+++ /dev/null
@@ -1,148 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) Gao Xiang <xiang@kernel.org>
- *
- * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
- * per-CPU virtual memory (in pages) in advance to store such inplace I/O
- * data if inplace decompression is failed (due to unmet inplace margin for
- * example).
- */
-#include "internal.h"
-
-struct erofs_pcpubuf {
- raw_spinlock_t lock;
- void *ptr;
- struct page **pages;
- unsigned int nrpages;
-};
-
-static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
-
-void *erofs_get_pcpubuf(unsigned int requiredpages)
- __acquires(pcb->lock)
-{
- struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
-
- raw_spin_lock(&pcb->lock);
- /* check if the per-CPU buffer is too small */
- if (requiredpages > pcb->nrpages) {
- raw_spin_unlock(&pcb->lock);
- put_cpu_var(erofs_pcb);
- /* (for sparse checker) pretend pcb->lock is still taken */
- __acquire(pcb->lock);
- return NULL;
- }
- return pcb->ptr;
-}
-
-void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
-{
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
-
- DBG_BUGON(pcb->ptr != ptr);
- raw_spin_unlock(&pcb->lock);
- put_cpu_var(erofs_pcb);
-}
-
-/* the next step: support per-CPU page buffers hotplug */
-int erofs_pcpubuf_growsize(unsigned int nrpages)
-{
- static DEFINE_MUTEX(pcb_resize_mutex);
- static unsigned int pcb_nrpages;
- struct page *pagepool = NULL;
- int delta, cpu, ret, i;
-
- mutex_lock(&pcb_resize_mutex);
- delta = nrpages - pcb_nrpages;
- ret = 0;
- /* avoid shrinking pcpubuf, since no idea how many fses rely on */
- if (delta <= 0)
- goto out;
-
- for_each_possible_cpu(cpu) {
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
- struct page **pages, **oldpages;
- void *ptr, *old_ptr;
-
- pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- break;
- }
-
- for (i = 0; i < nrpages; ++i) {
- pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
- if (!pages[i]) {
- ret = -ENOMEM;
- oldpages = pages;
- goto free_pagearray;
- }
- }
- ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
- if (!ptr) {
- ret = -ENOMEM;
- oldpages = pages;
- goto free_pagearray;
- }
- raw_spin_lock(&pcb->lock);
- old_ptr = pcb->ptr;
- pcb->ptr = ptr;
- oldpages = pcb->pages;
- pcb->pages = pages;
- i = pcb->nrpages;
- pcb->nrpages = nrpages;
- raw_spin_unlock(&pcb->lock);
-
- if (!oldpages) {
- DBG_BUGON(old_ptr);
- continue;
- }
-
- if (old_ptr)
- vunmap(old_ptr);
-free_pagearray:
- while (i)
- erofs_pagepool_add(&pagepool, oldpages[--i]);
- kfree(oldpages);
- if (ret)
- break;
- }
- pcb_nrpages = nrpages;
- erofs_release_pages(&pagepool);
-out:
- mutex_unlock(&pcb_resize_mutex);
- return ret;
-}
-
-void erofs_pcpubuf_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
-
- raw_spin_lock_init(&pcb->lock);
- }
-}
-
-void erofs_pcpubuf_exit(void)
-{
- int cpu, i;
-
- for_each_possible_cpu(cpu) {
- struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
-
- if (pcb->ptr) {
- vunmap(pcb->ptr);
- pcb->ptr = NULL;
- }
- if (!pcb->pages)
- continue;
-
- for (i = 0; i < pcb->nrpages; ++i)
- if (pcb->pages[i])
- put_page(pcb->pages[i]);
- kfree(pcb->pages);
- pcb->pages = NULL;
- }
-}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 626a615dafc2..937a215f626c 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -4,16 +4,13 @@
* https://www.huawei.com/
* Copyright (C) 2021, Alibaba Cloud
*/
-#include <linux/module.h>
-#include <linux/buffer_head.h>
#include <linux/statfs.h>
-#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/crc32c.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
-#include <linux/dax.h>
#include <linux/exportfs.h>
+#include <linux/backing-dev.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -21,58 +18,42 @@
static struct kmem_cache *erofs_inode_cachep __read_mostly;
-void _erofs_err(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void _erofs_printk(struct super_block *sb, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
+ int level;
va_start(args, fmt);
- vaf.fmt = fmt;
+ level = printk_get_level(fmt);
+ vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
-
- pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf);
- va_end(args);
-}
-
-void _erofs_info(struct super_block *sb, const char *function,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- pr_info("(device %s): %pV", sb->s_id, &vaf);
+ if (sb)
+ printk("%c%cerofs (device %s): %pV",
+ KERN_SOH_ASCII, level, sb->s_id, &vaf);
+ else
+ printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf);
va_end(args);
}
static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata)
{
- struct erofs_super_block *dsb;
- u32 expected_crc, crc;
-
- dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET,
- EROFS_BLKSIZ - EROFS_SUPER_OFFSET, GFP_KERNEL);
- if (!dsb)
- return -ENOMEM;
+ struct erofs_super_block *dsb = sbdata + EROFS_SUPER_OFFSET;
+ u32 len = 1 << EROFS_SB(sb)->blkszbits, crc;
- expected_crc = le32_to_cpu(dsb->checksum);
- dsb->checksum = 0;
- /* to allow for x86 boot sectors and other oddities. */
- crc = crc32c(~0, dsb, EROFS_BLKSIZ - EROFS_SUPER_OFFSET);
- kfree(dsb);
+ if (len > EROFS_SUPER_OFFSET)
+ len -= EROFS_SUPER_OFFSET;
+ len -= offsetof(struct erofs_super_block, checksum) +
+ sizeof(dsb->checksum);
- if (crc != expected_crc) {
- erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected",
- crc, expected_crc);
- return -EBADMSG;
- }
- return 0;
+ /* skip .magic(pre-verified) and .checksum(0) fields */
+ crc = crc32c(0x5045B54A, (&dsb->checksum) + 1, len);
+ if (crc == le32_to_cpu(dsb->checksum))
+ return 0;
+ erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected",
+ crc, le32_to_cpu(dsb->checksum));
+ return -EBADMSG;
}
static void erofs_inode_init_once(void *ptr)
@@ -99,45 +80,25 @@ static void erofs_free_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
- /* be careful of RCU symlink path */
if (inode->i_op == &erofs_fast_symlink_iops)
kfree(inode->i_link);
kfree(vi->xattr_shared_xattrs);
-
kmem_cache_free(erofs_inode_cachep, vi);
}
-static bool check_layout_compatibility(struct super_block *sb,
- struct erofs_super_block *dsb)
-{
- const unsigned int feature = le32_to_cpu(dsb->feature_incompat);
-
- EROFS_SB(sb)->feature_incompat = feature;
-
- /* check if current kernel meets all mandatory requirements */
- if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) {
- erofs_err(sb,
- "unidentified incompatible feature %x, please upgrade kernel version",
- feature & ~EROFS_ALL_FEATURE_INCOMPAT);
- return false;
- }
- return true;
-}
-
-#ifdef CONFIG_EROFS_FS_ZIP
/* read variable-sized metadata, offset will be aligned by 4-byte */
-static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
- erofs_off_t *offset, int *lengthp)
+void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
+ erofs_off_t *offset, int *lengthp)
{
u8 *buffer, *ptr;
int len, i, cnt;
*offset = round_up(*offset, 4);
- ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, true);
if (IS_ERR(ptr))
return ptr;
- len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]);
+ len = le16_to_cpu(*(__le16 *)ptr);
if (!len)
len = U16_MAX + 1;
buffer = kmalloc(len, GFP_KERNEL);
@@ -147,76 +108,28 @@ static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
*lengthp = len;
for (i = 0; i < len; i += cnt) {
- cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i);
- ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset),
- EROFS_KMAP);
+ cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset),
+ len - i);
+ ptr = erofs_bread(buf, *offset, true);
if (IS_ERR(ptr)) {
kfree(buffer);
return ptr;
}
- memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt);
+ memcpy(buffer + i, ptr, cnt);
*offset += cnt;
}
return buffer;
}
-static int erofs_load_compr_cfgs(struct super_block *sb,
- struct erofs_super_block *dsb)
+#ifndef CONFIG_EROFS_FS_ZIP
+static int z_erofs_parse_cfgs(struct super_block *sb,
+ struct erofs_super_block *dsb)
{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- unsigned int algs, alg;
- erofs_off_t offset;
- int size, ret = 0;
-
- sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
- if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
- erofs_err(sb, "try to load compressed fs with unsupported algorithms %x",
- sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
- return -EINVAL;
- }
-
- offset = EROFS_SUPER_OFFSET + sbi->sb_size;
- alg = 0;
- for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
- void *data;
-
- if (!(algs & 1))
- continue;
-
- data = erofs_read_metadata(sb, &buf, &offset, &size);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
+ if (!dsb->u1.available_compr_algs)
+ return 0;
- switch (alg) {
- case Z_EROFS_COMPRESSION_LZ4:
- ret = z_erofs_load_lz4_config(sb, dsb, data, size);
- break;
- case Z_EROFS_COMPRESSION_LZMA:
- ret = z_erofs_load_lzma_config(sb, dsb, data, size);
- break;
- default:
- DBG_BUGON(1);
- ret = -EFAULT;
- }
- kfree(data);
- if (ret)
- break;
- }
- erofs_put_metabuf(&buf);
- return ret;
-}
-#else
-static int erofs_load_compr_cfgs(struct super_block *sb,
- struct erofs_super_block *dsb)
-{
- if (dsb->u1.available_compr_algs) {
- erofs_err(sb, "try to load compressed fs when compression is disabled");
- return -EINVAL;
- }
- return 0;
+ erofs_err(sb, "compression disabled, unable to mount compressed EROFS");
+ return -EOPNOTSUPP;
}
#endif
@@ -226,15 +139,13 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
struct erofs_deviceslot *dis;
- struct block_device *bdev;
- void *ptr;
+ struct file *file;
- ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
- dis = ptr + erofs_blkoff(*pos);
+ dis = erofs_read_metabuf(buf, sb, *pos, false);
+ if (IS_ERR(dis))
+ return PTR_ERR(dis);
- if (!dif->path) {
+ if (!sbi->devs->flatdev && !dif->path) {
if (!dis->tag[0]) {
erofs_err(sb, "empty device tag @ pos %llu", *pos);
return -EINVAL;
@@ -249,18 +160,34 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
if (IS_ERR(fscache))
return PTR_ERR(fscache);
dif->fscache = fscache;
- } else {
- bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL,
- sb->s_type);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
- dif->bdev = bdev;
- dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
- NULL, NULL);
+ } else if (!sbi->devs->flatdev) {
+ file = erofs_is_fileio_mode(sbi) ?
+ filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
+ bdev_file_open_by_path(dif->path,
+ BLK_OPEN_READ, sb->s_type, NULL);
+ if (IS_ERR(file)) {
+ if (file == ERR_PTR(-ENOTBLK))
+ return -EINVAL;
+ return PTR_ERR(file);
+ }
+
+ if (!erofs_is_fileio_mode(sbi)) {
+ dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
+ &dif->dax_part_off, NULL, NULL);
+ } else if (!S_ISREG(file_inode(file)->i_mode)) {
+ fput(file);
+ return -EINVAL;
+ }
+ if (!dif->dax_dev && test_opt(&sbi->opt, DAX_ALWAYS)) {
+ erofs_info(sb, "DAX unsupported by %s. Turning off DAX.",
+ dif->path);
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
+ dif->file = file;
}
- dif->blocks = le32_to_cpu(dis->blocks);
- dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ dif->blocks = le32_to_cpu(dis->blocks_lo);
+ dif->uniaddr = le32_to_cpu(dis->uniaddr_lo);
sbi->total_blocks += dif->blocks;
*pos += EROFS_DEVT_SLOT_SIZE;
return 0;
@@ -276,7 +203,7 @@ static int erofs_scan_devices(struct super_block *sb,
struct erofs_device_info *dif;
int id, err = 0;
- sbi->total_blocks = sbi->primarydevice_blocks;
+ sbi->total_blocks = sbi->dif0.blocks;
if (!erofs_sb_has_device_table(sbi))
ondisk_extradevs = 0;
else
@@ -288,9 +215,17 @@ static int erofs_scan_devices(struct super_block *sb,
ondisk_extradevs, sbi->devs->extra_devices);
return -EINVAL;
}
+
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && !sbi->dif0.dax_dev) {
+ erofs_info(sb, "DAX unsupported by block device. Turning off DAX.");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
if (!ondisk_extradevs)
return 0;
+ if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb))
+ sbi->devs->flatdev = true;
+
sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
down_read(&sbi->devs->rwsem);
@@ -327,28 +262,35 @@ static int erofs_scan_devices(struct super_block *sb,
static int erofs_read_superblock(struct super_block *sb)
{
- struct erofs_sb_info *sbi;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_super_block *dsb;
- unsigned int blkszbits;
void *data;
int ret;
- data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP);
+ data = erofs_read_metabuf(&buf, sb, 0, false);
if (IS_ERR(data)) {
erofs_err(sb, "cannot read erofs superblock");
return PTR_ERR(data);
}
- sbi = EROFS_SB(sb);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
-
ret = -EINVAL;
if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) {
erofs_err(sb, "cannot find valid erofs superblock");
goto out;
}
+ sbi->blkszbits = dsb->blkszbits;
+ if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) {
+ erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits);
+ goto out;
+ }
+ if (dsb->dirblkbits) {
+ erofs_err(sb, "dirblkbits %u isn't supported", dsb->dirblkbits);
+ goto out;
+ }
+
sbi->feature_compat = le32_to_cpu(dsb->feature_compat);
if (erofs_sb_has_sb_chksum(sbi)) {
ret = erofs_superblock_csum_verify(sb, data);
@@ -357,106 +299,93 @@ static int erofs_read_superblock(struct super_block *sb)
}
ret = -EINVAL;
- blkszbits = dsb->blkszbits;
- /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */
- if (blkszbits != LOG_BLOCK_SIZE) {
- erofs_err(sb, "blkszbits %u isn't supported on this platform",
- blkszbits);
+ sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat);
+ if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) {
+ erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
+ sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT);
goto out;
}
- if (!check_layout_compatibility(sb, dsb))
- goto out;
-
sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE;
- if (sbi->sb_size > EROFS_BLKSIZ) {
+ if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) {
erofs_err(sb, "invalid sb_extslots %u (more than a fs block)",
sbi->sb_size);
goto out;
}
- sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
+ sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
+ sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start);
+ sbi->xattr_prefix_count = dsb->xattr_prefix_count;
+ sbi->xattr_filter_reserved = dsb->xattr_filter_reserved;
#endif
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
- sbi->root_nid = le16_to_cpu(dsb->root_nid);
-#ifdef CONFIG_EROFS_FS_ZIP
- sbi->packed_inode = NULL;
- if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) {
- sbi->packed_inode =
- erofs_iget(sb, le64_to_cpu(dsb->packed_nid));
- if (IS_ERR(sbi->packed_inode)) {
- ret = PTR_ERR(sbi->packed_inode);
- goto out;
- }
+ if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) {
+ sbi->root_nid = le64_to_cpu(dsb->rootnid_8b);
+ sbi->dif0.blocks = sbi->dif0.blocks |
+ ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32);
+ } else {
+ sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
+ }
+ sbi->packed_nid = le64_to_cpu(dsb->packed_nid);
+ if (erofs_sb_has_metabox(sbi)) {
+ if (sbi->sb_size <= offsetof(struct erofs_super_block,
+ metabox_nid))
+ return -EFSCORRUPTED;
+ sbi->metabox_nid = le64_to_cpu(dsb->metabox_nid);
+ if (sbi->metabox_nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT))
+ return -EFSCORRUPTED; /* self-loop detection */
}
-#endif
sbi->inos = le64_to_cpu(dsb->inos);
- sbi->build_time = le64_to_cpu(dsb->build_time);
- sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
-
- memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid));
+ sbi->epoch = (s64)le64_to_cpu(dsb->epoch);
+ sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec);
+ super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
- ret = strscpy(sbi->volume_name, dsb->volume_name,
- sizeof(dsb->volume_name));
- if (ret < 0) { /* -E2BIG */
- erofs_err(sb, "bad volume name without NIL terminator");
- ret = -EFSCORRUPTED;
- goto out;
+ if (dsb->volume_name[0]) {
+ sbi->volume_name = kstrndup(dsb->volume_name,
+ sizeof(dsb->volume_name), GFP_KERNEL);
+ if (!sbi->volume_name)
+ return -ENOMEM;
}
/* parse on-disk compression configurations */
- if (erofs_sb_has_compr_cfgs(sbi))
- ret = erofs_load_compr_cfgs(sb, dsb);
- else
- ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ ret = z_erofs_parse_cfgs(sb, dsb);
if (ret < 0)
goto out;
- /* handle multiple devices */
ret = erofs_scan_devices(sb, dsb);
- if (erofs_sb_has_ztailpacking(sbi))
- erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
+ if (erofs_sb_has_48bit(sbi))
+ erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!");
+ if (erofs_sb_has_metabox(sbi))
+ erofs_info(sb, "EXPERIMENTAL metadata compression support in use. Use at your own risk!");
if (erofs_is_fscache_mode(sb))
- erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
- if (erofs_sb_has_fragments(sbi))
- erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!");
- if (erofs_sb_has_dedupe(sbi))
- erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!");
+ erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!");
out:
erofs_put_metabuf(&buf);
return ret;
}
-/* set up default EROFS parameters */
-static void erofs_default_options(struct erofs_fs_context *ctx)
+static void erofs_default_options(struct erofs_sb_info *sbi)
{
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
- ctx->opt.max_sync_decompress_pages = 3;
- ctx->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO;
+ sbi->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+ sbi->opt.max_sync_decompress_pages = 3;
+ sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO;
#endif
#ifdef CONFIG_EROFS_FS_XATTR
- set_opt(&ctx->opt, XATTR_USER);
+ set_opt(&sbi->opt, XATTR_USER);
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- set_opt(&ctx->opt, POSIX_ACL);
+ set_opt(&sbi->opt, POSIX_ACL);
#endif
}
enum {
- Opt_user_xattr,
- Opt_acl,
- Opt_cache_strategy,
- Opt_dax,
- Opt_dax_enum,
- Opt_device,
- Opt_fsid,
- Opt_domain_id,
- Opt_err
+ Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum,
+ Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_fsoffset,
};
static const struct constant_table erofs_param_cache_strategy[] = {
@@ -482,23 +411,24 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_string("device", Opt_device),
fsparam_string("fsid", Opt_fsid),
fsparam_string("domain_id", Opt_domain_id),
+ fsparam_flag_no("directio", Opt_directio),
+ fsparam_u64("fsoffset", Opt_fsoffset),
{}
};
static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
{
#ifdef CONFIG_FS_DAX
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
switch (mode) {
case EROFS_MOUNT_DAX_ALWAYS:
- warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- set_opt(&ctx->opt, DAX_ALWAYS);
- clear_opt(&ctx->opt, DAX_NEVER);
+ set_opt(&sbi->opt, DAX_ALWAYS);
+ clear_opt(&sbi->opt, DAX_NEVER);
return true;
case EROFS_MOUNT_DAX_NEVER:
- set_opt(&ctx->opt, DAX_NEVER);
- clear_opt(&ctx->opt, DAX_ALWAYS);
+ set_opt(&sbi->opt, DAX_NEVER);
+ clear_opt(&sbi->opt, DAX_ALWAYS);
return true;
default:
DBG_BUGON(1);
@@ -513,7 +443,7 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
static int erofs_fc_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
struct fs_parse_result result;
struct erofs_device_info *dif;
int opt, ret;
@@ -526,9 +456,9 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_user_xattr:
#ifdef CONFIG_EROFS_FS_XATTR
if (result.boolean)
- set_opt(&ctx->opt, XATTR_USER);
+ set_opt(&sbi->opt, XATTR_USER);
else
- clear_opt(&ctx->opt, XATTR_USER);
+ clear_opt(&sbi->opt, XATTR_USER);
#else
errorfc(fc, "{,no}user_xattr options not supported");
#endif
@@ -536,16 +466,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_acl:
#ifdef CONFIG_EROFS_FS_POSIX_ACL
if (result.boolean)
- set_opt(&ctx->opt, POSIX_ACL);
+ set_opt(&sbi->opt, POSIX_ACL);
else
- clear_opt(&ctx->opt, POSIX_ACL);
+ clear_opt(&sbi->opt, POSIX_ACL);
#else
errorfc(fc, "{,no}acl options not supported");
#endif
break;
case Opt_cache_strategy:
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->opt.cache_strategy = result.uint_32;
+ sbi->opt.cache_strategy = result.uint_32;
#else
errorfc(fc, "compression not supported, cache_strategy ignored");
#endif
@@ -567,27 +497,27 @@ static int erofs_fc_parse_param(struct fs_context *fc,
kfree(dif);
return -ENOMEM;
}
- down_write(&ctx->devs->rwsem);
- ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
- up_write(&ctx->devs->rwsem);
+ down_write(&sbi->devs->rwsem);
+ ret = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL);
+ up_write(&sbi->devs->rwsem);
if (ret < 0) {
kfree(dif->path);
kfree(dif);
return ret;
}
- ++ctx->devs->extra_devices;
+ ++sbi->devs->extra_devices;
break;
#ifdef CONFIG_EROFS_FS_ONDEMAND
case Opt_fsid:
- kfree(ctx->fsid);
- ctx->fsid = kstrdup(param->string, GFP_KERNEL);
- if (!ctx->fsid)
+ kfree(sbi->fsid);
+ sbi->fsid = kstrdup(param->string, GFP_KERNEL);
+ if (!sbi->fsid)
return -ENOMEM;
break;
case Opt_domain_id:
- kfree(ctx->domain_id);
- ctx->domain_id = kstrdup(param->string, GFP_KERNEL);
- if (!ctx->domain_id)
+ kfree(sbi->domain_id);
+ sbi->domain_id = kstrdup(param->string, GFP_KERNEL);
+ if (!sbi->domain_id)
return -ENOMEM;
break;
#else
@@ -596,92 +526,69 @@ static int erofs_fc_parse_param(struct fs_context *fc,
errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
break;
#endif
- default:
- return -ENOPARAM;
+ case Opt_directio:
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (result.boolean)
+ set_opt(&sbi->opt, DIRECT_IO);
+ else
+ clear_opt(&sbi->opt, DIRECT_IO);
+#else
+ errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
+#endif
+ break;
+ case Opt_fsoffset:
+ sbi->dif0.fsoff = result.uint_64;
+ break;
}
return 0;
}
-#ifdef CONFIG_EROFS_FS_ZIP
-static const struct address_space_operations managed_cache_aops;
-
-static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp)
+static int erofs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
{
- bool ret = true;
- struct address_space *const mapping = folio->mapping;
-
- DBG_BUGON(!folio_test_locked(folio));
- DBG_BUGON(mapping->a_ops != &managed_cache_aops);
-
- if (folio_test_private(folio))
- ret = erofs_try_to_free_cached_page(&folio->page);
-
- return ret;
-}
+ erofs_nid_t nid = EROFS_I(inode)->nid;
+ int len = parent ? 6 : 3;
-/*
- * It will be called only on inode eviction. In case that there are still some
- * decompression requests in progress, wait with rescheduling for a bit here.
- * We could introduce an extra locking instead but it seems unnecessary.
- */
-static void erofs_managed_cache_invalidate_folio(struct folio *folio,
- size_t offset, size_t length)
-{
- const size_t stop = length + offset;
-
- DBG_BUGON(!folio_test_locked(folio));
-
- /* Check for potential overflow in debug mode */
- DBG_BUGON(stop > folio_size(folio) || stop < length);
-
- if (offset == 0 && stop == folio_size(folio))
- while (!erofs_managed_cache_release_folio(folio, GFP_NOFS))
- cond_resched();
-}
-
-static const struct address_space_operations managed_cache_aops = {
- .release_folio = erofs_managed_cache_release_folio,
- .invalidate_folio = erofs_managed_cache_invalidate_folio,
-};
-
-static int erofs_init_managed_cache(struct super_block *sb)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- struct inode *const inode = new_inode(sb);
+ if (*max_len < len) {
+ *max_len = len;
+ return FILEID_INVALID;
+ }
- if (!inode)
- return -ENOMEM;
+ fh[0] = (u32)(nid >> 32);
+ fh[1] = (u32)(nid & 0xffffffff);
+ fh[2] = inode->i_generation;
- set_nlink(inode, 1);
- inode->i_size = OFFSET_MAX;
+ if (parent) {
+ nid = EROFS_I(parent)->nid;
- inode->i_mapping->a_ops = &managed_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- sbi->managed_cache = inode;
- return 0;
-}
-#else
-static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
-#endif
+ fh[3] = (u32)(nid >> 32);
+ fh[4] = (u32)(nid & 0xffffffff);
+ fh[5] = parent->i_generation;
+ }
-static struct inode *erofs_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
-{
- return erofs_iget(sb, ino);
+ *max_len = len;
+ return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN;
}
static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- erofs_nfs_get_inode);
+ if ((fh_type != FILEID_INO64_GEN &&
+ fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3)
+ return NULL;
+
+ return d_obtain_alias(erofs_iget(sb,
+ ((u64)fid->raw[0] << 32) | fid->raw[1]));
}
static struct dentry *erofs_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- erofs_nfs_get_inode);
+ if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6)
+ return NULL;
+
+ return d_obtain_alias(erofs_iget(sb,
+ ((u64)fid->raw[3] << 32) | fid->raw[4]));
}
static struct dentry *erofs_get_parent(struct dentry *child)
@@ -697,23 +604,32 @@ static struct dentry *erofs_get_parent(struct dentry *child)
}
static const struct export_operations erofs_export_ops = {
+ .encode_fh = erofs_encode_fh,
.fh_to_dentry = erofs_fh_to_dentry,
.fh_to_parent = erofs_fh_to_parent,
.get_parent = erofs_get_parent,
};
-static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc)
+static void erofs_set_sysfs_name(struct super_block *sb)
{
- static const struct tree_descr empty_descr = {""};
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
- return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr);
+ if (sbi->domain_id)
+ super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id,
+ sbi->fsid);
+ else if (sbi->fsid)
+ super_set_sysfs_name_generic(sb, "%s", sbi->fsid);
+ else if (erofs_is_fileio_mode(sbi))
+ super_set_sysfs_name_generic(sb, "%s",
+ bdi_dev_name(sb->s_bdi));
+ else
+ super_set_sysfs_name_id(sb);
}
static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
- struct erofs_sb_info *sbi;
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
int err;
sb->s_magic = EROFS_SUPER_MAGIC;
@@ -721,54 +637,77 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &erofs_sops;
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
- sb->s_fs_info = sbi;
- sbi->opt = ctx->opt;
- sbi->devs = ctx->devs;
- ctx->devs = NULL;
- sbi->fsid = ctx->fsid;
- ctx->fsid = NULL;
- sbi->domain_id = ctx->domain_id;
- ctx->domain_id = NULL;
-
- if (erofs_is_fscache_mode(sb)) {
- sb->s_blocksize = EROFS_BLKSIZ;
- sb->s_blocksize_bits = LOG_BLOCK_SIZE;
-
- err = erofs_fscache_register_fs(sb);
- if (err)
- return err;
+ sbi->blkszbits = PAGE_SHIFT;
+ if (!sb->s_bdev) {
+ /*
+ * (File-backed mounts) EROFS claims it's safe to nest other
+ * fs contexts (including its own) due to self-controlled RO
+ * accesses/contexts and no side-effect changes that need to
+ * context save & restore so it can reuse the current thread
+ * context. However, it still needs to bump `s_stack_depth` to
+ * avoid kernel stack overflow from nested filesystems.
+ */
+ if (erofs_is_fileio_mode(sbi)) {
+ sb->s_stack_depth =
+ file_inode(sbi->dif0.file)->i_sb->s_stack_depth + 1;
+ if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+ erofs_err(sb, "maximum fs stacking depth exceeded");
+ return -ENOTBLK;
+ }
+ }
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
+ if (erofs_is_fscache_mode(sb)) {
+ err = erofs_fscache_register_fs(sb);
+ if (err)
+ return err;
+ }
err = super_setup_bdi(sb);
if (err)
return err;
} else {
- if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) {
- erofs_err(sb, "failed to set erofs blksize");
+ if (!sb_set_blocksize(sb, PAGE_SIZE)) {
+ errorfc(fc, "failed to set initial blksize");
return -EINVAL;
}
- sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
- &sbi->dax_part_off,
- NULL, NULL);
+ sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
+ &sbi->dif0.dax_part_off, NULL, NULL);
}
err = erofs_read_superblock(sb);
if (err)
return err;
- if (test_opt(&sbi->opt, DAX_ALWAYS)) {
- BUILD_BUG_ON(EROFS_BLKSIZ != PAGE_SIZE);
+ if (sb->s_blocksize_bits != sbi->blkszbits) {
+ if (erofs_is_fscache_mode(sb)) {
+ errorfc(fc, "unsupported blksize for fscache mode");
+ return -EINVAL;
+ }
- if (!sbi->dax_dev) {
- errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
- clear_opt(&sbi->opt, DAX_ALWAYS);
+ if (erofs_is_fileio_mode(sbi)) {
+ sb->s_blocksize = 1 << sbi->blkszbits;
+ sb->s_blocksize_bits = sbi->blkszbits;
+ } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
+ errorfc(fc, "failed to set erofs blksize");
+ return -EINVAL;
}
}
+ if (sbi->dif0.fsoff) {
+ if (sbi->dif0.fsoff & (sb->s_blocksize - 1))
+ return invalfc(fc, "fsoffset %llu is not aligned to block size %lu",
+ sbi->dif0.fsoff, sb->s_blocksize);
+ if (erofs_is_fscache_mode(sb))
+ return invalfc(fc, "cannot use fsoffset in fscache mode");
+ }
+
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && sbi->blkszbits != PAGE_SHIFT) {
+ erofs_info(sb, "unsupported blocksize for DAX");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
+
sb->s_time_gran = 1;
sb->s_xattr = erofs_xattr_handlers;
sb->s_export_op = &erofs_export_ops;
@@ -778,72 +717,99 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
else
sb->s_flags &= ~SB_POSIXACL;
-#ifdef CONFIG_EROFS_FS_ZIP
- xa_init(&sbi->managed_pslots);
-#endif
+ err = z_erofs_init_super(sb);
+ if (err)
+ return err;
+
+ if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
+ inode = erofs_iget(sb, sbi->packed_nid);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ sbi->packed_inode = inode;
+ }
+ if (erofs_sb_has_metabox(sbi)) {
+ inode = erofs_iget(sb, sbi->metabox_nid);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ sbi->metabox_inode = inode;
+ }
- /* get the root inode */
- inode = erofs_iget(sb, ROOT_NID(sbi));
+ inode = erofs_iget(sb, sbi->root_nid);
if (IS_ERR(inode))
return PTR_ERR(inode);
if (!S_ISDIR(inode->i_mode)) {
erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)",
- ROOT_NID(sbi), inode->i_mode);
+ sbi->root_nid, inode->i_mode);
iput(inode);
return -EINVAL;
}
-
sb->s_root = d_make_root(inode);
if (!sb->s_root)
return -ENOMEM;
erofs_shrinker_register(sb);
- /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */
- err = erofs_init_managed_cache(sb);
+ err = erofs_xattr_prefixes_init(sb);
if (err)
return err;
+ erofs_set_sysfs_name(sb);
err = erofs_register_sysfs(sb);
if (err)
return err;
- erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi));
+ sbi->dir_ra_bytes = EROFS_DIR_RA_BYTES;
+ erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
return 0;
}
-static int erofs_fc_anon_get_tree(struct fs_context *fc)
-{
- return get_tree_nodev(fc, erofs_fc_fill_pseudo_super);
-}
-
static int erofs_fc_get_tree(struct fs_context *fc)
{
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
+ int ret;
- if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid)
+ if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
return get_tree_nodev(fc, erofs_fc_fill_super);
- return get_tree_bdev(fc, erofs_fc_fill_super);
+ ret = get_tree_bdev_flags(fc, erofs_fc_fill_super,
+ IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ?
+ GET_TREE_BDEV_QUIET_LOOKUP : 0);
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (ret == -ENOTBLK) {
+ struct file *file;
+
+ if (!fc->source)
+ return invalf(fc, "No source specified");
+ file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ sbi->dif0.file = file;
+
+ if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) &&
+ sbi->dif0.file->f_mapping->a_ops->read_folio)
+ return get_tree_nodev(fc, erofs_fc_fill_super);
+ }
+#endif
+ return ret;
}
static int erofs_fc_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *new_sbi = fc->s_fs_info;
DBG_BUGON(!sb_rdonly(sb));
- if (ctx->fsid || ctx->domain_id)
+ if (new_sbi->fsid || new_sbi->domain_id)
erofs_info(sb, "ignoring reconfiguration for fsid|domain_id.");
- if (test_opt(&ctx->opt, POSIX_ACL))
+ if (test_opt(&new_sbi->opt, POSIX_ACL))
fc->sb_flags |= SB_POSIXACL;
else
fc->sb_flags &= ~SB_POSIXACL;
- sbi->opt = ctx->opt;
+ sbi->opt = new_sbi->opt;
fc->sb_flags |= SB_RDONLY;
return 0;
@@ -854,8 +820,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
struct erofs_device_info *dif = ptr;
fs_put_dax(dif->dax_dev, NULL);
- if (dif->bdev)
- blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
+ if (dif->file)
+ fput(dif->file);
erofs_fscache_unregister_cookie(dif->fscache);
dif->fscache = NULL;
kfree(dif->path);
@@ -872,14 +838,23 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs)
kfree(devs);
}
+static void erofs_sb_free(struct erofs_sb_info *sbi)
+{
+ erofs_free_dev_context(sbi->devs);
+ kfree(sbi->fsid);
+ kfree(sbi->domain_id);
+ if (sbi->dif0.file)
+ fput(sbi->dif0.file);
+ kfree(sbi->volume_name);
+ kfree(sbi);
+}
+
static void erofs_fc_free(struct fs_context *fc)
{
- struct erofs_fs_context *ctx = fc->fs_private;
+ struct erofs_sb_info *sbi = fc->s_fs_info;
- erofs_free_dev_context(ctx->devs);
- kfree(ctx->fsid);
- kfree(ctx->domain_id);
- kfree(ctx);
+ if (sbi) /* free here if an error occurs before transferring to sb */
+ erofs_sb_free(sbi);
}
static const struct fs_context_operations erofs_context_ops = {
@@ -889,90 +864,70 @@ static const struct fs_context_operations erofs_context_ops = {
.free = erofs_fc_free,
};
-static const struct fs_context_operations erofs_anon_context_ops = {
- .get_tree = erofs_fc_anon_get_tree,
-};
-
static int erofs_init_fs_context(struct fs_context *fc)
{
- struct erofs_fs_context *ctx;
-
- /* pseudo mount for anon inodes */
- if (fc->sb_flags & SB_KERNMOUNT) {
- fc->ops = &erofs_anon_context_ops;
- return 0;
- }
+ struct erofs_sb_info *sbi;
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi)
return -ENOMEM;
- ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
- if (!ctx->devs) {
- kfree(ctx);
+
+ sbi->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
+ if (!sbi->devs) {
+ kfree(sbi);
return -ENOMEM;
}
- fc->fs_private = ctx;
+ fc->s_fs_info = sbi;
- idr_init(&ctx->devs->tree);
- init_rwsem(&ctx->devs->rwsem);
- erofs_default_options(ctx);
+ idr_init(&sbi->devs->tree);
+ init_rwsem(&sbi->devs->rwsem);
+ erofs_default_options(sbi);
fc->ops = &erofs_context_ops;
return 0;
}
-/*
- * could be triggered after deactivate_locked_super()
- * is called, thus including umount and failed to initialize.
- */
-static void erofs_kill_sb(struct super_block *sb)
+static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi)
{
- struct erofs_sb_info *sbi;
-
- WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
+ iput(sbi->packed_inode);
+ sbi->packed_inode = NULL;
+ iput(sbi->metabox_inode);
+ sbi->metabox_inode = NULL;
+#ifdef CONFIG_EROFS_FS_ZIP
+ iput(sbi->managed_cache);
+ sbi->managed_cache = NULL;
+#endif
+}
- /* pseudo mount for anon inodes */
- if (sb->s_flags & SB_KERNMOUNT) {
- kill_anon_super(sb);
- return;
- }
+static void erofs_kill_sb(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
- if (erofs_is_fscache_mode(sb))
+ if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) ||
+ sbi->dif0.file)
kill_anon_super(sb);
else
kill_block_super(sb);
-
- sbi = EROFS_SB(sb);
- if (!sbi)
- return;
-
- erofs_free_dev_context(sbi->devs);
- fs_put_dax(sbi->dax_dev, NULL);
+ erofs_drop_internal_inodes(sbi);
+ fs_put_dax(sbi->dif0.dax_dev, NULL);
erofs_fscache_unregister_fs(sb);
- kfree(sbi->fsid);
- kfree(sbi->domain_id);
- kfree(sbi);
+ erofs_sb_free(sbi);
sb->s_fs_info = NULL;
}
-/* called when ->s_root is non-NULL */
static void erofs_put_super(struct super_block *sb)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
- DBG_BUGON(!sbi);
-
erofs_unregister_sysfs(sb);
erofs_shrinker_unregister(sb);
-#ifdef CONFIG_EROFS_FS_ZIP
- iput(sbi->managed_cache);
- sbi->managed_cache = NULL;
- iput(sbi->packed_inode);
- sbi->packed_inode = NULL;
-#endif
+ erofs_xattr_prefixes_cleanup(sb);
+ erofs_drop_internal_inodes(sbi);
+ erofs_free_dev_context(sbi->devs);
+ sbi->devs = NULL;
erofs_fscache_unregister_fs(sb);
}
-struct file_system_type erofs_fs_type = {
+static struct file_system_type erofs_fs_type = {
.owner = THIS_MODULE,
.name = "erofs",
.init_fs_context = erofs_init_fs_context,
@@ -988,24 +943,17 @@ static int __init erofs_module_init(void)
erofs_check_ondisk_layout_definitions();
erofs_inode_cachep = kmem_cache_create("erofs_inode",
- sizeof(struct erofs_inode), 0,
- SLAB_RECLAIM_ACCOUNT,
- erofs_inode_init_once);
- if (!erofs_inode_cachep) {
- err = -ENOMEM;
- goto icache_err;
- }
+ sizeof(struct erofs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+ erofs_inode_init_once);
+ if (!erofs_inode_cachep)
+ return -ENOMEM;
err = erofs_init_shrinker();
if (err)
goto shrinker_err;
- err = z_erofs_lzma_init();
- if (err)
- goto lzma_err;
-
- erofs_pcpubuf_init();
- err = z_erofs_init_zip_subsystem();
+ err = z_erofs_init_subsystem();
if (err)
goto zip_err;
@@ -1022,14 +970,11 @@ static int __init erofs_module_init(void)
fs_err:
erofs_exit_sysfs();
sysfs_err:
- z_erofs_exit_zip_subsystem();
+ z_erofs_exit_subsystem();
zip_err:
- z_erofs_lzma_exit();
-lzma_err:
erofs_exit_shrinker();
shrinker_err:
kmem_cache_destroy(erofs_inode_cachep);
-icache_err:
return err;
}
@@ -1041,34 +986,29 @@ static void __exit erofs_module_exit(void)
rcu_barrier();
erofs_exit_sysfs();
- z_erofs_exit_zip_subsystem();
- z_erofs_lzma_exit();
+ z_erofs_exit_subsystem();
erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
- erofs_pcpubuf_exit();
}
-/* get filesystem statistics */
static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- u64 id = 0;
-
- if (!erofs_is_fscache_mode(sb))
- id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = sb->s_magic;
- buf->f_bsize = EROFS_BLKSIZ;
+ buf->f_bsize = sb->s_blocksize;
buf->f_blocks = sbi->total_blocks;
buf->f_bfree = buf->f_bavail = 0;
-
buf->f_files = ULLONG_MAX;
buf->f_ffree = ULLONG_MAX - sbi->inos;
-
buf->f_namelen = EROFS_NAME_LEN;
- buf->f_fsid = u64_to_fsid(id);
+ if (uuid_is_null(&sb->s_uuid))
+ buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 :
+ huge_encode_dev(sb->s_bdev->bd_dev));
+ else
+ buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
return 0;
}
@@ -1077,43 +1017,47 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
struct erofs_mount_opts *opt = &sbi->opt;
-#ifdef CONFIG_EROFS_FS_XATTR
- if (test_opt(opt, XATTR_USER))
- seq_puts(seq, ",user_xattr");
- else
- seq_puts(seq, ",nouser_xattr");
-#endif
-#ifdef CONFIG_EROFS_FS_POSIX_ACL
- if (test_opt(opt, POSIX_ACL))
- seq_puts(seq, ",acl");
- else
- seq_puts(seq, ",noacl");
-#endif
-#ifdef CONFIG_EROFS_FS_ZIP
- if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
- seq_puts(seq, ",cache_strategy=disabled");
- else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
- seq_puts(seq, ",cache_strategy=readahead");
- else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
- seq_puts(seq, ",cache_strategy=readaround");
-#endif
+ if (IS_ENABLED(CONFIG_EROFS_FS_XATTR))
+ seq_puts(seq, test_opt(opt, XATTR_USER) ?
+ ",user_xattr" : ",nouser_xattr");
+ if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL))
+ seq_puts(seq, test_opt(opt, POSIX_ACL) ? ",acl" : ",noacl");
+ if (IS_ENABLED(CONFIG_EROFS_FS_ZIP))
+ seq_printf(seq, ",cache_strategy=%s",
+ erofs_param_cache_strategy[opt->cache_strategy].name);
if (test_opt(opt, DAX_ALWAYS))
seq_puts(seq, ",dax=always");
if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
+ if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO))
+ seq_puts(seq, ",directio");
#ifdef CONFIG_EROFS_FS_ONDEMAND
if (sbi->fsid)
seq_printf(seq, ",fsid=%s", sbi->fsid);
if (sbi->domain_id)
seq_printf(seq, ",domain_id=%s", sbi->domain_id);
#endif
+ if (sbi->dif0.fsoff)
+ seq_printf(seq, ",fsoffset=%llu", sbi->dif0.fsoff);
return 0;
}
+static void erofs_evict_inode(struct inode *inode)
+{
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ dax_break_layout_final(inode);
+#endif
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
const struct super_operations erofs_sops = {
.put_super = erofs_put_super,
.alloc_inode = erofs_alloc_inode,
.free_inode = erofs_free_inode,
+ .evict_inode = erofs_evict_inode,
.statfs = erofs_statfs,
.show_options = erofs_show_options,
};
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index fd476961f742..1e0658a1d95b 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -7,11 +7,14 @@
#include <linux/kobject.h>
#include "internal.h"
+#include "compress.h"
enum {
attr_feature,
+ attr_drop_caches,
attr_pointer_ui,
attr_pointer_bool,
+ attr_accel,
};
enum {
@@ -57,11 +60,26 @@ static struct erofs_attr erofs_attr_##_name = { \
#ifdef CONFIG_EROFS_FS_ZIP
EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
+EROFS_ATTR_FUNC(drop_caches, 0200);
#endif
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+EROFS_ATTR_FUNC(accel, 0644);
+#endif
+EROFS_ATTR_RW_UI(dir_ra_bytes, erofs_sb_info);
-static struct attribute *erofs_attrs[] = {
+static struct attribute *erofs_sb_attrs[] = {
#ifdef CONFIG_EROFS_FS_ZIP
ATTR_LIST(sync_decompress),
+ ATTR_LIST(drop_caches),
+#endif
+ ATTR_LIST(dir_ra_bytes),
+ NULL,
+};
+ATTRIBUTE_GROUPS(erofs_sb);
+
+static struct attribute *erofs_attrs[] = {
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ ATTR_LIST(accel),
#endif
NULL,
};
@@ -78,6 +96,8 @@ EROFS_ATTR_FEATURE(sb_chksum);
EROFS_ATTR_FEATURE(ztailpacking);
EROFS_ATTR_FEATURE(fragments);
EROFS_ATTR_FEATURE(dedupe);
+EROFS_ATTR_FEATURE(48bit);
+EROFS_ATTR_FEATURE(metabox);
static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(zero_padding),
@@ -90,6 +110,8 @@ static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(ztailpacking),
ATTR_LIST(fragments),
ATTR_LIST(dedupe),
+ ATTR_LIST(48bit),
+ ATTR_LIST(metabox),
NULL,
};
ATTRIBUTE_GROUPS(erofs_feat);
@@ -123,12 +145,14 @@ static ssize_t erofs_attr_show(struct kobject *kobj,
if (!ptr)
return 0;
return sysfs_emit(buf, "%d\n", *(bool *)ptr);
+ case attr_accel:
+ return z_erofs_crypto_show_engines(buf, PAGE_SIZE, '\n');
}
return 0;
}
static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t len)
+ const char *buf, size_t len)
{
struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
s_kobj);
@@ -163,6 +187,33 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
return -EINVAL;
*(bool *)ptr = !!t;
return len;
+#ifdef CONFIG_EROFS_FS_ZIP
+ case attr_drop_caches:
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t < 1 || t > 3)
+ return -EINVAL;
+
+ if (t & 2)
+ z_erofs_shrink_scan(sbi, ~0UL);
+ if (t & 1)
+ invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1);
+ return len;
+#endif
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ case attr_accel:
+ buf = skip_spaces(buf);
+ z_erofs_crypto_disable_all_engines();
+ while (*buf) {
+ t = strcspn(buf, "\n");
+ ret = z_erofs_crypto_enable_engine(buf, t);
+ if (ret < 0)
+ return ret;
+ buf += buf[t] != '\0' ? t + 1 : t;
+ }
+ return len;
+#endif
}
return 0;
}
@@ -179,13 +230,14 @@ static const struct sysfs_ops erofs_attr_ops = {
.store = erofs_attr_store,
};
-static struct kobj_type erofs_sb_ktype = {
- .default_groups = erofs_groups,
+static const struct kobj_type erofs_sb_ktype = {
+ .default_groups = erofs_sb_groups,
.sysfs_ops = &erofs_attr_ops,
.release = erofs_sb_release,
};
-static struct kobj_type erofs_ktype = {
+static const struct kobj_type erofs_ktype = {
+ .default_groups = erofs_groups,
.sysfs_ops = &erofs_attr_ops,
};
@@ -193,7 +245,7 @@ static struct kset erofs_root = {
.kobj = {.ktype = &erofs_ktype},
};
-static struct kobj_type erofs_feat_ktype = {
+static const struct kobj_type erofs_feat_ktype = {
.default_groups = erofs_feat_groups,
.sysfs_ops = &erofs_attr_ops,
};
@@ -205,34 +257,16 @@ static struct kobject erofs_feat = {
int erofs_register_sysfs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- char *name;
- char *str = NULL;
int err;
- if (erofs_is_fscache_mode(sb)) {
- if (sbi->domain_id) {
- str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id,
- sbi->fsid);
- if (!str)
- return -ENOMEM;
- name = str;
- } else {
- name = sbi->fsid;
- }
- } else {
- name = sb->s_id;
- }
sbi->s_kobj.kset = &erofs_root;
init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name);
- kfree(str);
- if (err)
- goto put_sb_kobj;
- return 0;
-
-put_sb_kobj:
- kobject_put(&sbi->s_kobj);
- wait_for_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s",
+ sb->s_sysfs_name);
+ if (err) {
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ }
return err;
}
@@ -247,6 +281,12 @@ void erofs_unregister_sysfs(struct super_block *sb)
}
}
+void erofs_exit_sysfs(void)
+{
+ kobject_put(&erofs_feat);
+ kset_unregister(&erofs_root);
+}
+
int __init erofs_init_sysfs(void)
{
int ret;
@@ -254,24 +294,12 @@ int __init erofs_init_sysfs(void)
kobject_set_name(&erofs_root.kobj, "erofs");
erofs_root.kobj.parent = fs_kobj;
ret = kset_register(&erofs_root);
- if (ret)
- goto root_err;
-
- ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype,
- NULL, "features");
- if (ret)
- goto feat_err;
- return ret;
-
-feat_err:
- kobject_put(&erofs_feat);
- kset_unregister(&erofs_root);
-root_err:
+ if (!ret) {
+ ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype,
+ NULL, "features");
+ if (!ret)
+ return 0;
+ erofs_exit_sysfs();
+ }
return ret;
}
-
-void erofs_exit_sysfs(void)
-{
- kobject_put(&erofs_feat);
- kset_unregister(&erofs_root);
-}
diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h
deleted file mode 100644
index 64ceb7270b5c..000000000000
--- a/fs/erofs/tagptr.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * A tagged pointer implementation
- */
-#ifndef __EROFS_FS_TAGPTR_H
-#define __EROFS_FS_TAGPTR_H
-
-#include <linux/types.h>
-#include <linux/build_bug.h>
-
-/*
- * the name of tagged pointer types are tagptr{1, 2, 3...}_t
- * avoid directly using the internal structs __tagptr{1, 2, 3...}
- */
-#define __MAKE_TAGPTR(n) \
-typedef struct __tagptr##n { \
- uintptr_t v; \
-} tagptr##n##_t;
-
-__MAKE_TAGPTR(1)
-__MAKE_TAGPTR(2)
-__MAKE_TAGPTR(3)
-__MAKE_TAGPTR(4)
-
-#undef __MAKE_TAGPTR
-
-extern void __compiletime_error("bad tagptr tags")
- __bad_tagptr_tags(void);
-
-extern void __compiletime_error("bad tagptr type")
- __bad_tagptr_type(void);
-
-/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
-#define __tagptr_mask_1(ptr, n) \
- __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
- (1UL << (n)) - 1 :
-
-#define __tagptr_mask(ptr) (\
- __tagptr_mask_1(ptr, 1) ( \
- __tagptr_mask_1(ptr, 2) ( \
- __tagptr_mask_1(ptr, 3) ( \
- __tagptr_mask_1(ptr, 4) ( \
- __bad_tagptr_type(), 0)))))
-
-/* generate a tagged pointer from a raw value */
-#define tagptr_init(type, val) \
- ((typeof(type)){ .v = (uintptr_t)(val) })
-
-/*
- * directly cast a tagged pointer to the native pointer type, which
- * could be used for backward compatibility of existing code.
- */
-#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
-
-/* encode tagged pointers */
-#define tagptr_fold(type, ptr, _tags) ({ \
- const typeof(_tags) tags = (_tags); \
- if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
- __bad_tagptr_tags(); \
-tagptr_init(type, (uintptr_t)(ptr) | tags); })
-
-/* decode tagged pointers */
-#define tagptr_unfold_ptr(tptr) \
- ((void *)((tptr).v & ~__tagptr_mask(tptr)))
-
-#define tagptr_unfold_tags(tptr) \
- ((tptr).v & __tagptr_mask(tptr))
-
-/* operations for the tagger pointer */
-#define tagptr_eq(_tptr1, _tptr2) ({ \
- typeof(_tptr1) tptr1 = (_tptr1); \
- typeof(_tptr2) tptr2 = (_tptr2); \
- (void)(&tptr1 == &tptr2); \
-(tptr1).v == (tptr2).v; })
-
-/* lock-free CAS operation */
-#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
- typeof(_ptptr) ptptr = (_ptptr); \
- typeof(_o) o = (_o); \
- typeof(_n) n = (_n); \
- (void)(&o == &n); \
- (void)(&o == ptptr); \
-tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
-
-/* wrap WRITE_ONCE if atomic update is needed */
-#define tagptr_replace_tags(_ptptr, tags) ({ \
- typeof(_ptptr) ptptr = (_ptptr); \
- *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
-*ptptr; })
-
-#define tagptr_set_tags(_ptptr, _tags) ({ \
- typeof(_ptptr) ptptr = (_ptptr); \
- const typeof(_tags) tags = (_tags); \
- if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
- __bad_tagptr_tags(); \
- ptptr->v |= tags; \
-*ptptr; })
-
-#define tagptr_clear_tags(_ptptr, _tags) ({ \
- typeof(_ptptr) ptptr = (_ptptr); \
- const typeof(_tags) tags = (_tags); \
- if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
- __bad_tagptr_tags(); \
- ptptr->v &= ~tags; \
-*ptptr; })
-
-#endif /* __EROFS_FS_TAGPTR_H */
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
deleted file mode 100644
index 46627cb69abe..000000000000
--- a/fs/erofs/utils.c
+++ /dev/null
@@ -1,292 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2018 HUAWEI, Inc.
- * https://www.huawei.com/
- */
-#include "internal.h"
-#include <linux/pagevec.h>
-
-struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
-{
- struct page *page = *pagepool;
-
- if (page) {
- DBG_BUGON(page_ref_count(page) != 1);
- *pagepool = (struct page *)page_private(page);
- } else {
- page = alloc_page(gfp);
- }
- return page;
-}
-
-void erofs_release_pages(struct page **pagepool)
-{
- while (*pagepool) {
- struct page *page = *pagepool;
-
- *pagepool = (struct page *)page_private(page);
- put_page(page);
- }
-}
-
-#ifdef CONFIG_EROFS_FS_ZIP
-/* global shrink count (for all mounted EROFS instances) */
-static atomic_long_t erofs_global_shrink_cnt;
-
-static int erofs_workgroup_get(struct erofs_workgroup *grp)
-{
- int o;
-
-repeat:
- o = erofs_wait_on_workgroup_freezed(grp);
- if (o <= 0)
- return -1;
-
- if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o)
- goto repeat;
-
- /* decrease refcount paired by erofs_workgroup_put */
- if (o == 1)
- atomic_long_dec(&erofs_global_shrink_cnt);
- return 0;
-}
-
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index)
-{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_workgroup *grp;
-
-repeat:
- rcu_read_lock();
- grp = xa_load(&sbi->managed_pslots, index);
- if (grp) {
- if (erofs_workgroup_get(grp)) {
- /* prefer to relax rcu read side */
- rcu_read_unlock();
- goto repeat;
- }
-
- DBG_BUGON(index != grp->index);
- }
- rcu_read_unlock();
- return grp;
-}
-
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- struct erofs_workgroup *pre;
-
- /*
- * Bump up a reference count before making this visible
- * to others for the XArray in order to avoid potential
- * UAF without serialized by xa_lock.
- */
- atomic_inc(&grp->refcount);
-
-repeat:
- xa_lock(&sbi->managed_pslots);
- pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
- NULL, grp, GFP_NOFS);
- if (pre) {
- if (xa_is_err(pre)) {
- pre = ERR_PTR(xa_err(pre));
- } else if (erofs_workgroup_get(pre)) {
- /* try to legitimize the current in-tree one */
- xa_unlock(&sbi->managed_pslots);
- cond_resched();
- goto repeat;
- }
- atomic_dec(&grp->refcount);
- grp = pre;
- }
- xa_unlock(&sbi->managed_pslots);
- return grp;
-}
-
-static void __erofs_workgroup_free(struct erofs_workgroup *grp)
-{
- atomic_long_dec(&erofs_global_shrink_cnt);
- erofs_workgroup_free_rcu(grp);
-}
-
-int erofs_workgroup_put(struct erofs_workgroup *grp)
-{
- int count = atomic_dec_return(&grp->refcount);
-
- if (count == 1)
- atomic_long_inc(&erofs_global_shrink_cnt);
- else if (!count)
- __erofs_workgroup_free(grp);
- return count;
-}
-
-static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
-{
- /*
- * If managed cache is on, refcount of workgroups
- * themselves could be < 0 (freezed). In other words,
- * there is no guarantee that all refcounts > 0.
- */
- if (!erofs_workgroup_try_to_freeze(grp, 1))
- return false;
-
- /*
- * Note that all cached pages should be unattached
- * before deleted from the XArray. Otherwise some
- * cached pages could be still attached to the orphan
- * old workgroup when the new one is available in the tree.
- */
- if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
- erofs_workgroup_unfreeze(grp, 1);
- return false;
- }
-
- /*
- * It's impossible to fail after the workgroup is freezed,
- * however in order to avoid some race conditions, add a
- * DBG_BUGON to observe this in advance.
- */
- DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
-
- /* last refcount should be connected with its managed pslot. */
- erofs_workgroup_unfreeze(grp, 0);
- __erofs_workgroup_free(grp);
- return true;
-}
-
-static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
- unsigned long nr_shrink)
-{
- struct erofs_workgroup *grp;
- unsigned int freed = 0;
- unsigned long index;
-
- xa_lock(&sbi->managed_pslots);
- xa_for_each(&sbi->managed_pslots, index, grp) {
- /* try to shrink each valid workgroup */
- if (!erofs_try_to_release_workgroup(sbi, grp))
- continue;
- xa_unlock(&sbi->managed_pslots);
-
- ++freed;
- if (!--nr_shrink)
- return freed;
- xa_lock(&sbi->managed_pslots);
- }
- xa_unlock(&sbi->managed_pslots);
- return freed;
-}
-
-/* protected by 'erofs_sb_list_lock' */
-static unsigned int shrinker_run_no;
-
-/* protects the mounted 'erofs_sb_list' */
-static DEFINE_SPINLOCK(erofs_sb_list_lock);
-static LIST_HEAD(erofs_sb_list);
-
-void erofs_shrinker_register(struct super_block *sb)
-{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
-
- mutex_init(&sbi->umount_mutex);
-
- spin_lock(&erofs_sb_list_lock);
- list_add(&sbi->list, &erofs_sb_list);
- spin_unlock(&erofs_sb_list_lock);
-}
-
-void erofs_shrinker_unregister(struct super_block *sb)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
-
- mutex_lock(&sbi->umount_mutex);
- /* clean up all remaining workgroups in memory */
- erofs_shrink_workstation(sbi, ~0UL);
-
- spin_lock(&erofs_sb_list_lock);
- list_del(&sbi->list);
- spin_unlock(&erofs_sb_list_lock);
- mutex_unlock(&sbi->umount_mutex);
-}
-
-static unsigned long erofs_shrink_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- return atomic_long_read(&erofs_global_shrink_cnt);
-}
-
-static unsigned long erofs_shrink_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct erofs_sb_info *sbi;
- struct list_head *p;
-
- unsigned long nr = sc->nr_to_scan;
- unsigned int run_no;
- unsigned long freed = 0;
-
- spin_lock(&erofs_sb_list_lock);
- do {
- run_no = ++shrinker_run_no;
- } while (run_no == 0);
-
- /* Iterate over all mounted superblocks and try to shrink them */
- p = erofs_sb_list.next;
- while (p != &erofs_sb_list) {
- sbi = list_entry(p, struct erofs_sb_info, list);
-
- /*
- * We move the ones we do to the end of the list, so we stop
- * when we see one we have already done.
- */
- if (sbi->shrinker_run_no == run_no)
- break;
-
- if (!mutex_trylock(&sbi->umount_mutex)) {
- p = p->next;
- continue;
- }
-
- spin_unlock(&erofs_sb_list_lock);
- sbi->shrinker_run_no = run_no;
-
- freed += erofs_shrink_workstation(sbi, nr - freed);
-
- spin_lock(&erofs_sb_list_lock);
- /* Get the next list element before we move this one */
- p = p->next;
-
- /*
- * Move this one to the end of the list to provide some
- * fairness.
- */
- list_move_tail(&sbi->list, &erofs_sb_list);
- mutex_unlock(&sbi->umount_mutex);
-
- if (freed >= nr)
- break;
- }
- spin_unlock(&erofs_sb_list_lock);
- return freed;
-}
-
-static struct shrinker erofs_shrinker_info = {
- .scan_objects = erofs_shrink_scan,
- .count_objects = erofs_shrink_count,
- .seeks = DEFAULT_SEEKS,
-};
-
-int __init erofs_init_shrinker(void)
-{
- return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
-}
-
-void erofs_exit_shrinker(void)
-{
- unregister_shrinker(&erofs_shrinker_info);
-}
-#endif /* !CONFIG_EROFS_FS_ZIP */
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index a62fb8a3318a..396536d9a862 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -5,25 +5,33 @@
* Copyright (C) 2021-2022, Alibaba Cloud
*/
#include <linux/security.h>
+#include <linux/xxhash.h>
#include "xattr.h"
-struct xattr_iter {
+struct erofs_xattr_iter {
struct super_block *sb;
struct erofs_buf buf;
+ erofs_off_t pos;
void *kaddr;
- erofs_blk_t blkaddr;
- unsigned int ofs;
+ char *buffer;
+ int buffer_size, buffer_ofs;
+
+ /* getxattr */
+ int index, infix_len;
+ struct qstr name;
+
+ /* listxattr */
+ struct dentry *dentry;
};
-static int init_inode_xattrs(struct inode *inode)
+static int erofs_init_inode_xattrs(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
- struct xattr_iter it;
+ struct erofs_xattr_iter it;
unsigned int i;
struct erofs_xattr_ibody_header *ih;
- struct super_block *sb;
- struct erofs_sb_info *sbi;
+ struct super_block *sb = inode->i_sb;
int ret = 0;
/* the most case is that xattrs of this inode are initialized. */
@@ -52,37 +60,37 @@ static int init_inode_xattrs(struct inode *inode)
* undefined right now (maybe use later with some new sb feature).
*/
if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) {
- erofs_err(inode->i_sb,
+ erofs_err(sb,
"xattr_isize %d of nid %llu is not supported yet",
vi->xattr_isize, vi->nid);
ret = -EOPNOTSUPP;
goto out_unlock;
} else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) {
if (vi->xattr_isize) {
- erofs_err(inode->i_sb,
- "bogus xattr ibody @ nid %llu", vi->nid);
+ erofs_err(sb, "bogus xattr ibody @ nid %llu", vi->nid);
DBG_BUGON(1);
ret = -EFSCORRUPTED;
goto out_unlock; /* xattr ondisk layout error */
}
- ret = -ENOATTR;
+ ret = -ENODATA;
goto out_unlock;
}
- sb = inode->i_sb;
- sbi = EROFS_SB(sb);
it.buf = __EROFS_BUF_INITIALIZER;
- it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize);
- it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize);
+ ret = erofs_init_metabuf(&it.buf, sb, erofs_inode_in_metabox(inode));
+ if (ret)
+ goto out_unlock;
+ it.pos = erofs_iloc(inode) + vi->inode_isize;
/* read in shared xattr array (non-atomic, see kmalloc below) */
- it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, true);
if (IS_ERR(it.kaddr)) {
ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs);
+ ih = it.kaddr;
+ vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter);
vi->xattr_shared_count = ih->h_shared_count;
vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
sizeof(uint), GFP_KERNEL);
@@ -93,26 +101,18 @@ static int init_inode_xattrs(struct inode *inode)
}
/* let's skip ibody header */
- it.ofs += sizeof(struct erofs_xattr_ibody_header);
+ it.pos += sizeof(struct erofs_xattr_ibody_header);
for (i = 0; i < vi->xattr_shared_count; ++i) {
- if (it.ofs >= EROFS_BLKSIZ) {
- /* cannot be unaligned */
- DBG_BUGON(it.ofs != EROFS_BLKSIZ);
-
- it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr,
- EROFS_KMAP);
- if (IS_ERR(it.kaddr)) {
- kfree(vi->xattr_shared_xattrs);
- vi->xattr_shared_xattrs = NULL;
- ret = PTR_ERR(it.kaddr);
- goto out_unlock;
- }
- it.ofs = 0;
+ it.kaddr = erofs_bread(&it.buf, it.pos, true);
+ if (IS_ERR(it.kaddr)) {
+ kfree(vi->xattr_shared_xattrs);
+ vi->xattr_shared_xattrs = NULL;
+ ret = PTR_ERR(it.kaddr);
+ goto out_unlock;
}
- vi->xattr_shared_xattrs[i] =
- le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs));
- it.ofs += sizeof(__le32);
+ vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)it.kaddr);
+ it.pos += sizeof(__le32);
}
erofs_put_metabuf(&it.buf);
@@ -125,261 +125,6 @@ out_unlock:
return ret;
}
-/*
- * the general idea for these return values is
- * if 0 is returned, go on processing the current xattr;
- * 1 (> 0) is returned, skip this round to process the next xattr;
- * -err (< 0) is returned, an error (maybe ENOXATTR) occurred
- * and need to be handled
- */
-struct xattr_iter_handlers {
- int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry);
- int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf,
- unsigned int len);
- int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz);
- void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf,
- unsigned int len);
-};
-
-static inline int xattr_iter_fixup(struct xattr_iter *it)
-{
- if (it->ofs < EROFS_BLKSIZ)
- return 0;
-
- it->blkaddr += erofs_blknr(it->ofs);
- it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr,
- EROFS_KMAP);
- if (IS_ERR(it->kaddr))
- return PTR_ERR(it->kaddr);
- it->ofs = erofs_blkoff(it->ofs);
- return 0;
-}
-
-static int inline_xattr_iter_begin(struct xattr_iter *it,
- struct inode *inode)
-{
- struct erofs_inode *const vi = EROFS_I(inode);
- struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb);
- unsigned int xattr_header_sz, inline_xattr_ofs;
-
- xattr_header_sz = inlinexattr_header_size(inode);
- if (xattr_header_sz >= vi->xattr_isize) {
- DBG_BUGON(xattr_header_sz > vi->xattr_isize);
- return -ENOATTR;
- }
-
- inline_xattr_ofs = vi->inode_isize + xattr_header_sz;
-
- it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs);
- it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
-
- it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr,
- EROFS_KMAP);
- if (IS_ERR(it->kaddr))
- return PTR_ERR(it->kaddr);
- return vi->xattr_isize - xattr_header_sz;
-}
-
-/*
- * Regardless of success or failure, `xattr_foreach' will end up with
- * `ofs' pointing to the next xattr item rather than an arbitrary position.
- */
-static int xattr_foreach(struct xattr_iter *it,
- const struct xattr_iter_handlers *op,
- unsigned int *tlimit)
-{
- struct erofs_xattr_entry entry;
- unsigned int value_sz, processed, slice;
- int err;
-
- /* 0. fixup blkaddr, ofs, ipage */
- err = xattr_iter_fixup(it);
- if (err)
- return err;
-
- /*
- * 1. read xattr entry to the memory,
- * since we do EROFS_XATTR_ALIGN
- * therefore entry should be in the page
- */
- entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs);
- if (tlimit) {
- unsigned int entry_sz = erofs_xattr_entry_size(&entry);
-
- /* xattr on-disk corruption: xattr entry beyond xattr_isize */
- if (*tlimit < entry_sz) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- *tlimit -= entry_sz;
- }
-
- it->ofs += sizeof(struct erofs_xattr_entry);
- value_sz = le16_to_cpu(entry.e_value_size);
-
- /* handle entry */
- err = op->entry(it, &entry);
- if (err) {
- it->ofs += entry.e_name_len + value_sz;
- goto out;
- }
-
- /* 2. handle xattr name (ofs will finally be at the end of name) */
- processed = 0;
-
- while (processed < entry.e_name_len) {
- if (it->ofs >= EROFS_BLKSIZ) {
- DBG_BUGON(it->ofs > EROFS_BLKSIZ);
-
- err = xattr_iter_fixup(it);
- if (err)
- goto out;
- it->ofs = 0;
- }
-
- slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs,
- entry.e_name_len - processed);
-
- /* handle name */
- err = op->name(it, processed, it->kaddr + it->ofs, slice);
- if (err) {
- it->ofs += entry.e_name_len - processed + value_sz;
- goto out;
- }
-
- it->ofs += slice;
- processed += slice;
- }
-
- /* 3. handle xattr value */
- processed = 0;
-
- if (op->alloc_buffer) {
- err = op->alloc_buffer(it, value_sz);
- if (err) {
- it->ofs += value_sz;
- goto out;
- }
- }
-
- while (processed < value_sz) {
- if (it->ofs >= EROFS_BLKSIZ) {
- DBG_BUGON(it->ofs > EROFS_BLKSIZ);
-
- err = xattr_iter_fixup(it);
- if (err)
- goto out;
- it->ofs = 0;
- }
-
- slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs,
- value_sz - processed);
- op->value(it, processed, it->kaddr + it->ofs, slice);
- it->ofs += slice;
- processed += slice;
- }
-
-out:
- /* xattrs should be 4-byte aligned (on-disk constraint) */
- it->ofs = EROFS_XATTR_ALIGN(it->ofs);
- return err < 0 ? err : 0;
-}
-
-struct getxattr_iter {
- struct xattr_iter it;
-
- char *buffer;
- int buffer_size, index;
- struct qstr name;
-};
-
-static int xattr_entrymatch(struct xattr_iter *_it,
- struct erofs_xattr_entry *entry)
-{
- struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
-
- return (it->index != entry->e_name_index ||
- it->name.len != entry->e_name_len) ? -ENOATTR : 0;
-}
-
-static int xattr_namematch(struct xattr_iter *_it,
- unsigned int processed, char *buf, unsigned int len)
-{
- struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
-
- return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0;
-}
-
-static int xattr_checkbuffer(struct xattr_iter *_it,
- unsigned int value_sz)
-{
- struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
- int err = it->buffer_size < value_sz ? -ERANGE : 0;
-
- it->buffer_size = value_sz;
- return !it->buffer ? 1 : err;
-}
-
-static void xattr_copyvalue(struct xattr_iter *_it,
- unsigned int processed,
- char *buf, unsigned int len)
-{
- struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
-
- memcpy(it->buffer + processed, buf, len);
-}
-
-static const struct xattr_iter_handlers find_xattr_handlers = {
- .entry = xattr_entrymatch,
- .name = xattr_namematch,
- .alloc_buffer = xattr_checkbuffer,
- .value = xattr_copyvalue
-};
-
-static int inline_getxattr(struct inode *inode, struct getxattr_iter *it)
-{
- int ret;
- unsigned int remaining;
-
- ret = inline_xattr_iter_begin(&it->it, inode);
- if (ret < 0)
- return ret;
-
- remaining = ret;
- while (remaining) {
- ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining);
- if (ret != -ENOATTR)
- break;
- }
- return ret ? ret : it->buffer_size;
-}
-
-static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
-{
- struct erofs_inode *const vi = EROFS_I(inode);
- struct super_block *const sb = inode->i_sb;
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- unsigned int i;
- int ret = -ENOATTR;
-
- for (i = 0; i < vi->xattr_shared_count; ++i) {
- erofs_blk_t blkaddr =
- xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
-
- it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
- it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
- EROFS_KMAP);
- if (IS_ERR(it->it.kaddr))
- return PTR_ERR(it->it.kaddr);
- it->it.blkaddr = blkaddr;
-
- ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL);
- if (ret != -ENOATTR)
- break;
- }
- return ret ? ret : it->buffer_size;
-}
-
static bool erofs_xattr_user_list(struct dentry *dentry)
{
return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER);
@@ -390,57 +135,13 @@ static bool erofs_xattr_trusted_list(struct dentry *dentry)
return capable(CAP_SYS_ADMIN);
}
-int erofs_getxattr(struct inode *inode, int index,
- const char *name,
- void *buffer, size_t buffer_size)
-{
- int ret;
- struct getxattr_iter it;
-
- if (!name)
- return -EINVAL;
-
- ret = init_inode_xattrs(inode);
- if (ret)
- return ret;
-
- it.index = index;
- it.name.len = strlen(name);
- if (it.name.len > EROFS_NAME_LEN)
- return -ERANGE;
-
- it.it.buf = __EROFS_BUF_INITIALIZER;
- it.name.name = name;
-
- it.buffer = buffer;
- it.buffer_size = buffer_size;
-
- it.it.sb = inode->i_sb;
- ret = inline_getxattr(inode, &it);
- if (ret == -ENOATTR)
- ret = shared_getxattr(inode, &it);
- erofs_put_metabuf(&it.it.buf);
- return ret;
-}
-
static int erofs_xattr_generic_get(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
const char *name, void *buffer, size_t size)
{
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
-
- switch (handler->flags) {
- case EROFS_XATTR_INDEX_USER:
- if (!test_opt(&sbi->opt, XATTR_USER))
- return -EOPNOTSUPP;
- break;
- case EROFS_XATTR_INDEX_TRUSTED:
- break;
- case EROFS_XATTR_INDEX_SECURITY:
- break;
- default:
- return -EINVAL;
- }
+ if (handler->flags == EROFS_XATTR_INDEX_USER &&
+ !test_opt(&EROFS_I_SB(inode)->opt, XATTR_USER))
+ return -EOPNOTSUPP;
return erofs_getxattr(inode, handler->flags, name, buffer, size);
}
@@ -467,12 +168,8 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
};
#endif
-const struct xattr_handler *erofs_xattr_handlers[] = {
+const struct xattr_handler * const erofs_xattr_handlers[] = {
&erofs_xattr_user_handler,
-#ifdef CONFIG_EROFS_FS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
&erofs_xattr_trusted_handler,
#ifdef CONFIG_EROFS_FS_SECURITY
&erofs_xattr_security_handler,
@@ -480,142 +177,351 @@ const struct xattr_handler *erofs_xattr_handlers[] = {
NULL,
};
-struct listxattr_iter {
- struct xattr_iter it;
-
- struct dentry *dentry;
- char *buffer;
- int buffer_size, buffer_ofs;
-};
-
-static int xattr_entrylist(struct xattr_iter *_it,
- struct erofs_xattr_entry *entry)
+static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it,
+ unsigned int len)
{
- struct listxattr_iter *it =
- container_of(_it, struct listxattr_iter, it);
- unsigned int prefix_len;
- const char *prefix;
+ unsigned int slice, processed;
+ struct super_block *sb = it->sb;
+ void *src;
+
+ for (processed = 0; processed < len; processed += slice) {
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ src = it->kaddr;
+ slice = min_t(unsigned int, sb->s_blocksize -
+ erofs_blkoff(sb, it->pos), len - processed);
+ memcpy(it->buffer + it->buffer_ofs, src, slice);
+ it->buffer_ofs += slice;
+ it->pos += slice;
+ }
+ return 0;
+}
- const struct xattr_handler *h =
- erofs_xattr_handler(entry->e_name_index);
+static int erofs_listxattr_foreach(struct erofs_xattr_iter *it)
+{
+ struct erofs_xattr_entry entry;
+ unsigned int base_index, name_total, prefix_len, infix_len = 0;
+ const char *prefix, *infix = NULL;
+ int err;
- if (!h || (h->list && !h->list(it->dentry)))
- return 1;
+ /* 1. handle xattr entry */
+ entry = *(struct erofs_xattr_entry *)it->kaddr;
+ it->pos += sizeof(struct erofs_xattr_entry);
+
+ base_index = entry.e_name_index;
+ if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) {
+ struct erofs_sb_info *sbi = EROFS_SB(it->sb);
+ struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes +
+ (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK);
+
+ if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count)
+ return 0;
+ infix = pf->prefix->infix;
+ infix_len = pf->infix_len;
+ base_index = pf->prefix->base_index;
+ }
- prefix = xattr_prefix(h);
+ prefix = erofs_xattr_prefix(base_index, it->dentry);
+ if (!prefix)
+ return 0;
prefix_len = strlen(prefix);
+ name_total = prefix_len + infix_len + entry.e_name_len + 1;
if (!it->buffer) {
- it->buffer_ofs += prefix_len + entry->e_name_len + 1;
- return 1;
+ it->buffer_ofs += name_total;
+ return 0;
}
- if (it->buffer_ofs + prefix_len
- + entry->e_name_len + 1 > it->buffer_size)
+ if (it->buffer_ofs + name_total > it->buffer_size)
return -ERANGE;
memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len);
- it->buffer_ofs += prefix_len;
- return 0;
-}
+ memcpy(it->buffer + it->buffer_ofs + prefix_len, infix, infix_len);
+ it->buffer_ofs += prefix_len + infix_len;
-static int xattr_namelist(struct xattr_iter *_it,
- unsigned int processed, char *buf, unsigned int len)
-{
- struct listxattr_iter *it =
- container_of(_it, struct listxattr_iter, it);
+ /* 2. handle xattr name */
+ err = erofs_xattr_copy_to_buffer(it, entry.e_name_len);
+ if (err)
+ return err;
- memcpy(it->buffer + it->buffer_ofs, buf, len);
- it->buffer_ofs += len;
+ it->buffer[it->buffer_ofs++] = '\0';
return 0;
}
-static int xattr_skipvalue(struct xattr_iter *_it,
- unsigned int value_sz)
+static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
{
- struct listxattr_iter *it =
- container_of(_it, struct listxattr_iter, it);
+ struct super_block *sb = it->sb;
+ struct erofs_xattr_entry entry;
+ unsigned int slice, processed, value_sz;
- it->buffer[it->buffer_ofs++] = '\0';
- return 1;
-}
+ /* 1. handle xattr entry */
+ entry = *(struct erofs_xattr_entry *)it->kaddr;
+ it->pos += sizeof(struct erofs_xattr_entry);
+ value_sz = le16_to_cpu(entry.e_value_size);
-static const struct xattr_iter_handlers list_xattr_handlers = {
- .entry = xattr_entrylist,
- .name = xattr_namelist,
- .alloc_buffer = xattr_skipvalue,
- .value = NULL
-};
+ /* should also match the infix for long name prefixes */
+ if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) {
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes +
+ (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK);
+
+ if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count)
+ return -ENODATA;
+
+ if (it->index != pf->prefix->base_index ||
+ it->name.len != entry.e_name_len + pf->infix_len)
+ return -ENODATA;
+
+ if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len))
+ return -ENODATA;
-static int inline_listxattr(struct listxattr_iter *it)
+ it->infix_len = pf->infix_len;
+ } else {
+ if (it->index != entry.e_name_index ||
+ it->name.len != entry.e_name_len)
+ return -ENODATA;
+
+ it->infix_len = 0;
+ }
+
+ /* 2. handle xattr name */
+ for (processed = 0; processed < entry.e_name_len; processed += slice) {
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ slice = min_t(unsigned int,
+ sb->s_blocksize - erofs_blkoff(sb, it->pos),
+ entry.e_name_len - processed);
+ if (memcmp(it->name.name + it->infix_len + processed,
+ it->kaddr, slice))
+ return -ENODATA;
+ it->pos += slice;
+ }
+
+ /* 3. handle xattr value */
+ if (!it->buffer) {
+ it->buffer_ofs = value_sz;
+ return 0;
+ }
+
+ if (it->buffer_size < value_sz)
+ return -ERANGE;
+
+ return erofs_xattr_copy_to_buffer(it, value_sz);
+}
+
+static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
+ struct inode *inode, bool getxattr)
{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ unsigned int xattr_header_sz, remaining, entry_sz;
+ erofs_off_t next_pos;
int ret;
- unsigned int remaining;
- ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry));
- if (ret < 0)
+ xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) +
+ sizeof(u32) * vi->xattr_shared_count;
+ if (xattr_header_sz >= vi->xattr_isize) {
+ DBG_BUGON(xattr_header_sz > vi->xattr_isize);
+ return -ENODATA;
+ }
+
+ ret = erofs_init_metabuf(&it->buf, it->sb, erofs_inode_in_metabox(inode));
+ if (ret)
return ret;
+ remaining = vi->xattr_isize - xattr_header_sz;
+ it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
- remaining = ret;
while (remaining) {
- ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining);
- if (ret)
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ entry_sz = erofs_xattr_entry_size(it->kaddr);
+ /* xattr on-disk corruption: xattr entry beyond xattr_isize */
+ if (remaining < entry_sz) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ remaining -= entry_sz;
+ next_pos = it->pos + entry_sz;
+
+ if (getxattr)
+ ret = erofs_getxattr_foreach(it);
+ else
+ ret = erofs_listxattr_foreach(it);
+ if ((getxattr && ret != -ENODATA) || (!getxattr && ret))
break;
+
+ it->pos = next_pos;
}
- return ret ? ret : it->buffer_ofs;
+ return ret;
}
-static int shared_listxattr(struct listxattr_iter *it)
+static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
+ struct inode *inode, bool getxattr)
{
- struct inode *const inode = d_inode(it->dentry);
struct erofs_inode *const vi = EROFS_I(inode);
- struct super_block *const sb = inode->i_sb;
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- unsigned int i;
- int ret = 0;
+ struct super_block *const sb = it->sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ unsigned int i = 0;
+ int ret;
- for (i = 0; i < vi->xattr_shared_count; ++i) {
- erofs_blk_t blkaddr =
- xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
-
- it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
- it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
- EROFS_KMAP);
- if (IS_ERR(it->it.kaddr))
- return PTR_ERR(it->it.kaddr);
- it->it.blkaddr = blkaddr;
-
- ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL);
- if (ret)
+ ret = erofs_init_metabuf(&it->buf, sb,
+ erofs_sb_has_shared_ea_in_metabox(sbi));
+ if (ret)
+ return ret;
+
+ while (i < vi->xattr_shared_count) {
+ it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
+ vi->xattr_shared_xattrs[i++] * sizeof(__le32);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
+
+ if (getxattr)
+ ret = erofs_getxattr_foreach(it);
+ else
+ ret = erofs_listxattr_foreach(it);
+ if ((getxattr && ret != -ENODATA) || (!getxattr && ret))
break;
}
- return ret ? ret : it->buffer_ofs;
+ return i ? ret : -ENODATA;
}
-ssize_t erofs_listxattr(struct dentry *dentry,
- char *buffer, size_t buffer_size)
+int erofs_getxattr(struct inode *inode, int index, const char *name,
+ void *buffer, size_t buffer_size)
{
int ret;
- struct listxattr_iter it;
+ unsigned int hashbit;
+ struct erofs_xattr_iter it;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+
+ if (!name)
+ return -EINVAL;
+
+ ret = erofs_init_inode_xattrs(inode);
+ if (ret)
+ return ret;
+
+ /* reserved flag is non-zero if there's any change of on-disk format */
+ if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) {
+ hashbit = xxh32(name, strlen(name),
+ EROFS_XATTR_FILTER_SEED + index);
+ hashbit &= EROFS_XATTR_FILTER_BITS - 1;
+ if (vi->xattr_name_filter & (1U << hashbit))
+ return -ENODATA;
+ }
+
+ it.index = index;
+ it.name = QSTR(name);
+ if (it.name.len > EROFS_NAME_LEN)
+ return -ERANGE;
- ret = init_inode_xattrs(d_inode(dentry));
- if (ret == -ENOATTR)
+ it.sb = inode->i_sb;
+ it.buf = __EROFS_BUF_INITIALIZER;
+ it.buffer = buffer;
+ it.buffer_size = buffer_size;
+ it.buffer_ofs = 0;
+
+ ret = erofs_xattr_iter_inline(&it, inode, true);
+ if (ret == -ENODATA)
+ ret = erofs_xattr_iter_shared(&it, inode, true);
+ erofs_put_metabuf(&it.buf);
+ return ret ? ret : it.buffer_ofs;
+}
+
+ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ int ret;
+ struct erofs_xattr_iter it;
+ struct inode *inode = d_inode(dentry);
+
+ ret = erofs_init_inode_xattrs(inode);
+ if (ret == -ENODATA)
return 0;
if (ret)
return ret;
- it.it.buf = __EROFS_BUF_INITIALIZER;
+ it.sb = dentry->d_sb;
+ it.buf = __EROFS_BUF_INITIALIZER;
it.dentry = dentry;
it.buffer = buffer;
it.buffer_size = buffer_size;
it.buffer_ofs = 0;
- it.it.sb = dentry->d_sb;
+ ret = erofs_xattr_iter_inline(&it, inode, false);
+ if (!ret || ret == -ENODATA)
+ ret = erofs_xattr_iter_shared(&it, inode, false);
+ if (ret == -ENODATA)
+ ret = 0;
+ erofs_put_metabuf(&it.buf);
+ return ret ? ret : it.buffer_ofs;
+}
- ret = inline_listxattr(&it);
- if (ret >= 0 || ret == -ENOATTR)
- ret = shared_listxattr(&it);
- erofs_put_metabuf(&it.it.buf);
+void erofs_xattr_prefixes_cleanup(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ int i;
+
+ if (sbi->xattr_prefixes) {
+ for (i = 0; i < sbi->xattr_prefix_count; i++)
+ kfree(sbi->xattr_prefixes[i].prefix);
+ kfree(sbi->xattr_prefixes);
+ sbi->xattr_prefixes = NULL;
+ }
+}
+
+int erofs_xattr_prefixes_init(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2;
+ struct erofs_xattr_prefix_item *pfs;
+ int ret = 0, i, len;
+ bool plain = erofs_sb_has_plain_xattr_pfx(sbi);
+
+ if (!sbi->xattr_prefix_count)
+ return 0;
+
+ pfs = kcalloc(sbi->xattr_prefix_count, sizeof(*pfs), GFP_KERNEL);
+ if (!pfs)
+ return -ENOMEM;
+
+ if (!plain) {
+ if (erofs_sb_has_metabox(sbi))
+ (void)erofs_init_metabuf(&buf, sb, true);
+ else if (sbi->packed_inode)
+ buf.mapping = sbi->packed_inode->i_mapping;
+ else
+ plain = true;
+ }
+ if (plain)
+ (void)erofs_init_metabuf(&buf, sb, false);
+
+ for (i = 0; i < sbi->xattr_prefix_count; i++) {
+ void *ptr = erofs_read_metadata(sb, &buf, &pos, &len);
+
+ if (IS_ERR(ptr)) {
+ ret = PTR_ERR(ptr);
+ break;
+ } else if (len < sizeof(*pfs->prefix) ||
+ len > EROFS_NAME_LEN + sizeof(*pfs->prefix)) {
+ kfree(ptr);
+ ret = -EFSCORRUPTED;
+ break;
+ }
+ pfs[i].prefix = ptr;
+ pfs[i].infix_len = len - sizeof(struct erofs_xattr_long_prefix);
+ }
+
+ erofs_put_metabuf(&buf);
+ sbi->xattr_prefixes = pfs;
+ if (ret)
+ erofs_xattr_prefixes_cleanup(sb);
return ret;
}
@@ -648,7 +554,7 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu)
rc = erofs_getxattr(inode, prefix, "", value, rc);
}
- if (rc == -ENOATTR)
+ if (rc == -ENODATA)
acl = NULL;
else if (rc < 0)
acl = ERR_PTR(rc);
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 0a43c9ee9f8f..6317caa8413e 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -10,46 +10,21 @@
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
-/* Attribute not found */
-#define ENOATTR ENODATA
-
-static inline unsigned int inlinexattr_header_size(struct inode *inode)
-{
- return sizeof(struct erofs_xattr_ibody_header) +
- sizeof(u32) * EROFS_I(inode)->xattr_shared_count;
-}
-
-static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi,
- unsigned int xattr_id)
-{
-#ifdef CONFIG_EROFS_FS_XATTR
- return sbi->xattr_blkaddr +
- xattr_id * sizeof(__u32) / EROFS_BLKSIZ;
-#else
- return 0;
-#endif
-}
-
-static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi,
- unsigned int xattr_id)
-{
- return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ;
-}
-
#ifdef CONFIG_EROFS_FS_XATTR
extern const struct xattr_handler erofs_xattr_user_handler;
extern const struct xattr_handler erofs_xattr_trusted_handler;
extern const struct xattr_handler erofs_xattr_security_handler;
-static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx)
+static inline const char *erofs_xattr_prefix(unsigned int idx,
+ struct dentry *dentry)
{
- static const struct xattr_handler *xattr_handler_map[] = {
+ const struct xattr_handler *handler = NULL;
+
+ static const struct xattr_handler * const xattr_handler_map[] = {
[EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] =
- &posix_acl_access_xattr_handler,
- [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] =
- &posix_acl_default_xattr_handler,
+ [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
+ [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
#endif
[EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler,
#ifdef CONFIG_EROFS_FS_SECURITY
@@ -57,15 +32,24 @@ static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx)
#endif
};
- return idx && idx < ARRAY_SIZE(xattr_handler_map) ?
- xattr_handler_map[idx] : NULL;
+ if (idx && idx < ARRAY_SIZE(xattr_handler_map))
+ handler = xattr_handler_map[idx];
+
+ if (!xattr_handler_can_list(handler, dentry))
+ return NULL;
+
+ return xattr_prefix(handler);
}
-extern const struct xattr_handler *erofs_xattr_handlers[];
+extern const struct xattr_handler * const erofs_xattr_handlers[];
+int erofs_xattr_prefixes_init(struct super_block *sb);
+void erofs_xattr_prefixes_cleanup(struct super_block *sb);
int erofs_getxattr(struct inode *, int, const char *, void *, size_t);
ssize_t erofs_listxattr(struct dentry *, char *, size_t);
#else
+static inline int erofs_xattr_prefixes_init(struct super_block *sb) { return 0; }
+static inline void erofs_xattr_prefixes_cleanup(struct super_block *sb) {}
static inline int erofs_getxattr(struct inode *inode, int index,
const char *name, void *buffer,
size_t buffer_size)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 5200bb86e264..65da21504632 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -4,13 +4,116 @@
* https://www.huawei.com/
* Copyright (C) 2022 Alibaba Cloud
*/
-#include "zdata.h"
#include "compress.h"
-#include <linux/prefetch.h>
#include <linux/psi.h>
-
+#include <linux/cpuhotplug.h>
#include <trace/events/erofs.h>
+#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
+#define Z_EROFS_INLINE_BVECS 2
+
+struct z_erofs_bvec {
+ struct page *page;
+ int offset;
+ unsigned int end;
+};
+
+#define __Z_EROFS_BVSET(name, total) \
+struct name { \
+ /* point to the next page which contains the following bvecs */ \
+ struct page *nextpage; \
+ struct z_erofs_bvec bvec[total]; \
+}
+__Z_EROFS_BVSET(z_erofs_bvset,);
+__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ * for everyone else;
+ *
+ * L: Field should be protected by the pcluster lock;
+ *
+ * A: Field should be accessed / updated in atomic for parallelized code.
+ */
+struct z_erofs_pcluster {
+ struct mutex lock;
+ struct lockref lockref;
+
+ /* A: point to next chained pcluster or TAILs */
+ struct z_erofs_pcluster *next;
+
+ /* I: start physical position of this pcluster */
+ erofs_off_t pos;
+
+ /* L: the maximum decompression size of this round */
+ unsigned int length;
+
+ /* L: total number of bvecs */
+ unsigned int vcnt;
+
+ /* I: pcluster size (compressed size) in bytes */
+ unsigned int pclustersize;
+
+ /* I: page offset of start position of decompression */
+ unsigned short pageofs_out;
+
+ /* I: page offset of inline compressed data */
+ unsigned short pageofs_in;
+
+ union {
+ /* L: inline a certain number of bvec for bootstrap */
+ struct z_erofs_bvset_inline bvset;
+
+ /* I: can be used to free the pcluster by RCU. */
+ struct rcu_head rcu;
+ };
+
+ /* I: compression algorithm format */
+ unsigned char algorithmformat;
+
+ /* I: whether compressed data is in-lined or not */
+ bool from_meta;
+
+ /* L: whether partial decompression or not */
+ bool partial;
+
+ /* L: whether extra buffer allocations are best-effort */
+ bool besteffort;
+
+ /* A: compressed bvecs (can be cached or inplaced pages) */
+ struct z_erofs_bvec compressed_bvecs[];
+};
+
+/* the end of a chain of pclusters */
+#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA)
+
+struct z_erofs_decompressqueue {
+ struct super_block *sb;
+ struct z_erofs_pcluster *head;
+ atomic_t pending_bios;
+
+ union {
+ struct completion done;
+ struct work_struct work;
+ struct kthread_work kthread_work;
+ } u;
+ bool eio, sync;
+};
+
+static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
+{
+ return PAGE_ALIGN(pcl->pageofs_in + pcl->pclustersize) >> PAGE_SHIFT;
+}
+
+static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
+{
+ return fo->mapping == MNGD_MAPPING(sbi);
+}
+
+#define Z_EROFS_ONSTACK_PAGES 32
+
/*
* since pclustersize is variable for big pcluster feature, introduce slab
* pools implementation for different pcluster sizes.
@@ -25,7 +128,7 @@ struct z_erofs_pcluster_slab {
static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
- _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
+ _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES + 1)
};
struct z_erofs_bvec_iter {
@@ -76,14 +179,21 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
struct z_erofs_bvec *bvec,
- struct page **candidate_bvpage)
+ struct page **candidate_bvpage,
+ struct page **pagepool)
{
- if (iter->cur == iter->nr) {
- if (!*candidate_bvpage)
- return -EAGAIN;
-
+ if (iter->cur >= iter->nr) {
+ struct page *nextpage = *candidate_bvpage;
+
+ if (!nextpage) {
+ nextpage = __erofs_allocpage(pagepool, GFP_KERNEL,
+ true);
+ if (!nextpage)
+ return -ENOMEM;
+ set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
+ }
DBG_BUGON(iter->bvset->nextpage);
- iter->bvset->nextpage = *candidate_bvpage;
+ iter->bvset->nextpage = nextpage;
z_erofs_bvset_flip(iter);
iter->bvset->nextpage = NULL;
@@ -138,21 +248,20 @@ static int z_erofs_create_pcluster_pool(void)
return 0;
}
-static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
+static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
{
- int i;
+ unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct z_erofs_pcluster_slab *pcs = pcluster_pool;
- for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
- struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+ for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
struct z_erofs_pcluster *pcl;
if (nrpages > pcs->maxpages)
continue;
- pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+ pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
if (!pcl)
return ERR_PTR(-ENOMEM);
- pcl->pclusterpages = nrpages;
return pcl;
}
return ERR_PTR(-EINVAL);
@@ -175,121 +284,242 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
DBG_BUGON(1);
}
-/*
- * tagged pointer with 1-bit tag for all compressed pages
- * tag 0 - the page is just found with an extra page reference
- */
-typedef tagptr1_t compressed_page_t;
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
-#define tag_compressed_page_justfound(page) \
- tagptr_fold(compressed_page_t, page, 1)
+#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
+static struct kthread_worker __rcu **z_erofs_pcpu_workers;
+static atomic_t erofs_percpu_workers_initialized = ATOMIC_INIT(0);
-static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static void erofs_destroy_percpu_workers(void)
+{
+ struct kthread_worker *worker;
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ worker = rcu_dereference_protected(
+ z_erofs_pcpu_workers[cpu], 1);
+ rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
+ if (worker)
+ kthread_destroy_worker(worker);
+ }
+ kfree(z_erofs_pcpu_workers);
+}
-void z_erofs_exit_zip_subsystem(void)
+static struct kthread_worker *erofs_init_percpu_worker(int cpu)
{
- destroy_workqueue(z_erofs_workqueue);
- z_erofs_destroy_pcluster_pool();
+ struct kthread_worker *worker =
+ kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u");
+
+ if (IS_ERR(worker))
+ return worker;
+ if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
+ sched_set_fifo_low(worker->task);
+ return worker;
}
-static inline int z_erofs_init_workqueue(void)
+static int erofs_init_percpu_workers(void)
{
- const unsigned int onlinecpus = num_possible_cpus();
+ struct kthread_worker *worker;
+ unsigned int cpu;
- /*
- * no need to spawn too many threads, limiting threads could minimum
- * scheduling overhead, perhaps per-CPU threads should be better?
- */
- z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
- WQ_UNBOUND | WQ_HIGHPRI,
- onlinecpus + onlinecpus / 4);
- return z_erofs_workqueue ? 0 : -ENOMEM;
+ z_erofs_pcpu_workers = kcalloc(num_possible_cpus(),
+ sizeof(struct kthread_worker *), GFP_ATOMIC);
+ if (!z_erofs_pcpu_workers)
+ return -ENOMEM;
+
+ for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */
+ worker = erofs_init_percpu_worker(cpu);
+ if (!IS_ERR(worker))
+ rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
+ }
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
+static enum cpuhp_state erofs_cpuhp_state;
+
+static int erofs_cpu_online(unsigned int cpu)
+{
+ struct kthread_worker *worker, *old;
+
+ worker = erofs_init_percpu_worker(cpu);
+ if (IS_ERR(worker))
+ return PTR_ERR(worker);
+
+ spin_lock(&z_erofs_pcpu_worker_lock);
+ old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
+ lockdep_is_held(&z_erofs_pcpu_worker_lock));
+ if (!old)
+ rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
+ spin_unlock(&z_erofs_pcpu_worker_lock);
+ if (old)
+ kthread_destroy_worker(worker);
+ return 0;
+}
+
+static int erofs_cpu_offline(unsigned int cpu)
+{
+ struct kthread_worker *worker;
+
+ spin_lock(&z_erofs_pcpu_worker_lock);
+ worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
+ lockdep_is_held(&z_erofs_pcpu_worker_lock));
+ rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
+ spin_unlock(&z_erofs_pcpu_worker_lock);
+
+ synchronize_rcu();
+ if (worker)
+ kthread_destroy_worker(worker);
+ return 0;
+}
+
+static int erofs_cpu_hotplug_init(void)
+{
+ int state;
+
+ state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline);
+ if (state < 0)
+ return state;
+
+ erofs_cpuhp_state = state;
+ return 0;
+}
+
+static void erofs_cpu_hotplug_destroy(void)
+{
+ if (erofs_cpuhp_state)
+ cpuhp_remove_state_nocalls(erofs_cpuhp_state);
}
+#else /* !CONFIG_HOTPLUG_CPU */
+static inline int erofs_cpu_hotplug_init(void) { return 0; }
+static inline void erofs_cpu_hotplug_destroy(void) {}
+#endif/* CONFIG_HOTPLUG_CPU */
+static int z_erofs_init_pcpu_workers(struct super_block *sb)
+{
+ int err;
-int __init z_erofs_init_zip_subsystem(void)
+ if (atomic_xchg(&erofs_percpu_workers_initialized, 1))
+ return 0;
+
+ err = erofs_init_percpu_workers();
+ if (err) {
+ erofs_err(sb, "per-cpu workers: failed to allocate.");
+ goto err_init_percpu_workers;
+ }
+
+ err = erofs_cpu_hotplug_init();
+ if (err < 0) {
+ erofs_err(sb, "per-cpu workers: failed CPU hotplug init.");
+ goto err_cpuhp_init;
+ }
+ erofs_info(sb, "initialized per-cpu workers successfully.");
+ return err;
+
+err_cpuhp_init:
+ erofs_destroy_percpu_workers();
+err_init_percpu_workers:
+ atomic_set(&erofs_percpu_workers_initialized, 0);
+ return err;
+}
+
+static void z_erofs_destroy_pcpu_workers(void)
+{
+ if (!atomic_xchg(&erofs_percpu_workers_initialized, 0))
+ return;
+ erofs_cpu_hotplug_destroy();
+ erofs_destroy_percpu_workers();
+}
+#else /* !CONFIG_EROFS_FS_PCPU_KTHREAD */
+static inline int z_erofs_init_pcpu_workers(struct super_block *sb) { return 0; }
+static inline void z_erofs_destroy_pcpu_workers(void) {}
+#endif/* CONFIG_EROFS_FS_PCPU_KTHREAD */
+
+void z_erofs_exit_subsystem(void)
{
- int err = z_erofs_create_pcluster_pool();
+ z_erofs_destroy_pcpu_workers();
+ destroy_workqueue(z_erofs_workqueue);
+ z_erofs_destroy_pcluster_pool();
+ z_erofs_crypto_disable_all_engines();
+ z_erofs_exit_decompressor();
+}
+
+int __init z_erofs_init_subsystem(void)
+{
+ int err = z_erofs_init_decompressor();
if (err)
- return err;
- err = z_erofs_init_workqueue();
+ goto err_decompressor;
+
+ err = z_erofs_create_pcluster_pool();
if (err)
- z_erofs_destroy_pcluster_pool();
+ goto err_pcluster_pool;
+
+ z_erofs_workqueue = alloc_workqueue("erofs_worker",
+ WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
+ if (!z_erofs_workqueue) {
+ err = -ENOMEM;
+ goto err_workqueue_init;
+ }
+
+ return err;
+
+err_workqueue_init:
+ z_erofs_destroy_pcluster_pool();
+err_pcluster_pool:
+ z_erofs_exit_decompressor();
+err_decompressor:
return err;
}
enum z_erofs_pclustermode {
+ /* It has previously been linked into another processing chain */
Z_EROFS_PCLUSTER_INFLIGHT,
/*
- * The current pclusters was the tail of an exist chain, in addition
- * that the previous processed chained pclusters are all decided to
- * be hooked up to it.
- * A new chain will be created for the remaining pclusters which are
- * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
- * the next pcluster cannot reuse the whole page safely for inplace I/O
- * in the following scenario:
- * ________________________________________________________________
- * | tail (partial) page | head (partial) page |
- * | (belongs to the next pcl) | (belongs to the current pcl) |
- * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
- */
- Z_EROFS_PCLUSTER_HOOKED,
- /*
- * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
- * could be dispatched into bypass queue later due to uptodated managed
- * pages. All related online pages cannot be reused for inplace I/O (or
- * bvpage) since it can be directly decoded without I/O submission.
+ * A weaker form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it
+ * may be dispatched to the bypass queue later due to uptodated managed
+ * folios. All file-backed folios related to this pcluster cannot be
+ * reused for in-place I/O (or bvpage) since the pcluster may be decoded
+ * in a separate queue (and thus out of order).
*/
Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
/*
- * The current collection has been linked with the owned chain, and
- * could also be linked with the remaining collections, which means
- * if the processing page is the tail page of the collection, thus
- * the current collection can safely use the whole page (since
- * the previous collection is under control) for in-place I/O, as
- * illustrated below:
- * ________________________________________________________________
- * | tail (partial) page | head (partial) page |
- * | (of the current cl) | (of the previous collection) |
- * | PCLUSTER_FOLLOWED or | |
- * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
- *
- * [ (*) the above page can be used as inplace I/O. ]
+ * The pcluster has just been linked to our processing chain.
+ * File-backed folios (except for the head page) related to it can be
+ * used for in-place I/O (or bvpage).
*/
Z_EROFS_PCLUSTER_FOLLOWED,
};
-struct z_erofs_decompress_frontend {
+struct z_erofs_frontend {
struct inode *const inode;
struct erofs_map_blocks map;
struct z_erofs_bvec_iter biter;
+ struct page *pagepool;
struct page *candidate_bvpage;
- struct z_erofs_pcluster *pcl, *tailpcl;
- z_erofs_next_pcluster_t owned_head;
+ struct z_erofs_pcluster *pcl, *head;
enum z_erofs_pclustermode mode;
- bool readahead;
- /* used for applying cache strategy on the fly */
- bool backmost;
erofs_off_t headoffset;
/* a pointer used to pick up inplace I/O pages */
unsigned int icur;
};
-#define DECOMPRESS_FRONTEND_INIT(__i) { \
- .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
+#define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \
+ .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \
+ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho }
-static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
+static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe)
{
unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
return false;
- if (fe->backmost)
+ if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
return true;
if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
@@ -299,223 +529,237 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
return false;
}
-static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
- struct page **pagepool)
+static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
bool shouldalloc = z_erofs_should_alloc_cache(fe);
- bool standalone = true;
- /*
- * optimistic allocation without direct reclaim since inplace I/O
- * can be used if low memory otherwise.
- */
+ pgoff_t poff = pcl->pos >> PAGE_SHIFT;
+ bool may_bypass = true;
+ /* Optimistic allocation, as in-place I/O can be used as a fallback */
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+ struct folio *folio, *newfolio;
unsigned int i;
- if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
+ if (i_blocksize(fe->inode) != PAGE_SIZE ||
+ fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
- for (i = 0; i < pcl->pclusterpages; ++i) {
- struct page *page;
- compressed_page_t t;
- struct page *newpage = NULL;
-
- /* the compressed page was loaded before */
+ for (i = 0; i < pclusterpages; ++i) {
+ /* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
- page = find_get_page(mc, pcl->obj.index + i);
-
- if (page) {
- t = tag_compressed_page_justfound(page);
- } else {
- /* I/O is needed, no possible to decompress directly */
- standalone = false;
+ folio = filemap_get_folio(mc, poff + i);
+ if (IS_ERR(folio)) {
+ may_bypass = false;
if (!shouldalloc)
continue;
/*
- * try to use cached I/O if page allocation
- * succeeds or fallback to in-place I/O instead
- * to avoid any direct reclaim.
+ * Allocate a managed folio for cached I/O, or it may be
+ * then filled with a file-backed folio for in-place I/O
*/
- newpage = erofs_allocpage(pagepool, gfp);
- if (!newpage)
+ newfolio = filemap_alloc_folio(gfp, 0, NULL);
+ if (!newfolio)
continue;
- set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
- t = tag_compressed_page_justfound(newpage);
+ newfolio->private = Z_EROFS_PREALLOCATED_FOLIO;
+ folio = NULL;
}
-
- if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL,
- tagptr_cast_ptr(t)))
+ spin_lock(&pcl->lockref.lock);
+ if (!pcl->compressed_bvecs[i].page) {
+ pcl->compressed_bvecs[i].page =
+ folio_page(folio ?: newfolio, 0);
+ spin_unlock(&pcl->lockref.lock);
continue;
-
- if (page)
- put_page(page);
- else if (newpage)
- erofs_pagepool_add(pagepool, newpage);
+ }
+ spin_unlock(&pcl->lockref.lock);
+ folio_put(folio ?: newfolio);
}
/*
- * don't do inplace I/O if all compressed pages are available in
- * managed cache since it can be moved to the bypass queue instead.
+ * Don't perform in-place I/O if all compressed pages are available in
+ * the managed cache, as the pcluster can be moved to the bypass queue.
*/
- if (standalone)
+ if (may_bypass)
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
-/* called by erofs_shrinker to get rid of all compressed_pages */
-int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
+/* (erofs_shrinker) disconnect cached encoded data with pclusters */
+static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ struct folio *folio;
int i;
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- /*
- * refcount of workgroup is now freezed as 1,
- * therefore no need to worry about available decompression users.
- */
- for (i = 0; i < pcl->pclusterpages; ++i) {
- struct page *page = pcl->compressed_bvecs[i].page;
-
- if (!page)
- continue;
-
- /* block other users from reclaiming or migrating the page */
- if (!trylock_page(page))
- return -EBUSY;
-
- if (!erofs_page_is_managed(sbi, page))
- continue;
+ DBG_BUGON(pcl->from_meta);
+ /* Each cached folio contains one page unless bs > ps is supported */
+ for (i = 0; i < pclusterpages; ++i) {
+ if (pcl->compressed_bvecs[i].page) {
+ folio = page_folio(pcl->compressed_bvecs[i].page);
+ /* Avoid reclaiming or migrating this folio */
+ if (!folio_trylock(folio))
+ return -EBUSY;
- /* barrier is implied in the following 'unlock_page' */
- WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
- detach_page_private(page);
- unlock_page(page);
+ if (!erofs_folio_is_managed(sbi, folio))
+ continue;
+ pcl->compressed_bvecs[i].page = NULL;
+ folio_detach_private(folio);
+ folio_unlock(folio);
+ }
}
return 0;
}
-int erofs_try_to_free_cached_page(struct page *page)
+static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
{
- struct z_erofs_pcluster *const pcl = (void *)page_private(page);
- int ret, i;
+ struct z_erofs_pcluster *pcl = folio_get_private(folio);
+ struct z_erofs_bvec *bvec = pcl->compressed_bvecs;
+ struct z_erofs_bvec *end = bvec + z_erofs_pclusterpages(pcl);
+ bool ret;
- if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
- return 0;
+ if (!folio_test_private(folio))
+ return true;
- ret = 0;
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- for (i = 0; i < pcl->pclusterpages; ++i) {
- if (pcl->compressed_bvecs[i].page == page) {
- WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
- ret = 1;
- break;
+ ret = false;
+ spin_lock(&pcl->lockref.lock);
+ if (pcl->lockref.count <= 0) {
+ DBG_BUGON(pcl->from_meta);
+ for (; bvec < end; ++bvec) {
+ if (bvec->page && page_folio(bvec->page) == folio) {
+ bvec->page = NULL;
+ folio_detach_private(folio);
+ ret = true;
+ break;
+ }
}
}
- erofs_workgroup_unfreeze(&pcl->obj, 1);
- if (ret)
- detach_page_private(page);
+ spin_unlock(&pcl->lockref.lock);
return ret;
}
-static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
- struct z_erofs_bvec *bvec)
+/*
+ * It will be called only on inode eviction. In case that there are still some
+ * decompression requests in progress, wait with rescheduling for a bit here.
+ * An extra lock could be introduced instead but it seems unnecessary.
+ */
+static void z_erofs_cache_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- struct z_erofs_pcluster *const pcl = fe->pcl;
+ const size_t stop = length + offset;
- while (fe->icur > 0) {
- if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
- NULL, bvec->page)) {
- pcl->compressed_bvecs[fe->icur] = *bvec;
- return true;
- }
- }
- return false;
+ /* Check for potential overflow in debug mode */
+ DBG_BUGON(stop > folio_size(folio) || stop < length);
+
+ if (offset == 0 && stop == folio_size(folio))
+ while (!z_erofs_cache_release_folio(folio, 0))
+ cond_resched();
+}
+
+static const struct address_space_operations z_erofs_cache_aops = {
+ .release_folio = z_erofs_cache_release_folio,
+ .invalidate_folio = z_erofs_cache_invalidate_folio,
+};
+
+int z_erofs_init_super(struct super_block *sb)
+{
+ struct inode *inode;
+ int err;
+
+ err = z_erofs_init_pcpu_workers(sb);
+ if (err)
+ return err;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return -ENOMEM;
+ set_nlink(inode, 1);
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &z_erofs_cache_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
+ EROFS_SB(sb)->managed_cache = inode;
+ xa_init(&EROFS_SB(sb)->managed_pslots);
+ return 0;
}
/* callers must be with pcluster lock held */
-static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
+static int z_erofs_attach_page(struct z_erofs_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive)
{
+ struct z_erofs_pcluster *pcl = fe->pcl;
int ret;
if (exclusive) {
- /* give priority for inplaceio to use file pages first */
- if (z_erofs_try_inplace_io(fe, bvec))
- return 0;
+ /* Inplace I/O is limited to one page for uncompressed data */
+ if (pcl->algorithmformat < Z_EROFS_COMPRESSION_MAX ||
+ fe->icur <= 1) {
+ /* Try to prioritize inplace I/O here */
+ spin_lock(&pcl->lockref.lock);
+ while (fe->icur > 0) {
+ if (pcl->compressed_bvecs[--fe->icur].page)
+ continue;
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ spin_unlock(&pcl->lockref.lock);
+ return 0;
+ }
+ spin_unlock(&pcl->lockref.lock);
+ }
+
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
!fe->candidate_bvpage)
fe->candidate_bvpage = bvec->page;
}
- ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
+ ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
+ &fe->pagepool);
fe->pcl->vcnt += (ret >= 0);
return ret;
}
-static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
+static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *pcl = f->pcl;
- z_erofs_next_pcluster_t *owned_head = &f->owned_head;
-
- /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
- if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
- *owned_head) == Z_EROFS_PCLUSTER_NIL) {
- *owned_head = &pcl->next;
- /* so we can attach this pcluster to our submission chain. */
- f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
- return;
- }
+ if (lockref_get_not_zero(&pcl->lockref))
+ return true;
- /*
- * type 2, link to the end of an existing open chain, be careful
- * that its submission is controlled by the original attached chain.
- */
- if (*owned_head != &pcl->next && pcl != f->tailpcl &&
- cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
- *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
- *owned_head = Z_EROFS_PCLUSTER_TAIL;
- f->mode = Z_EROFS_PCLUSTER_HOOKED;
- f->tailpcl = NULL;
- return;
+ spin_lock(&pcl->lockref.lock);
+ if (__lockref_is_dead(&pcl->lockref)) {
+ spin_unlock(&pcl->lockref.lock);
+ return false;
}
- /* type 3, it belongs to a chain, but it isn't the end of the chain */
- f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
+
+ if (!pcl->lockref.count++)
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ spin_unlock(&pcl->lockref.lock);
+ return true;
}
-static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
+static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
- bool ztailpacking = map->m_flags & EROFS_MAP_META;
- struct z_erofs_pcluster *pcl;
- struct erofs_workgroup *grp;
+ struct super_block *sb = fe->inode->i_sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct z_erofs_pcluster *pcl, *pre;
+ unsigned int pageofs_in;
int err;
- if (!(map->m_flags & EROFS_MAP_ENCODED) ||
- (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
-
- /* no available pcluster, let's allocate one */
- pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
- map->m_plen >> PAGE_SHIFT);
+ pageofs_in = erofs_blkoff(sb, map->m_pa);
+ pcl = z_erofs_alloc_pcluster(pageofs_in + map->m_plen);
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- atomic_set(&pcl->obj.refcount, 1);
+ lockref_init(&pcl->lockref); /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
+ pcl->pclustersize = map->m_plen;
pcl->length = 0;
pcl->partial = true;
-
- /* new pclusters should be claimed as type 1, primary and followed */
- pcl->next = fe->owned_head;
+ pcl->next = fe->head;
+ pcl->pos = map->m_pa;
+ pcl->pageofs_in = pageofs_in;
pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ pcl->from_meta = map->m_flags & EROFS_MAP_META;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
/*
@@ -525,31 +769,29 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
mutex_init(&pcl->lock);
DBG_BUGON(!mutex_trylock(&pcl->lock));
- if (ztailpacking) {
- pcl->obj.index = 0; /* which indicates ztailpacking */
- pcl->pageofs_in = erofs_blkoff(map->m_pa);
- pcl->tailpacking_size = map->m_plen;
- } else {
- pcl->obj.index = map->m_pa >> PAGE_SHIFT;
-
- grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
- if (IS_ERR(grp)) {
- err = PTR_ERR(grp);
- goto err_out;
+ if (!pcl->from_meta) {
+ while (1) {
+ xa_lock(&sbi->managed_pslots);
+ pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->pos,
+ NULL, pcl, GFP_KERNEL);
+ if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
+ xa_unlock(&sbi->managed_pslots);
+ break;
+ }
+ /* try to legitimize the current in-tree one */
+ xa_unlock(&sbi->managed_pslots);
+ cond_resched();
}
-
- if (grp != &pcl->obj) {
- fe->pcl = container_of(grp,
- struct z_erofs_pcluster, obj);
+ if (xa_is_err(pre)) {
+ err = xa_err(pre);
+ goto err_out;
+ } else if (pre) {
+ fe->pcl = pre;
err = -EEXIST;
goto err_out;
}
}
- /* used to check tail merging loop due to corrupted images */
- if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
- fe->tailpcl = pcl;
- fe->owned_head = &pcl->next;
- fe->pcl = pcl;
+ fe->head = fe->pcl = pcl;
return 0;
err_out:
@@ -558,28 +800,33 @@ err_out:
return err;
}
-static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
+static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
- struct erofs_workgroup *grp = NULL;
+ struct super_block *sb = fe->inode->i_sb;
+ struct z_erofs_pcluster *pcl = NULL;
+ void *ptr;
int ret;
DBG_BUGON(fe->pcl);
-
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
- DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
- DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ DBG_BUGON(!fe->head);
if (!(map->m_flags & EROFS_MAP_META)) {
- grp = erofs_find_workgroup(fe->inode->i_sb,
- map->m_pa >> PAGE_SHIFT);
- } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
+ while (1) {
+ rcu_read_lock();
+ pcl = xa_load(&EROFS_SB(sb)->managed_pslots, map->m_pa);
+ if (!pcl || z_erofs_get_pcluster(pcl)) {
+ DBG_BUGON(pcl && map->m_pa != pcl->pos);
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ }
}
- if (grp) {
- fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ if (pcl) {
+ fe->pcl = pcl;
ret = -EEXIST;
} else {
ret = z_erofs_register_pcluster(fe);
@@ -587,249 +834,268 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
if (ret == -EEXIST) {
mutex_lock(&fe->pcl->lock);
- /* used to check tail merging loop due to corrupted images */
- if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
- fe->tailpcl = fe->pcl;
-
- z_erofs_try_to_claim_pcluster(fe);
+ /* check if this pcluster hasn't been linked into any chain. */
+ if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) {
+ /* .. so it can be attached to our submission chain */
+ fe->head = fe->pcl;
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
+ } else { /* otherwise, it belongs to an inflight chain */
+ fe->mode = Z_EROFS_PCLUSTER_INFLIGHT;
+ }
} else if (ret) {
return ret;
}
+
z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
- /* since file-backed online pages are traversed in reverse order */
+ if (!fe->pcl->from_meta) {
+ /* bind cache first when cached decompression is preferred */
+ z_erofs_bind_cache(fe);
+ } else {
+ ret = erofs_init_metabuf(&map->buf, sb,
+ erofs_inode_in_metabox(fe->inode));
+ if (ret)
+ return ret;
+ ptr = erofs_bread(&map->buf, map->m_pa, false);
+ if (IS_ERR(ptr)) {
+ ret = PTR_ERR(ptr);
+ erofs_err(sb, "failed to get inline folio %d", ret);
+ return ret;
+ }
+ folio_get(page_folio(map->buf.page));
+ WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
+ fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
+ }
+ /* file-backed inplace I/O pages are traversed in reverse order */
fe->icur = z_erofs_pclusterpages(fe->pcl);
return 0;
}
-/*
- * keep in mind that no referenced pclusters will be freed
- * only after a RCU grace period.
- */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
- z_erofs_free_pcluster(container_of(head,
- struct z_erofs_pcluster, rcu));
+ z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu));
}
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
+ if (pcl->lockref.count)
+ return false;
+
+ /*
+ * Note that all cached folios should be detached before deleted from
+ * the XArray. Otherwise some folios could be still attached to the
+ * orphan old pcluster when the new one is available in the tree.
+ */
+ if (erofs_try_to_free_all_cached_folios(sbi, pcl))
+ return false;
- call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ /*
+ * It's impossible to fail after the pcluster is freezed, but in order
+ * to avoid some race conditions, add a DBG_BUGON to observe this.
+ */
+ DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->pos) != pcl);
+
+ lockref_mark_dead(&pcl->lockref);
+ return true;
}
-static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
+static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
+{
+ bool free;
+
+ spin_lock(&pcl->lockref.lock);
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ spin_unlock(&pcl->lockref.lock);
+ if (free) {
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ }
+ return free;
+}
+
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr)
+{
+ struct z_erofs_pcluster *pcl;
+ unsigned long index, freed = 0;
+
+ xa_lock(&sbi->managed_pslots);
+ xa_for_each(&sbi->managed_pslots, index, pcl) {
+ /* try to shrink each valid pcluster */
+ if (!erofs_try_to_release_pcluster(sbi, pcl))
+ continue;
+ xa_unlock(&sbi->managed_pslots);
+
+ ++freed;
+ if (!--nr)
+ return freed;
+ xa_lock(&sbi->managed_pslots);
+ }
+ xa_unlock(&sbi->managed_pslots);
+ return freed;
+}
+
+static void z_erofs_put_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl, bool try_free)
+{
+ bool free = false;
+
+ if (lockref_put_or_lock(&pcl->lockref))
+ return;
+
+ DBG_BUGON(__lockref_is_dead(&pcl->lockref));
+ if (!--pcl->lockref.count) {
+ if (try_free && xa_trylock(&sbi->managed_pslots)) {
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ xa_unlock(&sbi->managed_pslots);
+ }
+ atomic_long_add(!free, &erofs_global_shrink_cnt);
+ }
+ spin_unlock(&pcl->lockref.lock);
+ if (free)
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+}
+
+static void z_erofs_pcluster_end(struct z_erofs_frontend *fe)
{
struct z_erofs_pcluster *pcl = fe->pcl;
if (!pcl)
- return false;
+ return;
z_erofs_bvec_iter_end(&fe->biter);
mutex_unlock(&pcl->lock);
- if (fe->candidate_bvpage) {
- DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
+ if (fe->candidate_bvpage)
fe->candidate_bvpage = NULL;
- }
- /*
- * if all pending pages are added, don't hold its reference
- * any longer if the pcluster isn't hosted by ourselves.
- */
+ /* Drop refcount if it doesn't belong to our processing chain */
if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
- erofs_workgroup_put(&pcl->obj);
-
+ z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false);
fe->pcl = NULL;
- return true;
}
-static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
- struct page *page, unsigned int pageofs,
- unsigned int len)
+static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio,
+ unsigned int cur, unsigned int end, erofs_off_t pos)
{
- struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode;
+ struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- u8 *src, *dst;
- unsigned int i, cnt;
+ unsigned int cnt;
+ u8 *src;
if (!packed_inode)
return -EFSCORRUPTED;
- pos += EROFS_I(inode)->z_fragmentoff;
- for (i = 0; i < len; i += cnt) {
- cnt = min_t(unsigned int, len - i,
- EROFS_BLKSIZ - erofs_blkoff(pos));
- src = erofs_bread(&buf, packed_inode,
- erofs_blknr(pos), EROFS_KMAP);
+ buf.mapping = packed_inode->i_mapping;
+ for (; cur < end; cur += cnt, pos += cnt) {
+ cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos));
+ src = erofs_bread(&buf, pos, true);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
}
-
- dst = kmap_local_page(page);
- memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt);
- kunmap_local(dst);
- pos += cnt;
+ memcpy_to_folio(folio, cur, src, cnt);
}
erofs_put_metabuf(&buf);
return 0;
}
-static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, struct page **pagepool)
+static int z_erofs_scan_folio(struct z_erofs_frontend *f,
+ struct folio *folio, bool ra)
{
- struct inode *const inode = fe->inode;
- struct erofs_map_blocks *const map = &fe->map;
- const loff_t offset = page_offset(page);
- bool tight = true, exclusive;
- unsigned int cur, end, spiltted;
+ struct inode *const inode = f->inode;
+ struct erofs_map_blocks *const map = &f->map;
+ const loff_t offset = folio_pos(folio);
+ const unsigned int bs = i_blocksize(inode);
+ unsigned int end = folio_size(folio), split = 0, cur, pgs;
+ bool tight, excl;
int err = 0;
- /* register locked file pages as online pages in pack */
- z_erofs_onlinepage_init(page);
-
- spiltted = 0;
- end = PAGE_SIZE;
-repeat:
- cur = end - 1;
-
- if (offset + cur < map->m_la ||
- offset + cur >= map->m_la + map->m_llen) {
- erofs_dbg("out-of-range map @ pos %llu", offset + cur);
-
- if (z_erofs_collector_end(fe))
- fe->backmost = false;
- map->m_la = offset + cur;
- map->m_llen = 0;
- err = z_erofs_map_blocks_iter(inode, map, 0);
- if (err)
- goto out;
- } else {
- if (fe->pcl)
- goto hitted;
- /* didn't get a valid pcluster previously (very rare) */
- }
-
- if (!(map->m_flags & EROFS_MAP_MAPPED) ||
- map->m_flags & EROFS_MAP_FRAGMENT)
- goto hitted;
-
- err = z_erofs_collector_begin(fe);
- if (err)
- goto out;
-
- if (z_erofs_is_inline_pcluster(fe->pcl)) {
- void *mp;
-
- mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
- erofs_blknr(map->m_pa), EROFS_NO_KMAP);
- if (IS_ERR(mp)) {
- err = PTR_ERR(mp);
- erofs_err(inode->i_sb,
- "failed to get inline page, err %d", err);
- goto out;
+ tight = (bs == PAGE_SIZE);
+ erofs_onlinefolio_init(folio);
+ do {
+ if (offset + end - 1 < map->m_la ||
+ offset + end - 1 >= map->m_la + map->m_llen) {
+ z_erofs_pcluster_end(f);
+ map->m_la = offset + end - 1;
+ map->m_llen = 0;
+ err = z_erofs_map_blocks_iter(inode, map, 0);
+ if (err)
+ break;
}
- get_page(fe->map.buf.page);
- WRITE_ONCE(fe->pcl->compressed_bvecs[0].page,
- fe->map.buf.page);
- fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
- } else {
- /* bind cache first when cached decompression is preferred */
- z_erofs_bind_cache(fe, pagepool);
- }
-hitted:
- /*
- * Ensure the current partial page belongs to this submit chain rather
- * than other concurrent submit chains or the noio(bypass) chain since
- * those chains are handled asynchronously thus the page cannot be used
- * for inplace I/O or bvpage (should be processed in a strict order.)
- */
- tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
- fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
- cur = end - min_t(unsigned int, offset + end - map->m_la, end);
- if (!(map->m_flags & EROFS_MAP_MAPPED)) {
- zero_user_segment(page, cur, end);
- goto next_part;
- }
- if (map->m_flags & EROFS_MAP_FRAGMENT) {
- unsigned int pageofs, skip, len;
-
- if (offset > map->m_la) {
- pageofs = 0;
- skip = offset - map->m_la;
+ cur = offset > map->m_la ? 0 : map->m_la - offset;
+ pgs = round_down(cur, PAGE_SIZE);
+ /* bump split parts first to avoid several separate cases */
+ ++split;
+
+ if (!(map->m_flags & EROFS_MAP_MAPPED)) {
+ folio_zero_segment(folio, cur, end);
+ tight = false;
+ } else if (map->m_flags & __EROFS_MAP_FRAGMENT) {
+ erofs_off_t fpos = offset + cur - map->m_la;
+
+ err = z_erofs_read_fragment(inode->i_sb, folio, cur,
+ cur + min(map->m_llen - fpos, end - cur),
+ EROFS_I(inode)->z_fragmentoff + fpos);
+ if (err)
+ break;
+ tight = false;
} else {
- pageofs = map->m_la & ~PAGE_MASK;
- skip = 0;
- }
- len = min_t(unsigned int, map->m_llen - skip, end - cur);
- err = z_erofs_read_fragment(inode, skip, page, pageofs, len);
- if (err)
- goto out;
- ++spiltted;
- tight = false;
- goto next_part;
- }
-
- exclusive = (!cur && (!spiltted || tight));
- if (cur)
- tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
-
-retry:
- err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
- .page = page,
- .offset = offset - map->m_la,
- .end = end,
- }), exclusive);
- /* should allocate an additional short-lived page for bvset */
- if (err == -EAGAIN && !fe->candidate_bvpage) {
- fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
- set_page_private(fe->candidate_bvpage,
- Z_EROFS_SHORTLIVED_PAGE);
- goto retry;
- }
-
- if (err) {
- DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
- goto out;
- }
-
- z_erofs_onlinepage_split(page);
- /* bump up the number of spiltted parts of a page */
- ++spiltted;
- if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
- fe->pcl->multibases = true;
- if (fe->pcl->length < offset + end - map->m_la) {
- fe->pcl->length = offset + end - map->m_la;
- fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
- }
- if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
- !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
- fe->pcl->length == map->m_llen)
- fe->pcl->partial = false;
-next_part:
- /* shorten the remaining extent to update progress */
- map->m_llen = offset + cur - map->m_la;
- map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
-
- end = cur;
- if (end > 0)
- goto repeat;
+ if (!f->pcl) {
+ err = z_erofs_pcluster_begin(f);
+ if (err)
+ break;
+ f->pcl->besteffort |= !ra;
+ }
-out:
- if (err)
- z_erofs_page_mark_eio(page);
- z_erofs_onlinepage_endio(page);
+ pgs = round_down(end - 1, PAGE_SIZE);
+ /*
+ * Ensure this partial page belongs to this submit chain
+ * rather than other concurrent submit chains or
+ * noio(bypass) chains since those chains are handled
+ * asynchronously thus it cannot be used for inplace I/O
+ * or bvpage (should be processed in the strict order.)
+ */
+ tight &= (f->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
+ excl = false;
+ if (cur <= pgs) {
+ excl = (split <= 1) || tight;
+ cur = pgs;
+ }
- erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
- __func__, page, spiltted, map->m_llen);
+ err = z_erofs_attach_page(f, &((struct z_erofs_bvec) {
+ .page = folio_page(folio, pgs >> PAGE_SHIFT),
+ .offset = offset + pgs - map->m_la,
+ .end = end - pgs, }), excl);
+ if (err)
+ break;
+
+ erofs_onlinefolio_split(folio);
+ if (f->pcl->length < offset + end - map->m_la) {
+ f->pcl->length = offset + end - map->m_la;
+ f->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ }
+ if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
+ !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
+ f->pcl->length == map->m_llen)
+ f->pcl->partial = false;
+ }
+ /* shorten the remaining extent to update progress */
+ map->m_llen = offset + cur - map->m_la;
+ map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
+ if (cur <= pgs) {
+ split = cur < pgs;
+ tight = (bs == PAGE_SIZE);
+ }
+ } while ((end = cur) > 0);
+ erofs_onlinefolio_end(folio, err, false);
return err;
}
-static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
+static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
unsigned int readahead_pages)
{
/* auto: enable for read_folio, disable for readahead */
@@ -846,14 +1112,13 @@ static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
static bool z_erofs_page_is_invalidated(struct page *page)
{
- return !page->mapping && !z_erofs_is_shortlived_page(page);
+ return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page);
}
-struct z_erofs_decompress_backend {
+struct z_erofs_backend {
struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
struct super_block *sb;
struct z_erofs_pcluster *pcl;
-
/* pages with the longest decompressed length for deduplication */
struct page **decompressed_pages;
/* pages to keep the compressed data */
@@ -862,6 +1127,8 @@ struct z_erofs_decompress_backend {
struct list_head decompressed_secondary_bvecs;
struct page **pagepool;
unsigned int onstack_used, nr_pages;
+ /* indicate if temporary copies should be preserved for later use */
+ bool keepxcpy;
};
struct z_erofs_bvec_item {
@@ -869,20 +1136,23 @@ struct z_erofs_bvec_item {
struct list_head list;
};
-static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
+static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be,
struct z_erofs_bvec *bvec)
{
+ int poff = bvec->offset + be->pcl->pageofs_out;
struct z_erofs_bvec_item *item;
-
- if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) {
- unsigned int pgnr;
-
- pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
- DBG_BUGON(pgnr >= be->nr_pages);
- if (!be->decompressed_pages[pgnr]) {
- be->decompressed_pages[pgnr] = bvec->page;
+ struct page **page;
+
+ if (!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE ||
+ bvec->offset + bvec->end == be->pcl->length)) {
+ DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages);
+ page = be->decompressed_pages + (poff >> PAGE_SHIFT);
+ if (!*page) {
+ *page = bvec->page;
return;
}
+ } else {
+ be->keepxcpy = true;
}
/* (cold path) one pcluster is requested multiple times */
@@ -891,8 +1161,7 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
list_add(&item->list, &be->decompressed_secondary_bvecs);
}
-static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
- int err)
+static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err)
{
unsigned int off0 = be->pcl->pageofs_out;
struct list_head *p, *n;
@@ -927,15 +1196,13 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
cur += len;
}
kunmap_local(dst);
- if (err)
- z_erofs_page_mark_eio(bvi->bvec.page);
- z_erofs_onlinepage_endio(bvi->bvec.page);
+ erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true);
list_del(p);
kfree(bvi);
}
}
-static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
+static void z_erofs_parse_out_bvecs(struct z_erofs_backend *be)
{
struct z_erofs_pcluster *pcl = be->pcl;
struct z_erofs_bvec_iter biter;
@@ -960,8 +1227,7 @@ static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
}
-static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
- bool *overlapped)
+static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped)
{
struct z_erofs_pcluster *pcl = be->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
@@ -972,46 +1238,42 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
struct page *page = bvec->page;
- /* compressed pages ought to be present before decompressing */
- if (!page) {
- DBG_BUGON(1);
+ /* compressed data ought to be valid when decompressing */
+ if (IS_ERR(page) || !page) {
+ bvec->page = NULL; /* clear the failure reason */
+ err = page ? PTR_ERR(page) : -EIO;
continue;
}
be->compressed_pages[i] = page;
- if (z_erofs_is_inline_pcluster(pcl)) {
+ if (pcl->from_meta ||
+ erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) {
if (!PageUptodate(page))
err = -EIO;
continue;
}
DBG_BUGON(z_erofs_page_is_invalidated(page));
- if (!z_erofs_is_shortlived_page(page)) {
- if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
- if (!PageUptodate(page))
- err = -EIO;
- continue;
- }
- z_erofs_do_decompressed_bvec(be, bvec);
- *overlapped = true;
- }
+ if (z_erofs_is_shortlived_page(page))
+ continue;
+ z_erofs_do_decompressed_bvec(be, bvec);
+ *overlapped = true;
}
-
- if (err)
- return err;
- return 0;
+ return err;
}
-static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
- int err)
+static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
{
struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
struct z_erofs_pcluster *pcl = be->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
- unsigned int i, inputsize;
- int err2;
+ const struct z_erofs_decompressor *alg =
+ z_erofs_decomp[pcl->algorithmformat];
+ bool try_free = true;
+ int i, j, jtop, err2;
struct page *page;
bool overlapped;
+ const char *reason;
mutex_lock(&pcl->lock);
be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
@@ -1032,115 +1294,133 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
if (!be->decompressed_pages)
be->decompressed_pages =
- kcalloc(be->nr_pages, sizeof(struct page *),
- GFP_KERNEL | __GFP_NOFAIL);
+ kvcalloc(be->nr_pages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
if (!be->compressed_pages)
be->compressed_pages =
- kcalloc(pclusterpages, sizeof(struct page *),
- GFP_KERNEL | __GFP_NOFAIL);
+ kvcalloc(pclusterpages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
z_erofs_parse_out_bvecs(be);
err2 = z_erofs_parse_in_bvecs(be, &overlapped);
if (err2)
err = err2;
- if (err)
- goto out;
-
- if (z_erofs_is_inline_pcluster(pcl))
- inputsize = pcl->tailpacking_size;
- else
- inputsize = pclusterpages * PAGE_SIZE;
-
- err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
+ if (!err) {
+ reason = alg->decompress(&(struct z_erofs_decompress_req) {
.sb = be->sb,
.in = be->compressed_pages,
.out = be->decompressed_pages,
+ .inpages = pclusterpages,
+ .outpages = be->nr_pages,
.pageofs_in = pcl->pageofs_in,
.pageofs_out = pcl->pageofs_out,
- .inputsize = inputsize,
+ .inputsize = pcl->pclustersize,
.outputsize = pcl->length,
.alg = pcl->algorithmformat,
.inplace_io = overlapped,
.partial_decoding = pcl->partial,
- .fillgaps = pcl->multibases,
+ .fillgaps = be->keepxcpy,
+ .gfp = pcl->besteffort ? GFP_KERNEL :
+ GFP_NOWAIT | __GFP_NORETRY
}, be->pagepool);
+ if (IS_ERR(reason)) {
+ erofs_err(be->sb, "failed to decompress (%s) %ld @ pa %llu size %u => %u",
+ alg->name, PTR_ERR(reason), pcl->pos,
+ pcl->pclustersize, pcl->length);
+ err = PTR_ERR(reason);
+ } else if (unlikely(reason)) {
+ erofs_err(be->sb, "failed to decompress (%s) %s @ pa %llu size %u => %u",
+ alg->name, reason, pcl->pos,
+ pcl->pclustersize, pcl->length);
+ err = -EFSCORRUPTED;
+ }
+ }
-out:
/* must handle all compressed pages before actual file pages */
- if (z_erofs_is_inline_pcluster(pcl)) {
- page = pcl->compressed_bvecs[0].page;
+ if (pcl->from_meta) {
+ folio_put(page_folio(pcl->compressed_bvecs[0].page));
WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
- put_page(page);
} else {
+ /* managed folios are still left in compressed_bvecs[] */
for (i = 0; i < pclusterpages; ++i) {
- page = pcl->compressed_bvecs[i].page;
-
- if (erofs_page_is_managed(sbi, page))
+ page = be->compressed_pages[i];
+ if (!page)
continue;
-
- /* recycle all individual short-lived pages */
+ if (erofs_folio_is_managed(sbi, page_folio(page))) {
+ try_free = false;
+ continue;
+ }
(void)z_erofs_put_shortlivedpage(be->pagepool, page);
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
}
}
if (be->compressed_pages < be->onstack_pages ||
be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
- kfree(be->compressed_pages);
- z_erofs_fill_other_copies(be, err);
+ kvfree(be->compressed_pages);
+ jtop = 0;
+ z_erofs_fill_other_copies(be, err);
for (i = 0; i < be->nr_pages; ++i) {
page = be->decompressed_pages[i];
if (!page)
continue;
DBG_BUGON(z_erofs_page_is_invalidated(page));
-
- /* recycle all individual short-lived pages */
- if (z_erofs_put_shortlivedpage(be->pagepool, page))
+ if (!z_erofs_is_shortlived_page(page)) {
+ erofs_onlinefolio_end(page_folio(page), err, true);
+ continue;
+ }
+ if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
+ erofs_pagepool_add(be->pagepool, page);
continue;
- if (err)
- z_erofs_page_mark_eio(page);
- z_erofs_onlinepage_endio(page);
+ }
+ for (j = 0; j < jtop && be->decompressed_pages[j] != page; ++j)
+ ;
+ if (j >= jtop) /* this bounce page is newly detected */
+ be->decompressed_pages[jtop++] = page;
}
-
+ while (jtop)
+ erofs_pagepool_add(be->pagepool,
+ be->decompressed_pages[--jtop]);
if (be->decompressed_pages != be->onstack_pages)
- kfree(be->decompressed_pages);
+ kvfree(be->decompressed_pages);
pcl->length = 0;
pcl->partial = true;
- pcl->multibases = false;
+ pcl->besteffort = false;
pcl->bvset.nextpage = NULL;
pcl->vcnt = 0;
/* pcluster lock MUST be taken before the following line */
- WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
+ WRITE_ONCE(pcl->next, NULL);
mutex_unlock(&pcl->lock);
+
+ if (pcl->from_meta)
+ z_erofs_free_pcluster(pcl);
+ else
+ z_erofs_put_pcluster(sbi, pcl, try_free);
return err;
}
-static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
- struct page **pagepool)
+static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
+ struct page **pagepool)
{
- struct z_erofs_decompress_backend be = {
+ struct z_erofs_backend be = {
.sb = io->sb,
.pagepool = pagepool,
.decompressed_secondary_bvecs =
LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
+ .pcl = io->head,
};
- z_erofs_next_pcluster_t owned = io->head;
+ struct z_erofs_pcluster *next;
+ int err = io->eio ? -EIO : 0;
- while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
- /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
- DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
- /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
- DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
-
- be.pcl = container_of(owned, struct z_erofs_pcluster, next);
- owned = READ_ONCE(be.pcl->next);
-
- z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
- erofs_workgroup_put(&be.pcl->obj);
+ for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) {
+ DBG_BUGON(!be.pcl);
+ next = READ_ONCE(be.pcl->next);
+ err = z_erofs_decompress_pcluster(&be, err) ?: err;
}
+ return err;
}
static void z_erofs_decompressqueue_work(struct work_struct *work)
@@ -1149,20 +1429,36 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
container_of(work, struct z_erofs_decompressqueue, u.work);
struct page *pagepool = NULL;
- DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
z_erofs_decompress_queue(bgq, &pagepool);
-
erofs_release_pages(&pagepool);
kvfree(bgq);
}
+#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
+static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
+{
+ z_erofs_decompressqueue_work((struct work_struct *)work);
+}
+#endif
+
+/* Use (kthread_)work in atomic contexts to minimize scheduling overhead */
+static inline bool z_erofs_in_atomic(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth())
+ return true;
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return true;
+ return !preemptible();
+}
+
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
- bool sync, int bios)
+ int bios)
{
struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
/* wake up the caller thread for sync decompression */
- if (sync) {
+ if (io->sync) {
if (!atomic_add_return(bios, &io->pending_bios))
complete(&io->u.done);
return;
@@ -1170,9 +1466,23 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
if (atomic_add_return(bios, &io->pending_bios))
return;
- /* Use workqueue and sync decompression for atomic contexts only */
- if (in_atomic() || irqs_disabled()) {
+ if (z_erofs_in_atomic()) {
+#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
+ struct kthread_worker *worker;
+
+ rcu_read_lock();
+ worker = rcu_dereference(
+ z_erofs_pcpu_workers[raw_smp_processor_id()]);
+ if (!worker) {
+ INIT_WORK(&io->u.work, z_erofs_decompressqueue_work);
+ queue_work(z_erofs_workqueue, &io->u.work);
+ } else {
+ kthread_queue_work(worker, &io->u.kthread_work);
+ }
+ rcu_read_unlock();
+#else
queue_work(z_erofs_workqueue, &io->u.work);
+#endif
/* enable sync decompression for readahead */
if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
@@ -1181,122 +1491,112 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
z_erofs_decompressqueue_work(&io->u.work);
}
-static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
- unsigned int nr,
- struct page **pagepool,
- struct address_space *mc)
+static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
+ struct z_erofs_frontend *f,
+ struct z_erofs_pcluster *pcl,
+ unsigned int nr,
+ struct address_space *mc)
{
- const pgoff_t index = pcl->obj.index;
gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
-
+ struct z_erofs_bvec zbv;
struct address_space *mapping;
- struct page *oldpage, *page;
-
- compressed_page_t t;
- int justfound;
+ struct folio *folio;
+ struct page *page;
+ int bs = i_blocksize(f->inode);
+ /* Except for inplace folios, the entire folio can be used for I/Os */
+ bvec->bv_offset = 0;
+ bvec->bv_len = PAGE_SIZE;
repeat:
- page = READ_ONCE(pcl->compressed_bvecs[nr].page);
- oldpage = page;
-
- if (!page)
- goto out_allocpage;
-
- /* process the target tagged pointer */
- t = tagptr_init(compressed_page_t, page);
- justfound = tagptr_unfold_tags(t);
- page = tagptr_unfold_ptr(t);
-
- /*
- * preallocated cached pages, which is used to avoid direct reclaim
- * otherwise, it will go inplace I/O path instead.
- */
- if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
- WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
- set_page_private(page, 0);
+ spin_lock(&pcl->lockref.lock);
+ zbv = pcl->compressed_bvecs[nr];
+ spin_unlock(&pcl->lockref.lock);
+ if (!zbv.page)
+ goto out_allocfolio;
+
+ bvec->bv_page = zbv.page;
+ DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page));
+
+ folio = page_folio(zbv.page);
+ /* For preallocated managed folios, add them to page cache here */
+ if (folio->private == Z_EROFS_PREALLOCATED_FOLIO) {
tocache = true;
goto out_tocache;
}
- mapping = READ_ONCE(page->mapping);
+ mapping = READ_ONCE(folio->mapping);
/*
- * file-backed online pages in plcuster are all locked steady,
- * therefore it is impossible for `mapping' to be NULL.
+ * File-backed folios for inplace I/Os are all locked steady,
+ * therefore it is impossible for `mapping` to be NULL.
*/
- if (mapping && mapping != mc)
- /* ought to be unmanaged pages */
- goto out;
-
- /* directly return for shortlived page as well */
- if (z_erofs_is_shortlived_page(page))
- goto out;
-
- lock_page(page);
-
- /* only true if page reclaim goes wrong, should never happen */
- DBG_BUGON(justfound && PagePrivate(page));
-
- /* the page is still in manage cache */
- if (page->mapping == mc) {
- WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
-
- if (!PagePrivate(page)) {
- /*
- * impossible to be !PagePrivate(page) for
- * the current restriction as well if
- * the page is already in compressed_bvecs[].
- */
- DBG_BUGON(!justfound);
+ if (mapping && mapping != mc) {
+ if (zbv.offset < 0)
+ bvec->bv_offset = round_up(-zbv.offset, bs);
+ bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
+ return;
+ }
- justfound = 0;
- set_page_private(page, (unsigned long)pcl);
- SetPagePrivate(page);
+ folio_lock(folio);
+ if (likely(folio->mapping == mc)) {
+ /*
+ * The cached folio is still in managed cache but without
+ * a valid `->private` pcluster hint. Let's reconnect them.
+ */
+ if (!folio_test_private(folio)) {
+ folio_attach_private(folio, pcl);
+ /* compressed_bvecs[] already takes a ref before */
+ folio_put(folio);
}
-
- /* no need to submit io if it is already up-to-date */
- if (PageUptodate(page)) {
- unlock_page(page);
- page = NULL;
+ if (likely(folio->private == pcl)) {
+ /* don't submit cache I/Os again if already uptodate */
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ bvec->bv_page = NULL;
+ }
+ return;
}
- goto out;
+ /*
+ * Already linked with another pcluster, which only appears in
+ * crafted images by fuzzers for now. But handle this anyway.
+ */
+ tocache = false; /* use temporary short-lived pages */
+ } else {
+ DBG_BUGON(1); /* referenced managed folios can't be truncated */
+ tocache = true;
}
-
- /*
- * the managed page has been truncated, it's unsafe to
- * reuse this one, let's allocate a new cache-managed page.
- */
- DBG_BUGON(page->mapping);
- DBG_BUGON(!justfound);
-
- tocache = true;
- unlock_page(page);
- put_page(page);
-out_allocpage:
- page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
- if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
- oldpage, page)) {
- erofs_pagepool_add(pagepool, page);
+ folio_unlock(folio);
+ folio_put(folio);
+out_allocfolio:
+ page = __erofs_allocpage(&f->pagepool, gfp, true);
+ spin_lock(&pcl->lockref.lock);
+ if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
+ if (page)
+ erofs_pagepool_add(&f->pagepool, page);
+ spin_unlock(&pcl->lockref.lock);
cond_resched();
goto repeat;
}
+ pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
+ spin_unlock(&pcl->lockref.lock);
+ bvec->bv_page = page;
+ if (!page)
+ return;
+ folio = page_folio(page);
out_tocache:
- if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
- /* turn into temporary page if fails (1 ref) */
- set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
- goto out;
+ if (!tocache || bs != PAGE_SIZE ||
+ filemap_add_folio(mc, folio, (pcl->pos >> PAGE_SHIFT) + nr, gfp)) {
+ /* turn into a temporary shortlived folio (1 ref) */
+ folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
+ return;
}
- attach_page_private(page, pcl);
- /* drop a refcount added by allocpage (then we have 2 refs here) */
- put_page(page);
-
-out: /* the only exit (for tracing and debugging) */
- return page;
+ folio_attach_private(folio, pcl);
+ /* drop a refcount added by allocpage (then 2 refs in total here) */
+ folio_put(folio);
}
-static struct z_erofs_decompressqueue *
-jobqueue_init(struct super_block *sb,
- struct z_erofs_decompressqueue *fgq, bool *fg)
+static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
+ struct z_erofs_decompressqueue *fgq, bool *fg)
{
struct z_erofs_decompressqueue *q;
@@ -1306,16 +1606,22 @@ jobqueue_init(struct super_block *sb,
*fg = true;
goto fg_out;
}
+#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
+ kthread_init_work(&q->u.kthread_work,
+ z_erofs_decompressqueue_kthread_work);
+#else
INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
+#endif
} else {
fg_out:
q = fgq;
init_completion(&fgq->u.done);
atomic_set(&fgq->pending_bios, 0);
q->eio = false;
+ q->sync = true;
}
q->sb = sb;
- q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+ q->head = Z_EROFS_PCLUSTER_TAIL;
return q;
}
@@ -1326,133 +1632,102 @@ enum {
NR_JOBQUEUES,
};
-static void *jobqueueset_init(struct super_block *sb,
- struct z_erofs_decompressqueue *q[],
- struct z_erofs_decompressqueue *fgq, bool *fg)
+static void z_erofs_move_to_bypass_queue(struct z_erofs_pcluster *pcl,
+ struct z_erofs_pcluster *next,
+ struct z_erofs_pcluster **qtail[])
{
- /*
- * if managed cache is enabled, bypass jobqueue is needed,
- * no need to read from device for all pclusters in this queue.
- */
- q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
- q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg);
-
- return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg));
-}
-
-static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
- z_erofs_next_pcluster_t qtail[],
- z_erofs_next_pcluster_t owned_head)
-{
- z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
- z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
-
- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
- if (owned_head == Z_EROFS_PCLUSTER_TAIL)
- owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
-
- WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
-
- WRITE_ONCE(*submit_qtail, owned_head);
- WRITE_ONCE(*bypass_qtail, &pcl->next);
-
+ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
+ WRITE_ONCE(*qtail[JQ_SUBMIT], next);
+ WRITE_ONCE(*qtail[JQ_BYPASS], pcl);
qtail[JQ_BYPASS] = &pcl->next;
}
-static void z_erofs_decompressqueue_endio(struct bio *bio)
+static void z_erofs_endio(struct bio *bio)
{
- tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
- struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
+ struct z_erofs_decompressqueue *q = bio->bi_private;
blk_status_t err = bio->bi_status;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
- DBG_BUGON(PageUptodate(page));
- DBG_BUGON(z_erofs_page_is_invalidated(page));
+ DBG_BUGON(folio_test_uptodate(folio));
+ DBG_BUGON(z_erofs_page_is_invalidated(&folio->page));
+ if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio))
+ continue;
- if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
- if (!err)
- SetPageUptodate(page);
- unlock_page(page);
- }
+ if (!err)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
}
if (err)
q->eio = true;
- z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
- bio_put(bio);
+ z_erofs_decompress_kickoff(q, -1);
+ if (bio->bi_bdev)
+ bio_put(bio);
}
-static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
- struct page **pagepool,
+static void z_erofs_submit_queue(struct z_erofs_frontend *f,
struct z_erofs_decompressqueue *fgq,
- bool *force_fg)
+ bool *force_fg, bool readahead)
{
struct super_block *sb = f->inode->i_sb;
struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
- z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
+ struct z_erofs_pcluster **qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
- void *bi_private;
- z_erofs_next_pcluster_t owned_head = f->owned_head;
+ struct z_erofs_pcluster *pcl, *next;
/* bio is NULL initially, so no need to initialize last_{index,bdev} */
- pgoff_t last_index;
- struct block_device *last_bdev;
+ erofs_off_t last_pa;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
unsigned long pflags;
int memstall = 0;
- bi_private = jobqueueset_init(sb, q, fgq, force_fg);
+ /* No need to read from device for pclusters in the bypass queue. */
+ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
+ q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);
+
qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
/* by default, all need io submission */
- q[JQ_SUBMIT]->head = owned_head;
+ q[JQ_SUBMIT]->head = next = f->head;
do {
struct erofs_map_dev mdev;
- struct z_erofs_pcluster *pcl;
- pgoff_t cur, end;
+ erofs_off_t cur, end;
+ struct bio_vec bvec;
unsigned int i = 0;
bool bypass = true;
- /* no possible 'owned_head' equals the following */
- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
-
- pcl = container_of(owned_head, struct z_erofs_pcluster, next);
-
- /* close the main owned chain at first */
- owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
- Z_EROFS_PCLUSTER_TAIL_CLOSED);
- if (z_erofs_is_inline_pcluster(pcl)) {
- move_to_bypass_jobqueue(pcl, qtail, owned_head);
+ pcl = next;
+ next = READ_ONCE(pcl->next);
+ if (pcl->from_meta) {
+ z_erofs_move_to_bypass_queue(pcl, next, qtail);
continue;
}
/* no device id here, thus it will always succeed */
mdev = (struct erofs_map_dev) {
- .m_pa = blknr_to_addr(pcl->obj.index),
+ .m_pa = round_down(pcl->pos, sb->s_blocksize),
};
(void)erofs_map_dev(sb, &mdev);
- cur = erofs_blknr(mdev.m_pa);
- end = cur + pcl->pclusterpages;
-
+ cur = mdev.m_pa;
+ end = round_up(cur + pcl->pageofs_in + pcl->pclustersize,
+ sb->s_blocksize);
do {
- struct page *page;
-
- page = pickup_page_for_submission(pcl, i++, pagepool,
- mc);
- if (!page)
- continue;
+ bvec.bv_page = NULL;
+ if (bio && (cur != last_pa ||
+ bio->bi_bdev != mdev.m_bdev)) {
+drain_io:
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ erofs_fileio_submit_bio(bio);
+ else if (erofs_is_fscache_mode(sb))
+ erofs_fscache_submit_bio(bio);
+ else
+ submit_bio(bio);
- if (bio && (cur != last_index + 1 ||
- last_bdev != mdev.m_bdev)) {
-submit_bio_retry:
- submit_bio(bio);
if (memstall) {
psi_memstall_leave(&pflags);
memstall = 0;
@@ -1460,132 +1735,144 @@ submit_bio_retry:
bio = NULL;
}
- if (unlikely(PageWorkingset(page)) && !memstall) {
+ if (!bvec.bv_page) {
+ z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
+ if (!bvec.bv_page)
+ continue;
+ if (cur + bvec.bv_len > end)
+ bvec.bv_len = end - cur;
+ DBG_BUGON(bvec.bv_len < sb->s_blocksize);
+ }
+
+ if (unlikely(PageWorkingset(bvec.bv_page)) &&
+ !memstall) {
psi_memstall_enter(&pflags);
memstall = 1;
}
if (!bio) {
- bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
- REQ_OP_READ, GFP_NOIO);
- bio->bi_end_io = z_erofs_decompressqueue_endio;
-
- last_bdev = mdev.m_bdev;
- bio->bi_iter.bi_sector = (sector_t)cur <<
- LOG_SECTORS_PER_BLOCK;
- bio->bi_private = bi_private;
- if (f->readahead)
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ bio = erofs_fileio_bio_alloc(&mdev);
+ else if (erofs_is_fscache_mode(sb))
+ bio = erofs_fscache_bio_alloc(&mdev);
+ else
+ bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+ REQ_OP_READ, GFP_NOIO);
+ bio->bi_end_io = z_erofs_endio;
+ bio->bi_iter.bi_sector =
+ (mdev.m_dif->fsoff + cur) >> 9;
+ bio->bi_private = q[JQ_SUBMIT];
+ if (readahead)
bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
}
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
- goto submit_bio_retry;
-
- last_index = cur;
+ if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
+ bvec.bv_offset))
+ goto drain_io;
+ last_pa = cur + bvec.bv_len;
bypass = false;
- } while (++cur < end);
+ } while ((cur += bvec.bv_len) < end);
if (!bypass)
qtail[JQ_SUBMIT] = &pcl->next;
else
- move_to_bypass_jobqueue(pcl, qtail, owned_head);
- } while (owned_head != Z_EROFS_PCLUSTER_TAIL);
+ z_erofs_move_to_bypass_queue(pcl, next, qtail);
+ } while (next != Z_EROFS_PCLUSTER_TAIL);
if (bio) {
- submit_bio(bio);
- if (memstall)
- psi_memstall_leave(&pflags);
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ erofs_fileio_submit_bio(bio);
+ else if (erofs_is_fscache_mode(sb))
+ erofs_fscache_submit_bio(bio);
+ else
+ submit_bio(bio);
}
+ if (memstall)
+ psi_memstall_leave(&pflags);
/*
* although background is preferred, no one is pending for submission.
- * don't issue workqueue for decompression but drop it directly instead.
+ * don't issue decompression but drop it directly instead.
*/
if (!*force_fg && !nr_bios) {
kvfree(q[JQ_SUBMIT]);
return;
}
- z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
+ z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
}
-static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
- struct page **pagepool, bool force_fg)
+static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
+ struct erofs_sb_info *sbi = EROFS_I_SB(f->inode);
+ bool force_fg = z_erofs_is_sync_decompress(sbi, rapages);
+ int err;
- if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
- return;
- z_erofs_submit_queue(f, pagepool, io, &force_fg);
+ if (f->head == Z_EROFS_PCLUSTER_TAIL)
+ return 0;
+ z_erofs_submit_queue(f, io, &force_fg, !!rapages);
/* handle bypass queue (no i/o pclusters) immediately */
- z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
-
+ err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
if (!force_fg)
- return;
+ return err;
/* wait until all bios are completed */
wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
- z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
+ return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err;
}
/*
* Since partial uptodate is still unimplemented for now, we have to use
* approximate readmore strategies as a start.
*/
-static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
- struct readahead_control *rac,
- erofs_off_t end,
- struct page **pagepool,
- bool backmost)
+static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
+ struct readahead_control *rac, bool backmost)
{
struct inode *inode = f->inode;
struct erofs_map_blocks *map = &f->map;
- erofs_off_t cur;
+ erofs_off_t cur, end, headoffset = f->headoffset;
int err;
if (backmost) {
+ if (rac)
+ end = headoffset + readahead_length(rac) - 1;
+ else
+ end = headoffset + PAGE_SIZE - 1;
map->m_la = end;
err = z_erofs_map_blocks_iter(inode, map,
EROFS_GET_BLOCKS_READMORE);
- if (err)
+ if (err || !(map->m_flags & EROFS_MAP_ENCODED))
return;
- /* expend ra for the trailing edge if readahead */
+ /* expand ra for the trailing edge if readahead */
if (rac) {
- loff_t newstart = readahead_pos(rac);
-
cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
- readahead_expand(rac, newstart, cur - newstart);
+ readahead_expand(rac, headoffset, cur - headoffset);
return;
}
end = round_up(end, PAGE_SIZE);
} else {
end = round_up(map->m_la, PAGE_SIZE);
-
- if (!map->m_llen)
+ if (!(map->m_flags & EROFS_MAP_ENCODED) || !map->m_llen)
return;
}
cur = map->m_la + map->m_llen - 1;
- while (cur >= end) {
+ while ((cur >= end) && (cur < i_size_read(inode))) {
pgoff_t index = cur >> PAGE_SHIFT;
- struct page *page;
-
- page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
- if (page) {
- if (PageUptodate(page)) {
- unlock_page(page);
- } else {
- err = z_erofs_do_read_page(f, page, pagepool);
- if (err)
- erofs_err(inode->i_sb,
- "readmore error at page %lu @ nid %llu",
- index, EROFS_I(inode)->nid);
- }
- put_page(page);
+ struct folio *folio;
+
+ folio = erofs_grab_folio_nowait(inode->i_mapping, index);
+ if (!IS_ERR_OR_NULL(folio)) {
+ if (folio_test_uptodate(folio))
+ folio_unlock(folio);
+ else
+ z_erofs_scan_folio(f, folio, !!rac);
+ folio_put(folio);
}
if (cur < PAGE_SIZE)
@@ -1596,77 +1883,58 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *const inode = page->mapping->host;
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
- struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *pagepool = NULL;
+ struct inode *const inode = folio->mapping->host;
+ Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio));
int err;
- trace_erofs_readpage(page, false);
- f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
-
- z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
- &pagepool, true);
- err = z_erofs_do_read_page(&f, page, &pagepool);
- z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
-
- (void)z_erofs_collector_end(&f);
-
- /* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(&f, &pagepool,
- z_erofs_get_sync_decompress_policy(sbi, 0));
+ trace_erofs_read_folio(folio, false);
+ z_erofs_pcluster_readmore(&f, NULL, true);
+ err = z_erofs_scan_folio(&f, folio, false);
+ z_erofs_pcluster_readmore(&f, NULL, false);
+ z_erofs_pcluster_end(&f);
- if (err)
- erofs_err(inode->i_sb, "failed to read, err [%d]", err);
+ /* if some pclusters are ready, need submit them anyway */
+ err = z_erofs_runqueue(&f, 0) ?: err;
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
+ err, folio->index, EROFS_I(inode)->nid);
erofs_put_metabuf(&f.map.buf);
- erofs_release_pages(&pagepool);
+ erofs_release_pages(&f.pagepool);
return err;
}
static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
- struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *pagepool = NULL, *head = NULL, *page;
- unsigned int nr_pages;
-
- f.readahead = true;
- f.headoffset = readahead_pos(rac);
-
- z_erofs_pcluster_readmore(&f, rac, f.headoffset +
- readahead_length(rac) - 1, &pagepool, true);
- nr_pages = readahead_count(rac);
- trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
-
- while ((page = readahead_page(rac))) {
- set_page_private(page, (unsigned long)head);
- head = page;
+ Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac));
+ unsigned int nrpages = readahead_count(rac);
+ struct folio *head = NULL, *folio;
+ int err;
+
+ trace_erofs_readahead(inode, readahead_index(rac), nrpages, false);
+ z_erofs_pcluster_readmore(&f, rac, true);
+ while ((folio = readahead_folio(rac))) {
+ folio->private = head;
+ head = folio;
}
+ /* traverse in reverse order for best metadata I/O performance */
while (head) {
- struct page *page = head;
- int err;
-
- /* traversal in reverse order */
- head = (void *)page_private(page);
-
- err = z_erofs_do_read_page(&f, page, &pagepool);
- if (err)
- erofs_err(inode->i_sb,
- "readahead error at page %lu @ nid %llu",
- page->index, EROFS_I(inode)->nid);
- put_page(page);
+ folio = head;
+ head = folio_get_private(folio);
+
+ err = z_erofs_scan_folio(&f, folio, true);
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
+ folio->index, EROFS_I(inode)->nid);
}
- z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
- (void)z_erofs_collector_end(&f);
+ z_erofs_pcluster_readmore(&f, rac, false);
+ z_erofs_pcluster_end(&f);
- z_erofs_runqueue(&f, &pagepool,
- z_erofs_get_sync_decompress_policy(sbi, nr_pages));
+ (void)z_erofs_runqueue(&f, nrpages);
erofs_put_metabuf(&f.map.buf);
- erofs_release_pages(&pagepool);
+ erofs_release_pages(&f.pagepool);
}
const struct address_space_operations z_erofs_aops = {
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
deleted file mode 100644
index d98c95212985..000000000000
--- a/fs/erofs/zdata.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2018 HUAWEI, Inc.
- * https://www.huawei.com/
- */
-#ifndef __EROFS_FS_ZDATA_H
-#define __EROFS_FS_ZDATA_H
-
-#include "internal.h"
-#include "tagptr.h"
-
-#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
-#define Z_EROFS_INLINE_BVECS 2
-
-/*
- * let's leave a type here in case of introducing
- * another tagged pointer later.
- */
-typedef void *z_erofs_next_pcluster_t;
-
-struct z_erofs_bvec {
- struct page *page;
- int offset;
- unsigned int end;
-};
-
-#define __Z_EROFS_BVSET(name, total) \
-struct name { \
- /* point to the next page which contains the following bvecs */ \
- struct page *nextpage; \
- struct z_erofs_bvec bvec[total]; \
-}
-__Z_EROFS_BVSET(z_erofs_bvset,);
-__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
-
-/*
- * Structure fields follow one of the following exclusion rules.
- *
- * I: Modifiable by initialization/destruction paths and read-only
- * for everyone else;
- *
- * L: Field should be protected by the pcluster lock;
- *
- * A: Field should be accessed / updated in atomic for parallelized code.
- */
-struct z_erofs_pcluster {
- struct erofs_workgroup obj;
- struct mutex lock;
-
- /* A: point to next chained pcluster or TAILs */
- z_erofs_next_pcluster_t next;
-
- /* L: the maximum decompression size of this round */
- unsigned int length;
-
- /* L: total number of bvecs */
- unsigned int vcnt;
-
- /* I: page offset of start position of decompression */
- unsigned short pageofs_out;
-
- /* I: page offset of inline compressed data */
- unsigned short pageofs_in;
-
- union {
- /* L: inline a certain number of bvec for bootstrap */
- struct z_erofs_bvset_inline bvset;
-
- /* I: can be used to free the pcluster by RCU. */
- struct rcu_head rcu;
- };
-
- union {
- /* I: physical cluster size in pages */
- unsigned short pclusterpages;
-
- /* I: tailpacking inline compressed size */
- unsigned short tailpacking_size;
- };
-
- /* I: compression algorithm format */
- unsigned char algorithmformat;
-
- /* L: whether partial decompression or not */
- bool partial;
-
- /* L: indicate several pageofs_outs or not */
- bool multibases;
-
- /* A: compressed bvecs (can be cached or inplaced pages) */
- struct z_erofs_bvec compressed_bvecs[];
-};
-
-/* let's avoid the valid 32-bit kernel addresses */
-
-/* the chained workgroup has't submitted io (still open) */
-#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
-/* the chained workgroup has already submitted io */
-#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD)
-
-#define Z_EROFS_PCLUSTER_NIL (NULL)
-
-struct z_erofs_decompressqueue {
- struct super_block *sb;
- atomic_t pending_bios;
- z_erofs_next_pcluster_t head;
-
- union {
- struct completion done;
- struct work_struct work;
- } u;
-
- bool eio;
-};
-
-static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
-{
- return !pcl->obj.index;
-}
-
-static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
-{
- if (z_erofs_is_inline_pcluster(pcl))
- return 1;
- return pcl->pclusterpages;
-}
-
-/*
- * bit 30: I/O error occurred on this page
- * bit 0 - 29: remaining parts to complete this page
- */
-#define Z_EROFS_PAGE_EIO (1 << 30)
-
-static inline void z_erofs_onlinepage_init(struct page *page)
-{
- union {
- atomic_t o;
- unsigned long v;
- } u = { .o = ATOMIC_INIT(1) };
-
- set_page_private(page, u.v);
- smp_wmb();
- SetPagePrivate(page);
-}
-
-static inline void z_erofs_onlinepage_split(struct page *page)
-{
- atomic_inc((atomic_t *)&page->private);
-}
-
-static inline void z_erofs_page_mark_eio(struct page *page)
-{
- int orig;
-
- do {
- orig = atomic_read((atomic_t *)&page->private);
- } while (atomic_cmpxchg((atomic_t *)&page->private, orig,
- orig | Z_EROFS_PAGE_EIO) != orig);
-}
-
-static inline void z_erofs_onlinepage_endio(struct page *page)
-{
- unsigned int v;
-
- DBG_BUGON(!PagePrivate(page));
- v = atomic_dec_return((atomic_t *)&page->private);
- if (!(v & ~Z_EROFS_PAGE_EIO)) {
- set_page_private(page, 0);
- ClearPagePrivate(page);
- if (!(v & Z_EROFS_PAGE_EIO))
- SetPageUptodate(page);
- unlock_page(page);
- }
-}
-
-#define Z_EROFS_ONSTACK_PAGES 32
-
-#endif
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 98fb90b9af71..c8d8e129eb4b 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -4,156 +4,12 @@
* https://www.huawei.com/
*/
#include "internal.h"
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <trace/events/erofs.h>
-static int z_erofs_do_map_blocks(struct inode *inode,
- struct erofs_map_blocks *map,
- int flags);
-
-int z_erofs_fill_inode(struct inode *inode)
-{
- struct erofs_inode *const vi = EROFS_I(inode);
- struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
-
- if (!erofs_sb_has_big_pcluster(sbi) &&
- !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) &&
- vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
- vi->z_advise = 0;
- vi->z_algorithmtype[0] = 0;
- vi->z_algorithmtype[1] = 0;
- vi->z_logical_clusterbits = LOG_BLOCK_SIZE;
- set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
- }
- inode->i_mapping->a_ops = &z_erofs_aops;
- return 0;
-}
-
-static int z_erofs_fill_inode_lazy(struct inode *inode)
-{
- struct erofs_inode *const vi = EROFS_I(inode);
- struct super_block *const sb = inode->i_sb;
- int err, headnr;
- erofs_off_t pos;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- void *kaddr;
- struct z_erofs_map_header *h;
-
- if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
- /*
- * paired with smp_mb() at the end of the function to ensure
- * fields will only be observed after the bit is set.
- */
- smp_mb();
- return 0;
- }
-
- if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
- return -ERESTARTSYS;
-
- err = 0;
- if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
- goto out_unlock;
-
- pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
- vi->xattr_isize, 8);
- kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- err = PTR_ERR(kaddr);
- goto out_unlock;
- }
-
- h = kaddr + erofs_blkoff(pos);
- /*
- * if the highest bit of the 8-byte map header is set, the whole file
- * is stored in the packed inode. The rest bits keeps z_fragmentoff.
- */
- if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) {
- vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
- vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63);
- vi->z_tailextent_headlcn = 0;
- goto done;
- }
- vi->z_advise = le16_to_cpu(h->h_advise);
- vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
- vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
-
- headnr = 0;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
- vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
- headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
- err = -EOPNOTSUPP;
- goto out_put_metabuf;
- }
-
- vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
- if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
- vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
- Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
- erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
- vi->nid);
- err = -EFSCORRUPTED;
- goto out_put_metabuf;
- }
- if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION &&
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
- erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
- vi->nid);
- err = -EFSCORRUPTED;
- goto out_put_metabuf;
- }
-
- if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
- struct erofs_map_blocks map = {
- .buf = __EROFS_BUF_INITIALIZER
- };
-
- vi->z_idata_size = le16_to_cpu(h->h_idata_size);
- err = z_erofs_do_map_blocks(inode, &map,
- EROFS_GET_BLOCKS_FINDTAIL);
- erofs_put_metabuf(&map.buf);
-
- if (!map.m_plen ||
- erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) {
- erofs_err(sb, "invalid tail-packing pclustersize %llu",
- map.m_plen);
- err = -EFSCORRUPTED;
- }
- if (err < 0)
- goto out_put_metabuf;
- }
-
- if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
- !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
- struct erofs_map_blocks map = {
- .buf = __EROFS_BUF_INITIALIZER
- };
-
- vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
- err = z_erofs_do_map_blocks(inode, &map,
- EROFS_GET_BLOCKS_FINDTAIL);
- erofs_put_metabuf(&map.buf);
- if (err < 0)
- goto out_put_metabuf;
- }
-done:
- /* paired with smp_mb() at the beginning of the function */
- smp_mb();
- set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
-out_put_metabuf:
- erofs_put_metabuf(&buf);
-out_unlock:
- clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
- return err;
-}
-
struct z_erofs_maprecorder {
struct inode *inode;
struct erofs_map_blocks *map;
- void *kaddr;
-
unsigned long lcn;
/* compression extent information gathered */
u8 type, headtype;
@@ -161,148 +17,159 @@ struct z_erofs_maprecorder {
u16 delta[2];
erofs_blk_t pblk, compressedblks;
erofs_off_t nextpackoff;
- bool partialref;
+ bool partialref, in_mbox;
};
-static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned long lcn)
+static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
+ unsigned long lcn)
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
- const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid);
- const erofs_off_t pos =
- Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize +
- vi->xattr_isize) +
- lcn * sizeof(struct z_erofs_vle_decompressed_index);
- struct z_erofs_vle_decompressed_index *di;
- unsigned int advise, type;
-
- m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(pos), EROFS_KMAP);
- if (IS_ERR(m->kaddr))
- return PTR_ERR(m->kaddr);
-
- m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index);
+ const erofs_off_t pos = Z_EROFS_FULL_INDEX_START(erofs_iloc(inode) +
+ vi->inode_isize + vi->xattr_isize) +
+ lcn * sizeof(struct z_erofs_lcluster_index);
+ struct z_erofs_lcluster_index *di;
+ unsigned int advise;
+
+ di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox);
+ if (IS_ERR(di))
+ return PTR_ERR(di);
m->lcn = lcn;
- di = m->kaddr + erofs_blkoff(pos);
+ m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index);
advise = le16_to_cpu(di->di_advise);
- type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) &
- ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1);
- switch (type) {
- case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
- m->clusterofs = 1 << vi->z_logical_clusterbits;
+ m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK;
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ m->clusterofs = 1 << vi->z_lclusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
- if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+ if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) {
if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- m->compressedblks = m->delta[0] &
- ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+ m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT;
m->delta[0] = 1;
}
m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
- break;
- case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
- if (advise & Z_EROFS_VLE_DI_PARTIAL_REF)
- m->partialref = true;
+ } else {
+ m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF);
m->clusterofs = le16_to_cpu(di->di_clusterofs);
m->pblk = le32_to_cpu(di->di_u.blkaddr);
- break;
- default:
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
- m->type = type;
return 0;
}
static unsigned int decode_compactedbits(unsigned int lobits,
- unsigned int lomask,
u8 *in, unsigned int pos, u8 *type)
{
const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7);
- const unsigned int lo = v & lomask;
+ const unsigned int lo = v & ((1 << lobits) - 1);
*type = (v >> lobits) & 3;
return lo;
}
-static int get_compacted_la_distance(unsigned int lclusterbits,
+static int get_compacted_la_distance(unsigned int lobits,
unsigned int encodebits,
unsigned int vcnt, u8 *in, int i)
{
- const unsigned int lomask = (1 << lclusterbits) - 1;
unsigned int lo, d1 = 0;
u8 type;
DBG_BUGON(i >= vcnt);
do {
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * i, &type);
+ lo = decode_compactedbits(lobits, in, encodebits * i, &type);
- if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+ if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD)
return d1;
++d1;
} while (++i < vcnt);
- /* vcnt - 1 (Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) item */
- if (!(lo & Z_EROFS_VLE_DI_D0_CBLKCNT))
+ /* vcnt - 1 (Z_EROFS_LCLUSTER_TYPE_NONHEAD) item */
+ if (!(lo & Z_EROFS_LI_D0_CBLKCNT))
d1 += lo - 1;
return d1;
}
-static int unpack_compacted_index(struct z_erofs_maprecorder *m,
- unsigned int amortizedshift,
- erofs_off_t pos, bool lookahead)
+static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
+ unsigned long lcn, bool lookahead)
{
- struct erofs_inode *const vi = EROFS_I(m->inode);
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
- const unsigned int lomask = (1 << lclusterbits) - 1;
- unsigned int vcnt, base, lo, encodebits, nblk, eofs;
- int i;
+ struct inode *const inode = m->inode;
+ struct erofs_inode *const vi = EROFS_I(inode);
+ const erofs_off_t ebase = Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) +
+ vi->inode_isize + vi->xattr_isize);
+ const unsigned int lclusterbits = vi->z_lclusterbits;
+ const unsigned int totalidx = erofs_iblks(inode);
+ unsigned int compacted_4b_initial, compacted_2b, amortizedshift;
+ unsigned int vcnt, lo, lobits, encodebits, nblk, bytes;
+ bool big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
+ erofs_off_t pos;
u8 *in, type;
- bool big_pcluster;
+ int i;
- if (1 << amortizedshift == 4)
+ if (lcn >= totalidx || lclusterbits > 14)
+ return -EINVAL;
+
+ m->lcn = lcn;
+ /* used to align to 32-byte (compacted_2b) alignment */
+ compacted_4b_initial = ((32 - ebase % 32) / 4) & 7;
+ compacted_2b = 0;
+ if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) &&
+ compacted_4b_initial < totalidx)
+ compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
+
+ pos = ebase;
+ amortizedshift = 2; /* compact_4b */
+ if (lcn >= compacted_4b_initial) {
+ pos += compacted_4b_initial * 4;
+ lcn -= compacted_4b_initial;
+ if (lcn < compacted_2b) {
+ amortizedshift = 1;
+ } else {
+ pos += compacted_2b * 2;
+ lcn -= compacted_2b;
+ }
+ }
+ pos += lcn * (1 << amortizedshift);
+
+ /* figure out the lcluster count in this pack */
+ if (1 << amortizedshift == 4 && lclusterbits <= 14)
vcnt = 2;
- else if (1 << amortizedshift == 2 && lclusterbits == 12)
+ else if (1 << amortizedshift == 2 && lclusterbits <= 12)
vcnt = 16;
else
return -EOPNOTSUPP;
+ in = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox);
+ if (IS_ERR(in))
+ return PTR_ERR(in);
+
/* it doesn't equal to round_up(..) */
m->nextpackoff = round_down(pos, vcnt << amortizedshift) +
(vcnt << amortizedshift);
- big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
+ lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U);
encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
- eofs = erofs_blkoff(pos);
- base = round_down(eofs, vcnt << amortizedshift);
- in = m->kaddr + base;
+ bytes = pos & ((vcnt << amortizedshift) - 1);
+ in -= bytes;
+ i = bytes >> amortizedshift;
- i = (eofs - base) >> amortizedshift;
-
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * i, &type);
+ lo = decode_compactedbits(lobits, in, encodebits * i, &type);
m->type = type;
- if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
+ if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
m->clusterofs = 1 << lclusterbits;
/* figure out lookahead_distance: delta[1] if needed */
if (lookahead)
- m->delta[1] = get_compacted_la_distance(lclusterbits,
+ m->delta[1] = get_compacted_la_distance(lobits,
encodebits, vcnt, in, i);
- if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+ if (lo & Z_EROFS_LI_D0_CBLKCNT) {
if (!big_pcluster) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- m->compressedblks = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+ m->compressedblks = lo & ~Z_EROFS_LI_D0_CBLKCNT;
m->delta[0] = 1;
return 0;
} else if (i + 1 != (int)vcnt) {
@@ -314,11 +181,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
* of which lo saves delta[1] rather than delta[0].
* Hence, get delta[0] by the previous lcluster indirectly.
*/
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * (i - 1), &type);
- if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+ lo = decode_compactedbits(lobits, in,
+ encodebits * (i - 1), &type);
+ if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD)
lo = 0;
- else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT)
+ else if (lo & Z_EROFS_LI_D0_CBLKCNT)
lo = 1;
m->delta[0] = lo + 1;
return 0;
@@ -330,9 +197,9 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
nblk = 1;
while (i > 0) {
--i;
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * i, &type);
- if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+ lo = decode_compactedbits(lobits, in,
+ encodebits * i, &type);
+ if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD)
i -= lo;
if (i >= 0)
@@ -342,12 +209,12 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
nblk = 0;
while (i > 0) {
--i;
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * i, &type);
- if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
- if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+ lo = decode_compactedbits(lobits, in,
+ encodebits * i, &type);
+ if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ if (lo & Z_EROFS_LI_D0_CBLKCNT) {
--i;
- nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+ nblk += lo & ~Z_EROFS_LI_D0_CBLKCNT;
continue;
}
/* bigpcluster shouldn't have plain d0 == 1 */
@@ -366,119 +233,61 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
return 0;
}
-static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned long lcn, bool lookahead)
+static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
+ unsigned int lcn, bool lookahead)
{
- struct inode *const inode = m->inode;
- struct erofs_inode *const vi = EROFS_I(inode);
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
- const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) +
- vi->inode_isize + vi->xattr_isize, 8) +
- sizeof(struct z_erofs_map_header);
- const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
- unsigned int compacted_4b_initial, compacted_2b;
- unsigned int amortizedshift;
- erofs_off_t pos;
-
- if (lclusterbits != 12)
- return -EOPNOTSUPP;
-
- if (lcn >= totalidx)
- return -EINVAL;
-
- m->lcn = lcn;
- /* used to align to 32-byte (compacted_2b) alignment */
- compacted_4b_initial = (32 - ebase % 32) / 4;
- if (compacted_4b_initial == 32 / 4)
- compacted_4b_initial = 0;
-
- if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) &&
- compacted_4b_initial < totalidx)
- compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
- else
- compacted_2b = 0;
+ struct erofs_inode *vi = EROFS_I(m->inode);
+ int err;
- pos = ebase;
- if (lcn < compacted_4b_initial) {
- amortizedshift = 2;
- goto out;
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT) {
+ err = z_erofs_load_compact_lcluster(m, lcn, lookahead);
+ } else {
+ DBG_BUGON(vi->datalayout != EROFS_INODE_COMPRESSED_FULL);
+ err = z_erofs_load_full_lcluster(m, lcn);
}
- pos += compacted_4b_initial * 4;
- lcn -= compacted_4b_initial;
+ if (err)
+ return err;
- if (lcn < compacted_2b) {
- amortizedshift = 1;
- goto out;
+ if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
+ erofs_err(m->inode->i_sb, "unknown type %u @ lcn %u of nid %llu",
+ m->type, lcn, EROFS_I(m->inode)->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ } else if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD &&
+ m->clusterofs >= (1 << vi->z_lclusterbits)) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
- pos += compacted_2b * 2;
- lcn -= compacted_2b;
- amortizedshift = 2;
-out:
- pos += lcn * (1 << amortizedshift);
- m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(pos), EROFS_KMAP);
- if (IS_ERR(m->kaddr))
- return PTR_ERR(m->kaddr);
- return unpack_compacted_index(m, amortizedshift, pos, lookahead);
-}
-
-static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned int lcn, bool lookahead)
-{
- const unsigned int datamode = EROFS_I(m->inode)->datalayout;
-
- if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
- return legacy_load_cluster_from_disk(m, lcn);
-
- if (datamode == EROFS_INODE_FLAT_COMPRESSION)
- return compacted_load_cluster_from_disk(m, lcn, lookahead);
-
- return -EINVAL;
+ return 0;
}
static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
unsigned int lookback_distance)
{
+ struct super_block *sb = m->inode->i_sb;
struct erofs_inode *const vi = EROFS_I(m->inode);
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ const unsigned int lclusterbits = vi->z_lclusterbits;
while (m->lcn >= lookback_distance) {
unsigned long lcn = m->lcn - lookback_distance;
int err;
- /* load extent head logical cluster if needed */
- err = z_erofs_load_cluster_from_disk(m, lcn, false);
+ if (!lookback_distance)
+ break;
+
+ err = z_erofs_load_lcluster_from_disk(m, lcn, false);
if (err)
return err;
-
- switch (m->type) {
- case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
- if (!m->delta[0]) {
- erofs_err(m->inode->i_sb,
- "invalid lookback distance 0 @ nid %llu",
- vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
lookback_distance = m->delta[0];
continue;
- case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
- m->headtype = m->type;
- m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
- return 0;
- default:
- erofs_err(m->inode->i_sb,
- "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
+ m->headtype = m->type;
+ m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ return 0;
}
-
- erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu",
- vi->nid);
+ erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu",
+ lookback_distance, m->lcn, vi->nid);
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -486,30 +295,27 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
unsigned int initial_lcn)
{
- struct erofs_inode *const vi = EROFS_I(m->inode);
- struct erofs_map_blocks *const map = m->map;
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned long lcn;
+ struct inode *inode = m->inode;
+ struct super_block *sb = inode->i_sb;
+ struct erofs_inode *vi = EROFS_I(inode);
+ bool bigpcl1 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
+ bool bigpcl2 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2;
+ unsigned long lcn = m->lcn + 1;
int err;
- DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
- m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 &&
- m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2);
+ DBG_BUGON(m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
DBG_BUGON(m->type != m->headtype);
- if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
- ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) &&
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
- ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
- map->m_plen = 1ULL << lclusterbits;
- return 0;
- }
- lcn = m->lcn + 1;
+ if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) ||
+ ((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
+ m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) ||
+ (lcn << vi->z_lclusterbits) >= inode->i_size)
+ m->compressedblks = 1;
+
if (m->compressedblks)
goto out;
- err = z_erofs_load_cluster_from_disk(m, lcn, false);
+ err = z_erofs_load_lcluster_from_disk(m, lcn, false);
if (err)
return err;
@@ -522,40 +328,23 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
* BUG_ON in the debugging mode only for developers to notice that.
*/
DBG_BUGON(lcn == initial_lcn &&
- m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD);
+ m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
- switch (m->type) {
- case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
- /*
- * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
- * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
- */
- m->compressedblks = 1 << (lclusterbits - LOG_BLOCK_SIZE);
- break;
- case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
- if (m->delta[0] != 1)
- goto err_bonus_cblkcnt;
- if (m->compressedblks)
- break;
- fallthrough;
- default:
- erofs_err(m->inode->i_sb,
- "cannot found CBLKCNT @ lcn %lu of nid %llu",
- lcn, vi->nid);
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD && m->delta[0] != 1) {
+ erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
DBG_BUGON(1);
return -EFSCORRUPTED;
}
+
+ /*
+ * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type rather
+ * than CBLKCNT, it's a 1 block-sized pcluster.
+ */
+ if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD || !m->compressedblks)
+ m->compressedblks = 1;
out:
- map->m_plen = (u64)m->compressedblks << LOG_BLOCK_SIZE;
+ m->map->m_plen = erofs_pos(sb, m->compressedblks);
return 0;
-err_bonus_cblkcnt:
- erofs_err(m->inode->i_sb,
- "bogus CBLKCNT @ lcn %lu of nid %llu",
- lcn, vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
}
static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
@@ -563,115 +352,96 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
struct inode *inode = m->inode;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_map_blocks *map = m->map;
- unsigned int lclusterbits = vi->z_logical_clusterbits;
+ unsigned int lclusterbits = vi->z_lclusterbits;
u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
int err;
- do {
+ while (1) {
/* handle the last EOF pcluster (no next HEAD lcluster) */
if ((lcn << lclusterbits) >= inode->i_size) {
map->m_llen = inode->i_size - map->m_la;
return 0;
}
- err = z_erofs_load_cluster_from_disk(m, lcn, true);
+ err = z_erofs_load_lcluster_from_disk(m, lcn, true);
if (err)
return err;
- if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
- DBG_BUGON(!m->delta[1] &&
- m->clusterofs != 1 << lclusterbits);
- } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
- m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 ||
- m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
- /* go on until the next HEAD lcluster */
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ /* work around invalid d1 generated by pre-1.0 mkfs */
+ if (unlikely(!m->delta[1])) {
+ m->delta[1] = 1;
+ DBG_BUGON(1);
+ }
+ } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) {
if (lcn != headlcn)
- break;
+ break; /* ends at the next HEAD lcluster */
m->delta[1] = 1;
- } else {
- erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
lcn += m->delta[1];
- } while (m->delta[1]);
-
+ }
map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
return 0;
}
-static int z_erofs_do_map_blocks(struct inode *inode,
- struct erofs_map_blocks *map,
- int flags)
+static int z_erofs_map_blocks_fo(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
{
- struct erofs_inode *const vi = EROFS_I(inode);
- bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct super_block *sb = inode->i_sb;
bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+ bool ztailpacking = vi->z_idata_size;
+ unsigned int lclusterbits = vi->z_lclusterbits;
struct z_erofs_maprecorder m = {
.inode = inode,
.map = map,
+ .in_mbox = erofs_inode_in_metabox(inode),
};
- int err = 0;
- unsigned int lclusterbits, endoff;
+ unsigned int endoff;
unsigned long initial_lcn;
unsigned long long ofs, end;
+ int err;
- lclusterbits = vi->z_logical_clusterbits;
ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
+ if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) &&
+ !vi->z_tailextent_headlcn) {
+ map->m_la = 0;
+ map->m_llen = inode->i_size;
+ map->m_flags = EROFS_MAP_FRAGMENT;
+ return 0;
+ }
initial_lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
- err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false);
+ err = z_erofs_load_lcluster_from_disk(&m, initial_lcn, false);
if (err)
goto unmap_out;
- if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL))
- vi->z_idataoff = m.nextpackoff;
-
+ if ((flags & EROFS_GET_BLOCKS_FINDTAIL) && ztailpacking)
+ vi->z_fragmentoff = m.nextpackoff;
map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
- switch (m.type) {
- case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
- if (endoff >= m.clusterofs) {
- m.headtype = m.type;
- map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
- /*
- * For ztailpacking files, in order to inline data more
- * effectively, special EOF lclusters are now supported
- * which can have three parts at most.
- */
- if (ztailpacking && end > inode->i_size)
- end = inode->i_size;
- break;
- }
- /* m.lcn should be >= 1 if endoff < m.clusterofs */
- if (!m.lcn) {
- erofs_err(inode->i_sb,
- "invalid logical cluster 0 at nid %llu",
- vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
+ if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD && endoff >= m.clusterofs) {
+ m.headtype = m.type;
+ map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+ /*
+ * For ztailpacking files, in order to inline data more
+ * effectively, special EOF lclusters are now supported
+ * which can have three parts at most.
+ */
+ if (ztailpacking && end > inode->i_size)
+ end = inode->i_size;
+ } else {
+ if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ end = (m.lcn << lclusterbits) | m.clusterofs;
+ map->m_flags |= EROFS_MAP_FULL_MAPPED;
+ m.delta[0] = 1;
}
- end = (m.lcn << lclusterbits) | m.clusterofs;
- map->m_flags |= EROFS_MAP_FULL_MAPPED;
- m.delta[0] = 1;
- fallthrough;
- case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
/* get the corresponding first chunk */
err = z_erofs_extent_lookback(&m, m.delta[0]);
if (err)
goto unmap_out;
- break;
- default:
- erofs_err(inode->i_sb,
- "unknown type %u @ offset %llu of nid %llu",
- m.type, ofs, vi->nid);
- err = -EOPNOTSUPP;
- goto unmap_out;
}
if (m.partialref)
map->m_flags |= EROFS_MAP_PARTIAL_REF;
@@ -680,36 +450,39 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if (flags & EROFS_GET_BLOCKS_FINDTAIL) {
vi->z_tailextent_headlcn = m.lcn;
/* for non-compact indexes, fragmentoff is 64 bits */
- if (fragment &&
- vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
+ if (fragment && vi->datalayout == EROFS_INODE_COMPRESSED_FULL)
vi->z_fragmentoff |= (u64)m.pblk << 32;
}
if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
map->m_flags |= EROFS_MAP_META;
- map->m_pa = vi->z_idataoff;
+ map->m_pa = vi->z_fragmentoff;
map->m_plen = vi->z_idata_size;
+ if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
+ erofs_err(sb, "ztailpacking inline data across blocks @ nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
} else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
- map->m_flags |= EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
} else {
- map->m_pa = blknr_to_addr(m.pblk);
+ map->m_pa = erofs_pos(sb, m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
goto unmap_out;
}
- if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) {
+ if (m.headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN) {
if (map->m_llen > map->m_plen) {
DBG_BUGON(1);
err = -EFSCORRUPTED;
goto unmap_out;
}
if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
- map->m_algorithmformat =
- Z_EROFS_COMPRESSION_INTERLACED;
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED;
else
- map->m_algorithmformat =
- Z_EROFS_COMPRESSION_SHIFTED;
- } else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+ } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
map->m_algorithmformat = vi->z_algorithmtype[1];
} else {
map->m_algorithmformat = vi->z_algorithmtype[0];
@@ -717,8 +490,10 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
- map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
- map->m_llen >= EROFS_BLKSIZ)) {
+ (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA ||
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE ||
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_ZSTD) &&
+ map->m_llen >= i_blocksize(inode))) {
err = z_erofs_get_extent_decompressedlen(&m);
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
@@ -726,47 +501,267 @@ static int z_erofs_do_map_blocks(struct inode *inode,
unmap_out:
erofs_unmap_metabuf(&m.map->buf);
- erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
- __func__, map->m_la, map->m_pa,
- map->m_llen, map->m_plen, map->m_flags);
return err;
}
-int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
- int flags)
+static int z_erofs_map_blocks_ext(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ bool interlaced = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER;
+ unsigned int recsz = z_erofs_extent_recsize(vi->z_advise);
+ erofs_off_t pos = round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) +
+ vi->inode_isize + vi->xattr_isize), recsz);
+ bool in_mbox = erofs_inode_in_metabox(inode);
+ erofs_off_t lend = inode->i_size;
+ erofs_off_t l, r, mid, pa, la, lstart;
+ struct z_erofs_extent *ext;
+ unsigned int fmt;
+ bool last;
+
+ map->m_flags = 0;
+ if (recsz <= offsetof(struct z_erofs_extent, pstart_hi)) {
+ if (recsz <= offsetof(struct z_erofs_extent, pstart_lo)) {
+ ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+ pa = le64_to_cpu(*(__le64 *)ext);
+ pos += sizeof(__le64);
+ lstart = 0;
+ } else {
+ lstart = round_down(map->m_la, 1 << vi->z_lclusterbits);
+ pos += (lstart >> vi->z_lclusterbits) * recsz;
+ pa = EROFS_NULL_ADDR;
+ }
+
+ for (; lstart <= map->m_la; lstart += 1 << vi->z_lclusterbits) {
+ ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+ map->m_plen = le32_to_cpu(ext->plen);
+ if (pa != EROFS_NULL_ADDR) {
+ map->m_pa = pa;
+ pa += map->m_plen & Z_EROFS_EXTENT_PLEN_MASK;
+ } else {
+ map->m_pa = le32_to_cpu(ext->pstart_lo);
+ }
+ pos += recsz;
+ }
+ last = (lstart >= round_up(lend, 1 << vi->z_lclusterbits));
+ lend = min(lstart, lend);
+ lstart -= 1 << vi->z_lclusterbits;
+ } else {
+ lstart = lend;
+ for (l = 0, r = vi->z_extents; l < r; ) {
+ mid = l + (r - l) / 2;
+ ext = erofs_read_metabuf(&map->buf, sb,
+ pos + mid * recsz, in_mbox);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+
+ la = le32_to_cpu(ext->lstart_lo);
+ pa = le32_to_cpu(ext->pstart_lo) |
+ (u64)le32_to_cpu(ext->pstart_hi) << 32;
+ if (recsz > offsetof(struct z_erofs_extent, lstart_hi))
+ la |= (u64)le32_to_cpu(ext->lstart_hi) << 32;
+
+ if (la > map->m_la) {
+ r = mid;
+ if (la > lend) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ lend = la;
+ } else {
+ l = mid + 1;
+ if (map->m_la == la)
+ r = min(l + 1, r);
+ lstart = la;
+ map->m_plen = le32_to_cpu(ext->plen);
+ map->m_pa = pa;
+ }
+ }
+ last = (l >= vi->z_extents);
+ }
+
+ if (lstart < lend) {
+ map->m_la = lstart;
+ if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
+ map->m_flags = EROFS_MAP_FRAGMENT;
+ vi->z_fragmentoff = map->m_plen;
+ if (recsz > offsetof(struct z_erofs_extent, pstart_lo))
+ vi->z_fragmentoff |= map->m_pa << 32;
+ } else if (map->m_plen & Z_EROFS_EXTENT_PLEN_MASK) {
+ map->m_flags |= EROFS_MAP_MAPPED |
+ EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED;
+ fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT;
+ if (fmt)
+ map->m_algorithmformat = fmt - 1;
+ else if (interlaced && !erofs_blkoff(sb, map->m_pa))
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_SHIFTED;
+ if (map->m_plen & Z_EROFS_EXTENT_PLEN_PARTIAL)
+ map->m_flags |= EROFS_MAP_PARTIAL_REF;
+ map->m_plen &= Z_EROFS_EXTENT_PLEN_MASK;
+ }
+ }
+ map->m_llen = lend - map->m_la;
+ return 0;
+}
+
+static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
{
struct erofs_inode *const vi = EROFS_I(inode);
+ struct super_block *const sb = inode->i_sb;
+ struct z_erofs_map_header *h;
+ erofs_off_t pos;
int err = 0;
- trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
+ if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
+ /*
+ * paired with smp_mb() at the end of the function to ensure
+ * fields will only be observed after the bit is set.
+ */
+ smp_mb();
+ return 0;
+ }
+
+ if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
+ return -ERESTARTSYS;
- /* when trying to read beyond EOF, leave it unmapped */
- if (map->m_la >= inode->i_size) {
- map->m_llen = map->m_la + 1 - inode->i_size;
- map->m_la = inode->i_size;
- map->m_flags = 0;
- goto out;
+ if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
+ goto out_unlock;
+
+ pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
+ h = erofs_read_metabuf(&map->buf, sb, pos, erofs_inode_in_metabox(inode));
+ if (IS_ERR(h)) {
+ err = PTR_ERR(h);
+ goto out_unlock;
}
- err = z_erofs_fill_inode_lazy(inode);
- if (err)
- goto out;
+ /*
+ * if the highest bit of the 8-byte map header is set, the whole file
+ * is stored in the packed inode. The rest bits keeps z_fragmentoff.
+ */
+ if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) {
+ vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+ vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63);
+ vi->z_tailextent_headlcn = 0;
+ goto done;
+ }
+ vi->z_advise = le16_to_cpu(h->h_advise);
+ vi->z_lclusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 15);
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL &&
+ (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) {
+ vi->z_extents = le32_to_cpu(h->h_extents_lo) |
+ ((u64)le16_to_cpu(h->h_extents_hi) << 32);
+ goto done;
+ }
- if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
- !vi->z_tailextent_headlcn) {
- map->m_la = 0;
- map->m_llen = inode->i_size;
- map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED |
- EROFS_MAP_FRAGMENT;
- goto out;
+ vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
+ vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
+ if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)
+ vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+ else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER)
+ vi->z_idata_size = le16_to_cpu(h->h_idata_size);
+
+ if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+ vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+ Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+ erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+ erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto out_unlock;
}
- err = z_erofs_do_map_blocks(inode, map, flags);
-out:
- trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
+ if (vi->z_idata_size ||
+ (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
+ struct erofs_map_blocks tm = {
+ .buf = __EROFS_BUF_INITIALIZER
+ };
- /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */
- DBG_BUGON(err < 0 && err != -ENOMEM);
+ err = z_erofs_map_blocks_fo(inode, &tm,
+ EROFS_GET_BLOCKS_FINDTAIL);
+ erofs_put_metabuf(&tm.buf);
+ if (err < 0)
+ goto out_unlock;
+ }
+done:
+ /* paired with smp_mb() at the beginning of the function */
+ smp_mb();
+ set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
+out_unlock:
+ clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
+ return err;
+}
+
+static int z_erofs_map_sanity_check(struct inode *inode,
+ struct erofs_map_blocks *map)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ u64 pend;
+
+ if (!(map->m_flags & EROFS_MAP_ENCODED))
+ return 0;
+ if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) {
+ erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel",
+ map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid);
+ return -EOPNOTSUPP;
+ }
+ if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX &&
+ !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ map->m_algorithmformat, EROFS_I(inode)->nid);
+ return -EFSCORRUPTED;
+ }
+ if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
+ map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
+ return -EOPNOTSUPP;
+ /* Filesystems beyond 48-bit physical block addresses are invalid */
+ if (unlikely(check_add_overflow(map->m_pa, map->m_plen, &pend) ||
+ (pend >> sbi->blkszbits) >= BIT_ULL(48)))
+ return -EFSCORRUPTED;
+ return 0;
+}
+
+int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
+ int flags)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ int err = 0;
+
+ trace_erofs_map_blocks_enter(inode, map, flags);
+ if (map->m_la >= inode->i_size) { /* post-EOF unmapped extent */
+ map->m_llen = map->m_la + 1 - inode->i_size;
+ map->m_la = inode->i_size;
+ map->m_flags = 0;
+ } else {
+ err = z_erofs_fill_inode(inode, map);
+ if (!err) {
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL &&
+ (vi->z_advise & Z_EROFS_ADVISE_EXTENTS))
+ err = z_erofs_map_blocks_ext(inode, map, flags);
+ else
+ err = z_erofs_map_blocks_fo(inode, map, flags);
+ }
+ if (!err)
+ err = z_erofs_map_sanity_check(inode, map);
+ if (err)
+ map->m_llen = 0;
+ }
+ trace_erofs_map_blocks_exit(inode, map, flags, err);
return err;
}
@@ -787,7 +782,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
iomap->length = map.m_llen;
if (map.m_flags & EROFS_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+ iomap->addr = map.m_flags & __EROFS_MAP_FRAGMENT ?
IOMAP_NULL_ADDR : map.m_pa;
} else {
iomap->type = IOMAP_HOLE;
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
new file mode 100644
index 000000000000..55ff2ab5128e
--- /dev/null
+++ b/fs/erofs/zutil.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ * https://www.huawei.com/
+ * Copyright (C) 2024 Alibaba Cloud
+ */
+#include "internal.h"
+
+struct z_erofs_gbuf {
+ spinlock_t lock;
+ void *ptr;
+ struct page **pages;
+ unsigned int nrpages;
+};
+
+static struct z_erofs_gbuf *z_erofs_gbufpool, *z_erofs_rsvbuf;
+static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages,
+ z_erofs_rsv_nrpages;
+
+module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
+module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444);
+
+atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
+
+/* protects `erofs_sb_list_lock` and the mounted `erofs_sb_list` */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+static unsigned int shrinker_run_no;
+static struct shrinker *erofs_shrinker_info;
+
+static unsigned int z_erofs_gbuf_id(void)
+{
+ return raw_smp_processor_id() % z_erofs_gbuf_count;
+}
+
+void *z_erofs_get_gbuf(unsigned int requiredpages)
+ __acquires(gbuf->lock)
+{
+ struct z_erofs_gbuf *gbuf;
+
+ migrate_disable();
+ gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
+ spin_lock(&gbuf->lock);
+ /* check if the buffer is too small */
+ if (requiredpages > gbuf->nrpages) {
+ spin_unlock(&gbuf->lock);
+ migrate_enable();
+ /* (for sparse checker) pretend gbuf->lock is still taken */
+ __acquire(gbuf->lock);
+ return NULL;
+ }
+ return gbuf->ptr;
+}
+
+void z_erofs_put_gbuf(void *ptr) __releases(gbuf->lock)
+{
+ struct z_erofs_gbuf *gbuf;
+
+ gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
+ DBG_BUGON(gbuf->ptr != ptr);
+ spin_unlock(&gbuf->lock);
+ migrate_enable();
+}
+
+int z_erofs_gbuf_growsize(unsigned int nrpages)
+{
+ static DEFINE_MUTEX(gbuf_resize_mutex);
+ struct page **tmp_pages = NULL;
+ struct z_erofs_gbuf *gbuf;
+ void *ptr, *old_ptr;
+ int last, i, j;
+
+ mutex_lock(&gbuf_resize_mutex);
+ /* avoid shrinking gbufs, since no idea how many fses rely on */
+ if (nrpages <= z_erofs_gbuf_nrpages) {
+ mutex_unlock(&gbuf_resize_mutex);
+ return 0;
+ }
+
+ for (i = 0; i < z_erofs_gbuf_count; ++i) {
+ gbuf = &z_erofs_gbufpool[i];
+ tmp_pages = kcalloc(nrpages, sizeof(*tmp_pages), GFP_KERNEL);
+ if (!tmp_pages)
+ goto out;
+
+ for (j = 0; j < gbuf->nrpages; ++j)
+ tmp_pages[j] = gbuf->pages[j];
+ do {
+ last = j;
+ j = alloc_pages_bulk(GFP_KERNEL, nrpages,
+ tmp_pages);
+ if (last == j)
+ goto out;
+ } while (j != nrpages);
+
+ ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL);
+ if (!ptr)
+ goto out;
+
+ spin_lock(&gbuf->lock);
+ kfree(gbuf->pages);
+ gbuf->pages = tmp_pages;
+ old_ptr = gbuf->ptr;
+ gbuf->ptr = ptr;
+ gbuf->nrpages = nrpages;
+ spin_unlock(&gbuf->lock);
+ if (old_ptr)
+ vunmap(old_ptr);
+ }
+ z_erofs_gbuf_nrpages = nrpages;
+out:
+ if (i < z_erofs_gbuf_count && tmp_pages) {
+ for (j = 0; j < nrpages; ++j)
+ if (tmp_pages[j] && (j >= gbuf->nrpages ||
+ tmp_pages[j] != gbuf->pages[j]))
+ __free_page(tmp_pages[j]);
+ kfree(tmp_pages);
+ }
+ mutex_unlock(&gbuf_resize_mutex);
+ return i < z_erofs_gbuf_count ? -ENOMEM : 0;
+}
+
+int __init z_erofs_gbuf_init(void)
+{
+ unsigned int i, total = num_possible_cpus();
+
+ if (z_erofs_gbuf_count)
+ total = min(z_erofs_gbuf_count, total);
+ z_erofs_gbuf_count = total;
+
+ /* The last (special) global buffer is the reserved buffer */
+ total += !!z_erofs_rsv_nrpages;
+
+ z_erofs_gbufpool = kcalloc(total, sizeof(*z_erofs_gbufpool),
+ GFP_KERNEL);
+ if (!z_erofs_gbufpool)
+ return -ENOMEM;
+
+ if (z_erofs_rsv_nrpages) {
+ z_erofs_rsvbuf = &z_erofs_gbufpool[total - 1];
+ z_erofs_rsvbuf->pages = kcalloc(z_erofs_rsv_nrpages,
+ sizeof(*z_erofs_rsvbuf->pages), GFP_KERNEL);
+ if (!z_erofs_rsvbuf->pages) {
+ z_erofs_rsvbuf = NULL;
+ z_erofs_rsv_nrpages = 0;
+ }
+ }
+ for (i = 0; i < total; ++i)
+ spin_lock_init(&z_erofs_gbufpool[i].lock);
+ return 0;
+}
+
+void z_erofs_gbuf_exit(void)
+{
+ int i, j;
+
+ for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) {
+ struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i];
+
+ if (gbuf->ptr) {
+ vunmap(gbuf->ptr);
+ gbuf->ptr = NULL;
+ }
+
+ if (!gbuf->pages)
+ continue;
+
+ for (j = 0; j < gbuf->nrpages; ++j)
+ if (gbuf->pages[j])
+ put_page(gbuf->pages[j]);
+ kfree(gbuf->pages);
+ gbuf->pages = NULL;
+ }
+ kfree(z_erofs_gbufpool);
+}
+
+struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv)
+{
+ struct page *page = *pagepool;
+
+ if (page) {
+ *pagepool = (struct page *)page_private(page);
+ } else if (tryrsv && z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages) {
+ spin_lock(&z_erofs_rsvbuf->lock);
+ if (z_erofs_rsvbuf->nrpages)
+ page = z_erofs_rsvbuf->pages[--z_erofs_rsvbuf->nrpages];
+ spin_unlock(&z_erofs_rsvbuf->lock);
+ }
+ if (!page)
+ page = alloc_page(gfp);
+ DBG_BUGON(page && page_ref_count(page) != 1);
+ return page;
+}
+
+void erofs_release_pages(struct page **pagepool)
+{
+ while (*pagepool) {
+ struct page *page = *pagepool;
+
+ *pagepool = (struct page *)page_private(page);
+ /* try to fill reserved global pool first */
+ if (z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages <
+ z_erofs_rsv_nrpages) {
+ spin_lock(&z_erofs_rsvbuf->lock);
+ if (z_erofs_rsvbuf->nrpages < z_erofs_rsv_nrpages) {
+ z_erofs_rsvbuf->pages[z_erofs_rsvbuf->nrpages++]
+ = page;
+ spin_unlock(&z_erofs_rsvbuf->lock);
+ continue;
+ }
+ spin_unlock(&z_erofs_rsvbuf->lock);
+ }
+ put_page(page);
+ }
+}
+
+void erofs_shrinker_register(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ mutex_init(&sbi->umount_mutex);
+
+ spin_lock(&erofs_sb_list_lock);
+ list_add(&sbi->list, &erofs_sb_list);
+ spin_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_shrinker_unregister(struct super_block *sb)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+
+ mutex_lock(&sbi->umount_mutex);
+ while (!xa_empty(&sbi->managed_pslots)) {
+ z_erofs_shrink_scan(sbi, ~0UL);
+ cond_resched();
+ }
+ spin_lock(&erofs_sb_list_lock);
+ list_del(&sbi->list);
+ spin_unlock(&erofs_sb_list_lock);
+ mutex_unlock(&sbi->umount_mutex);
+}
+
+static unsigned long erofs_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ return atomic_long_read(&erofs_global_shrink_cnt) ?: SHRINK_EMPTY;
+}
+
+static unsigned long erofs_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct erofs_sb_info *sbi;
+ struct list_head *p;
+
+ unsigned long nr = sc->nr_to_scan;
+ unsigned int run_no;
+ unsigned long freed = 0;
+
+ spin_lock(&erofs_sb_list_lock);
+ do {
+ run_no = ++shrinker_run_no;
+ } while (run_no == 0);
+
+ /* Iterate over all mounted superblocks and try to shrink them */
+ p = erofs_sb_list.next;
+ while (p != &erofs_sb_list) {
+ sbi = list_entry(p, struct erofs_sb_info, list);
+
+ /*
+ * We move the ones we do to the end of the list, so we stop
+ * when we see one we have already done.
+ */
+ if (sbi->shrinker_run_no == run_no)
+ break;
+
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+
+ spin_unlock(&erofs_sb_list_lock);
+ sbi->shrinker_run_no = run_no;
+ freed += z_erofs_shrink_scan(sbi, nr - freed);
+ spin_lock(&erofs_sb_list_lock);
+ /* Get the next list element before we move this one */
+ p = p->next;
+
+ /*
+ * Move this one to the end of the list to provide some
+ * fairness.
+ */
+ list_move_tail(&sbi->list, &erofs_sb_list);
+ mutex_unlock(&sbi->umount_mutex);
+
+ if (freed >= nr)
+ break;
+ }
+ spin_unlock(&erofs_sb_list_lock);
+ return freed;
+}
+
+int __init erofs_init_shrinker(void)
+{
+ erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker");
+ if (!erofs_shrinker_info)
+ return -ENOMEM;
+
+ erofs_shrinker_info->count_objects = erofs_shrink_count;
+ erofs_shrinker_info->scan_objects = erofs_shrink_scan;
+ shrinker_register(erofs_shrinker_info);
+ return 0;
+}
+
+void erofs_exit_shrinker(void)
+{
+ shrinker_free(erofs_shrinker_info);
+}