diff options
Diffstat (limited to 'fs/erofs')
| -rw-r--r-- | fs/erofs/Kconfig | 124 | ||||
| -rw-r--r-- | fs/erofs/Makefile | 8 | ||||
| -rw-r--r-- | fs/erofs/compress.h | 103 | ||||
| -rw-r--r-- | fs/erofs/data.c | 444 | ||||
| -rw-r--r-- | fs/erofs/decompressor.c | 498 | ||||
| -rw-r--r-- | fs/erofs/decompressor_crypto.c | 182 | ||||
| -rw-r--r-- | fs/erofs/decompressor_deflate.c | 204 | ||||
| -rw-r--r-- | fs/erofs/decompressor_lzma.c | 170 | ||||
| -rw-r--r-- | fs/erofs/decompressor_zstd.c | 220 | ||||
| -rw-r--r-- | fs/erofs/dir.c | 99 | ||||
| -rw-r--r-- | fs/erofs/erofs_fs.h | 380 | ||||
| -rw-r--r-- | fs/erofs/fileio.c | 191 | ||||
| -rw-r--r-- | fs/erofs/fscache.c | 464 | ||||
| -rw-r--r-- | fs/erofs/inode.c | 449 | ||||
| -rw-r--r-- | fs/erofs/internal.h | 479 | ||||
| -rw-r--r-- | fs/erofs/namei.c | 71 | ||||
| -rw-r--r-- | fs/erofs/pcpubuf.c | 148 | ||||
| -rw-r--r-- | fs/erofs/super.c | 868 | ||||
| -rw-r--r-- | fs/erofs/sysfs.c | 126 | ||||
| -rw-r--r-- | fs/erofs/tagptr.h | 107 | ||||
| -rw-r--r-- | fs/erofs/utils.c | 292 | ||||
| -rw-r--r-- | fs/erofs/xattr.c | 760 | ||||
| -rw-r--r-- | fs/erofs/xattr.h | 54 | ||||
| -rw-r--r-- | fs/erofs/zdata.c | 1922 | ||||
| -rw-r--r-- | fs/erofs/zdata.h | 178 | ||||
| -rw-r--r-- | fs/erofs/zmap.c | 931 | ||||
| -rw-r--r-- | fs/erofs/zutil.c | 317 |
27 files changed, 5225 insertions, 4564 deletions
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 85490370e0ca..d81f3318417d 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -3,8 +3,18 @@ config EROFS_FS tristate "EROFS filesystem support" depends on BLOCK + select CACHEFILES if EROFS_FS_ONDEMAND + select CRC32 + select CRYPTO if EROFS_FS_ZIP_ACCEL + select CRYPTO_DEFLATE if EROFS_FS_ZIP_ACCEL select FS_IOMAP - select LIBCRC32C + select LZ4_DECOMPRESS if EROFS_FS_ZIP + select NETFS_SUPPORT if EROFS_FS_ONDEMAND + select XXHASH if EROFS_FS_XATTR + select XZ_DEC if EROFS_FS_ZIP_LZMA + select XZ_DEC_MICROLZMA if EROFS_FS_ZIP_LZMA + select ZLIB_INFLATE if EROFS_FS_ZIP_DEFLATE + select ZSTD_DECOMPRESS if EROFS_FS_ZIP_ZSTD help EROFS (Enhanced Read-Only File System) is a lightweight read-only file system with modern designs (e.g. no buffer heads, inline @@ -13,15 +23,15 @@ config EROFS_FS smartphones with Android OS, LiveCDs and high-density hosts with numerous containers; - It also provides fixed-sized output compression support in order to - improve storage density as well as keep relatively higher compression - ratios and implements in-place decompression to reuse the file page - for compressed data temporarily with proper strategies, which is - quite useful to ensure guaranteed end-to-end runtime decompression - performance under extremely memory pressure without extra cost. + It also provides transparent compression and deduplication support to + improve storage density and maintain relatively high compression + ratios, and it implements in-place decompression to temporarily reuse + page cache for compressed data using proper strategies, which is + quite useful for ensuring guaranteed end-to-end runtime decompression + performance under extreme memory pressure without extra cost. See the documentation at <file:Documentation/filesystems/erofs.rst> - for more details. + and the web pages at <https://erofs.docs.kernel.org> for more details. If unsure, say N. @@ -73,38 +83,114 @@ config EROFS_FS_SECURITY If you are not using a security module, say N. +config EROFS_FS_BACKED_BY_FILE + bool "File-backed EROFS filesystem support" + depends on EROFS_FS + default y + help + This allows EROFS to use filesystem image files directly, without + the intercession of loopback block devices or likewise. It is + particularly useful for container images with numerous blobs and + other sandboxes, where loop devices behave intricately. It can also + be used to simplify error-prone lifetime management of unnecessary + virtual block devices. + + Note that this feature, along with ongoing fanotify pre-content + hooks, will eventually replace "EROFS over fscache." + + If you don't want to enable this feature, say N. + config EROFS_FS_ZIP bool "EROFS Data Compression Support" depends on EROFS_FS - select LZ4_DECOMPRESS default y help - Enable fixed-sized output compression for EROFS. + Enable transparent compression support for EROFS file systems. If you don't want to enable compression feature, say N. config EROFS_FS_ZIP_LZMA bool "EROFS LZMA compressed data support" depends on EROFS_FS_ZIP - select XZ_DEC - select XZ_DEC_MICROLZMA help Saying Y here includes support for reading EROFS file systems - containing LZMA compressed data, specifically called microLZMA. it - gives better compression ratios than the LZ4 algorithm, at the + containing LZMA compressed data, specifically called microLZMA. It + gives better compression ratios than the default LZ4 format, at the expense of more CPU overhead. - LZMA support is an experimental feature for now and so most file - systems will be readable without selecting this option. + If unsure, say N. + +config EROFS_FS_ZIP_DEFLATE + bool "EROFS DEFLATE compressed data support" + depends on EROFS_FS_ZIP + help + Saying Y here includes support for reading EROFS file systems + containing DEFLATE compressed data. It gives better compression + ratios than the default LZ4 format, while it costs more CPU + overhead. + + DEFLATE support is an experimental feature for now and so most + file systems will be readable without selecting this option. + + If unsure, say N. + +config EROFS_FS_ZIP_ZSTD + bool "EROFS Zstandard compressed data support" + depends on EROFS_FS_ZIP + help + Saying Y here includes support for reading EROFS file systems + containing Zstandard compressed data. It gives better compression + ratios than the default LZ4 format, while it costs more CPU + overhead. + + Zstandard support is an experimental feature for now and so most + file systems will be readable without selecting this option. + + If unsure, say N. + +config EROFS_FS_ZIP_ACCEL + bool "EROFS hardware decompression support" + depends on EROFS_FS_ZIP + help + Saying Y here includes hardware accelerator support for reading + EROFS file systems containing compressed data. It gives better + decompression speed than the software-implemented decompression, and + it costs lower CPU overhead. + + Hardware accelerator support is an experimental feature for now and + file systems are still readable without selecting this option. If unsure, say N. config EROFS_FS_ONDEMAND - bool "EROFS fscache-based on-demand read support" - depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) - default n + bool "EROFS fscache-based on-demand read support (deprecated)" + depends on EROFS_FS + select FSCACHE + select CACHEFILES_ONDEMAND help This permits EROFS to use fscache-backed data blobs with on-demand read support. + It is now deprecated and scheduled to be removed from the kernel + after fanotify pre-content hooks are landed. + + If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD + bool "EROFS per-cpu decompression kthread workers" + depends on EROFS_FS_ZIP + help + Saying Y here enables per-CPU kthread workers pool to carry out + async decompression for low latencies on some architectures. + + If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD_HIPRI + bool "EROFS high priority per-CPU kthread workers" + depends on EROFS_FS_ZIP && EROFS_FS_PCPU_KTHREAD + default y + help + This permits EROFS to configure per-CPU kthread workers to run + at higher priority. + If unsure, say N. diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 99bbc597a3e9..549abc424763 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -1,8 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_EROFS_FS) += erofs.o -erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o +erofs-objs := super.o inode.o data.o namei.o dir.o sysfs.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o -erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o +erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o +erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o +erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o +erofs-$(CONFIG_EROFS_FS_ZIP_ACCEL) += decompressor_crypto.o +erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 26fa170090b8..84c8e52581f4 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -11,57 +11,37 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; - + unsigned int inpages, outpages; unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; - /* indicate the algorithm will be used for decompression */ - unsigned int alg; + unsigned int alg; /* the algorithm for decompression */ bool inplace_io, partial_decoding, fillgaps; + gfp_t gfp; /* allocation flags for extra temporary buffers */ }; struct z_erofs_decompressor { - int (*decompress)(struct z_erofs_decompress_req *rq, - struct page **pagepool); + int (*config)(struct super_block *sb, struct erofs_super_block *dsb, + void *data, int size); + const char *(*decompress)(struct z_erofs_decompress_req *rq, + struct page **pagepool); + int (*init)(void); + void (*exit)(void); char *name; }; -/* some special page->private (unsigned long, see below) */ #define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2) -#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2) - -/* - * For all pages in a pcluster, page->private should be one of - * Type Last 2bits page->private - * short-lived page 00 Z_EROFS_SHORTLIVED_PAGE - * preallocated page (tryalloc) 00 Z_EROFS_PREALLOCATED_PAGE - * cached/managed page 00 pointer to z_erofs_pcluster - * online page (file-backed, 01/10/11 sub-index << 2 | count - * some pages can be used for inplace I/O) - * - * page->mapping should be one of - * Type page->mapping - * short-lived page NULL - * preallocated page NULL - * cached/managed page non-NULL or NULL (invalidated/truncated page) - * online page non-NULL - * - * For all managed pages, PG_private should be set with 1 extra refcount, - * which is used for page reclaim / migration. - */ +#define Z_EROFS_PREALLOCATED_FOLIO ((void *)(-2UL << 2)) /* - * short-lived pages are pages directly from buddy system with specific - * page->private (no need to set PagePrivate since these are non-LRU / - * non-movable pages and bypass reclaim / migration code). + * Currently, short-lived pages are pages directly from buddy system + * with specific page->private (Z_EROFS_SHORTLIVED_PAGE). + * In the future world of Memdescs, it should be type 0 (Misc) memory + * which type can be checked with a new helper. */ static inline bool z_erofs_is_shortlived_page(struct page *page) { - if (page->private != Z_EROFS_SHORTLIVED_PAGE) - return false; - - DBG_BUGON(page->mapping); - return true; + return page->private == Z_EROFS_SHORTLIVED_PAGE; } static inline bool z_erofs_put_shortlivedpage(struct page **pagepool, @@ -69,30 +49,41 @@ static inline bool z_erofs_put_shortlivedpage(struct page **pagepool, { if (!z_erofs_is_shortlived_page(page)) return false; - - /* short-lived pages should not be used by others at the same time */ - if (page_ref_count(page) > 1) { - put_page(page); - } else { - /* follow the pcluster rule above. */ - erofs_pagepool_add(pagepool, page); - } + erofs_pagepool_add(pagepool, page); return true; } -#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) -static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, - struct page *page) -{ - return page->mapping == MNGD_MAPPING(sbi); -} +extern const struct z_erofs_decompressor z_erofs_lzma_decomp; +extern const struct z_erofs_decompressor z_erofs_deflate_decomp; +extern const struct z_erofs_decompressor z_erofs_zstd_decomp; +extern const struct z_erofs_decompressor *z_erofs_decomp[]; -int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, - unsigned int padbufsize); -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool); +struct z_erofs_stream_dctx { + struct z_erofs_decompress_req *rq; + int no, ni; /* the current {en,de}coded page # */ -/* prototypes for specific algorithms */ -int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool); + unsigned int avail_out; /* remaining bytes in the decoded buffer */ + unsigned int inbuf_pos, inbuf_sz; + /* current status of the encoded buffer */ + u8 *kin, *kout; /* buffer mapped pointers */ + void *bounce; /* bounce buffer for inplace I/Os */ + bool bounced; /* is the bounce buffer used now? */ +}; + +const char *z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, + void **dst, void **src, struct page **pgpl); +const char *z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, + const char *padbuf, unsigned int padbufsize); +int __init z_erofs_init_decompressor(void); +void z_erofs_exit_decompressor(void); +int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl); +int z_erofs_crypto_enable_engine(const char *name, int len); +#ifdef CONFIG_EROFS_FS_ZIP_ACCEL +void z_erofs_crypto_disable_all_engines(void); +int z_erofs_crypto_show_engines(char *buf, int size, char sep); +#else +static inline void z_erofs_crypto_disable_all_engines(void) {} +static inline int z_erofs_crypto_show_engines(char *buf, int size, char sep) { return 0; } +#endif #endif diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f57f921683d7..bb13c4cb8455 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -5,17 +5,15 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "internal.h" -#include <linux/prefetch.h> #include <linux/sched/mm.h> -#include <linux/dax.h> #include <trace/events/erofs.h> void erofs_unmap_metabuf(struct erofs_buf *buf) { - if (buf->kmap_type == EROFS_KMAP) - kunmap_local(buf->base); + if (!buf->base) + return; + kunmap_local(buf->base); buf->base = NULL; - buf->kmap_type = EROFS_NO_KMAP; } void erofs_put_metabuf(struct erofs_buf *buf) @@ -23,124 +21,103 @@ void erofs_put_metabuf(struct erofs_buf *buf) if (!buf->page) return; erofs_unmap_metabuf(buf); - put_page(buf->page); + folio_put(page_folio(buf->page)); buf->page = NULL; } -void *erofs_bread(struct erofs_buf *buf, struct inode *inode, - erofs_blk_t blkaddr, enum erofs_kmap_type type) +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap) { - struct address_space *const mapping = inode->i_mapping; - erofs_off_t offset = blknr_to_addr(blkaddr); - pgoff_t index = offset >> PAGE_SHIFT; - struct page *page = buf->page; - struct folio *folio; - unsigned int nofs_flag; - - if (!page || page->index != index) { - erofs_put_metabuf(buf); + pgoff_t index = (buf->off + offset) >> PAGE_SHIFT; + struct folio *folio = NULL; - nofs_flag = memalloc_nofs_save(); - folio = read_cache_folio(mapping, index, NULL, NULL); - memalloc_nofs_restore(nofs_flag); + if (buf->page) { + folio = page_folio(buf->page); + if (folio_file_page(folio, index) != buf->page) + erofs_unmap_metabuf(buf); + } + if (!folio || !folio_contains(folio, index)) { + erofs_put_metabuf(buf); + folio = read_mapping_folio(buf->mapping, index, buf->file); if (IS_ERR(folio)) return folio; - - /* should already be PageUptodate, no need to lock page */ - page = folio_file_page(folio, index); - buf->page = page; - } - if (buf->kmap_type == EROFS_NO_KMAP) { - if (type == EROFS_KMAP) - buf->base = kmap_local_page(page); - buf->kmap_type = type; - } else if (buf->kmap_type != type) { - DBG_BUGON(1); - return ERR_PTR(-EFAULT); } - if (type == EROFS_NO_KMAP) + buf->page = folio_file_page(folio, index); + if (!need_kmap) return NULL; + if (!buf->base) + buf->base = kmap_local_page(buf->page); return buf->base + (offset & ~PAGE_MASK); } -void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_blk_t blkaddr, enum erofs_kmap_type type) +int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb, + bool in_metabox) { - if (erofs_is_fscache_mode(sb)) - return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode, - blkaddr, type); + struct erofs_sb_info *sbi = EROFS_SB(sb); - return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type); + buf->file = NULL; + if (in_metabox) { + if (unlikely(!sbi->metabox_inode)) + return -EFSCORRUPTED; + buf->mapping = sbi->metabox_inode->i_mapping; + return 0; + } + buf->off = sbi->dif0.fsoff; + if (erofs_is_fileio_mode(sbi)) { + buf->file = sbi->dif0.file; /* some fs like FUSE needs it */ + buf->mapping = buf->file->f_mapping; + } else if (erofs_is_fscache_mode(sb)) + buf->mapping = sbi->dif0.fscache->inode->i_mapping; + else + buf->mapping = sb->s_bdev->bd_mapping; + return 0; } -static int erofs_map_blocks_flatmode(struct inode *inode, - struct erofs_map_blocks *map, - int flags) +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_off_t offset, bool in_metabox) { - erofs_blk_t nblocks, lastblk; - u64 offset = map->m_la; - struct erofs_inode *vi = EROFS_I(inode); - bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); - - nblocks = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); - lastblk = nblocks - tailendpacking; - - /* there is no hole in flatmode */ - map->m_flags = EROFS_MAP_MAPPED; - if (offset < blknr_to_addr(lastblk)) { - map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; - map->m_plen = blknr_to_addr(lastblk) - offset; - } else if (tailendpacking) { - /* 2 - inode inline B: inode, [xattrs], inline last blk... */ - struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - - map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(map->m_la); - map->m_plen = inode->i_size - offset; + int err; - /* inline data should be located in the same meta block */ - if (erofs_blkoff(map->m_pa) + map->m_plen > EROFS_BLKSIZ) { - erofs_err(inode->i_sb, - "inline data cross block boundary @ nid %llu", - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } - map->m_flags |= EROFS_MAP_META; - } else { - erofs_err(inode->i_sb, - "internal error @ nid: %llu (size %llu), m_la 0x%llx", - vi->nid, inode->i_size, map->m_la); - DBG_BUGON(1); - return -EIO; - } - return 0; + err = erofs_init_metabuf(buf, sb, in_metabox); + if (err) + return ERR_PTR(err); + return erofs_bread(buf, offset, true); } -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags) +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = inode->i_sb; + unsigned int unit, blksz = sb->s_blocksize; struct erofs_inode *vi = EROFS_I(inode); struct erofs_inode_chunk_index *idx; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u64 chunknr; - unsigned int unit; + erofs_blk_t startblk, addrmask; + bool tailpacking; erofs_off_t pos; - void *kaddr; + u64 chunknr; int err = 0; - trace_erofs_map_blocks_enter(inode, map, flags); + trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; - if (map->m_la >= inode->i_size) { - /* leave out-of-bound access unmapped */ - map->m_flags = 0; - map->m_plen = 0; + map->m_flags = 0; + if (map->m_la >= inode->i_size) goto out; - } if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { - err = erofs_map_blocks_flatmode(inode, map, flags); + tailpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); + if (!tailpacking && vi->startblk == EROFS_NULL_ADDR) + goto out; + pos = erofs_pos(sb, erofs_iblks(inode) - tailpacking); + + map->m_flags = EROFS_MAP_MAPPED; + if (map->m_la < pos) { + map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la; + map->m_llen = pos - map->m_la; + } else { + map->m_pa = erofs_iloc(inode) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(sb, map->m_la); + map->m_llen = inode->i_size - map->m_la; + map->m_flags |= EROFS_MAP_META; + } goto out; } @@ -150,64 +127,70 @@ int erofs_map_blocks(struct inode *inode, unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ chunknr = map->m_la >> vi->chunkbits; - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); - if (IS_ERR(kaddr)) { - err = PTR_ERR(kaddr); + idx = erofs_read_metabuf(&buf, sb, pos, erofs_inode_in_metabox(inode)); + if (IS_ERR(idx)) { + err = PTR_ERR(idx); goto out; } map->m_la = chunknr << vi->chunkbits; - map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, - roundup(inode->i_size - map->m_la, EROFS_BLKSIZ)); - - /* handle block map */ - if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { - __le32 *blkaddr = kaddr + erofs_blkoff(pos); - - if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { - map->m_flags = 0; - } else { - map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr)); + map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits, + round_up(inode->i_size - map->m_la, blksz)); + if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) { + addrmask = (vi->chunkformat & EROFS_CHUNK_FORMAT_48BIT) ? + BIT_ULL(48) - 1 : BIT_ULL(32) - 1; + startblk = (((u64)le16_to_cpu(idx->startblk_hi) << 32) | + le32_to_cpu(idx->startblk_lo)) & addrmask; + if ((startblk ^ EROFS_NULL_ADDR) & addrmask) { + map->m_deviceid = le16_to_cpu(idx->device_id) & + EROFS_SB(sb)->device_id_mask; + map->m_pa = erofs_pos(sb, startblk); + map->m_flags = EROFS_MAP_MAPPED; + } + } else { + startblk = le32_to_cpu(*(__le32 *)idx); + if (startblk != (u32)EROFS_NULL_ADDR) { + map->m_pa = erofs_pos(sb, startblk); map->m_flags = EROFS_MAP_MAPPED; } - goto out_unlock; - } - /* parse chunk indexes */ - idx = kaddr + erofs_blkoff(pos); - switch (le32_to_cpu(idx->blkaddr)) { - case EROFS_NULL_ADDR: - map->m_flags = 0; - break; - default: - map->m_deviceid = le16_to_cpu(idx->device_id) & - EROFS_SB(sb)->device_id_mask; - map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); - map->m_flags = EROFS_MAP_MAPPED; - break; } -out_unlock: erofs_put_metabuf(&buf); out: - if (!err) - map->m_llen = map->m_plen; - trace_erofs_map_blocks_exit(inode, map, flags, 0); + if (!err) { + map->m_plen = map->m_llen; + /* inline data should be located in the same meta block */ + if ((map->m_flags & EROFS_MAP_META) && + erofs_blkoff(sb, map->m_pa) + map->m_plen > blksz) { + erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + } + trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } +static void erofs_fill_from_devinfo(struct erofs_map_dev *map, + struct super_block *sb, struct erofs_device_info *dif) +{ + map->m_sb = sb; + map->m_dif = dif; + map->m_bdev = NULL; + if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) + map->m_bdev = file_bdev(dif->file); +} + int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; + erofs_off_t startoff; int id; - /* primary device by default */ - map->m_bdev = sb->s_bdev; - map->m_daxdev = EROFS_SB(sb)->dax_dev; - map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; - map->m_fscache = EROFS_SB(sb)->s_fscache; - + erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); + map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); dif = idr_find(&devs->tree, map->m_deviceid - 1); @@ -215,28 +198,24 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return -ENODEV; } - map->m_bdev = dif->bdev; - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + if (devs->flatdev) { + map->m_pa += erofs_pos(sb, dif->uniaddr); + up_read(&devs->rwsem); + return 0; + } + erofs_fill_from_devinfo(map, sb, dif); up_read(&devs->rwsem); - } else if (devs->extra_devices) { + } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - erofs_off_t startoff, length; - - if (!dif->mapped_blkaddr) + if (!dif->uniaddr) continue; - startoff = blknr_to_addr(dif->mapped_blkaddr); - length = blknr_to_addr(dif->blocks); + startoff = erofs_pos(sb, dif->uniaddr); if (map->m_pa >= startoff && - map->m_pa < startoff + length) { + map->m_pa < startoff + erofs_pos(sb, dif->blocks)) { map->m_pa -= startoff; - map->m_bdev = dif->bdev; - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, sb, dif); break; } } @@ -245,61 +224,103 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) return 0; } +/* + * bit 30: I/O error occurred on this folio + * bit 29: CPU has dirty data in D-cache (needs aliasing handling); + * bit 0 - 29: remaining parts to complete this folio + */ +#define EROFS_ONLINEFOLIO_EIO 30 +#define EROFS_ONLINEFOLIO_DIRTY 29 + +void erofs_onlinefolio_init(struct folio *folio) +{ + union { + atomic_t o; + void *v; + } u = { .o = ATOMIC_INIT(1) }; + + folio->private = u.v; /* valid only if file-backed folio is locked */ +} + +void erofs_onlinefolio_split(struct folio *folio) +{ + atomic_inc((atomic_t *)&folio->private); +} + +void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty) +{ + int orig, v; + + do { + orig = atomic_read((atomic_t *)&folio->private); + DBG_BUGON(orig <= 0); + v = dirty << EROFS_ONLINEFOLIO_DIRTY; + v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO); + } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); + + if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1)) + return; + folio->private = 0; + if (v & BIT(EROFS_ONLINEFOLIO_DIRTY)) + flush_dcache_folio(folio); + folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO))); +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; + struct super_block *sb = inode->i_sb; struct erofs_map_blocks map; struct erofs_map_dev mdev; map.m_la = offset; map.m_llen = length; - - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + ret = erofs_map_blocks(inode, &map); if (ret < 0) return ret; - mdev = (struct erofs_map_dev) { - .m_deviceid = map.m_deviceid, - .m_pa = map.m_pa, - }; - ret = erofs_map_dev(inode->i_sb, &mdev); - if (ret) - return ret; - iomap->offset = map.m_la; - if (flags & IOMAP_DAX) - iomap->dax_dev = mdev.m_daxdev; - else - iomap->bdev = mdev.m_bdev; iomap->length = map.m_llen; iomap->flags = 0; iomap->private = NULL; - + iomap->addr = IOMAP_NULL_ADDR; if (!(map.m_flags & EROFS_MAP_MAPPED)) { iomap->type = IOMAP_HOLE; - iomap->addr = IOMAP_NULL_ADDR; - if (!iomap->length) - iomap->length = length; return 0; } + if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(inode)) { + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + ret = erofs_map_dev(sb, &mdev); + if (ret) + return ret; + + if (flags & IOMAP_DAX) + iomap->dax_dev = mdev.m_dif->dax_dev; + else + iomap->bdev = mdev.m_bdev; + iomap->addr = mdev.m_dif->fsoff + mdev.m_pa; + if (flags & IOMAP_DAX) + iomap->addr += mdev.m_dif->dax_part_off; + } + if (map.m_flags & EROFS_MAP_META) { void *ptr; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; - ptr = erofs_read_metabuf(&buf, inode->i_sb, - erofs_blknr(mdev.m_pa), EROFS_KMAP); + ptr = erofs_read_metabuf(&buf, sb, map.m_pa, + erofs_inode_in_metabox(inode)); if (IS_ERR(ptr)) return PTR_ERR(ptr); - iomap->inline_data = ptr + erofs_blkoff(mdev.m_pa); + iomap->inline_data = ptr; iomap->private = buf.base; } else { iomap->type = IOMAP_MAPPED; - iomap->addr = mdev.m_pa; - if (flags & IOMAP_DAX) - iomap->addr += mdev.m_dax_part_off; } return 0; } @@ -313,7 +334,6 @@ static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length, struct erofs_buf buf = { .page = kmap_to_page(ptr), .base = ptr, - .kmap_type = EROFS_KMAP, }; DBG_BUGON(iomap->type != IOMAP_INLINE); @@ -349,12 +369,18 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ static int erofs_read_folio(struct file *file, struct folio *folio) { - return iomap_read_folio(folio, &erofs_iomap_ops); + trace_erofs_read_folio(folio, true); + + iomap_bio_read_folio(folio, &erofs_iomap_ops); + return 0; } static void erofs_readahead(struct readahead_control *rac) { - return iomap_readahead(rac, &erofs_iomap_ops); + trace_erofs_readahead(rac->mapping->host, readahead_index(rac), + readahead_count(rac), true); + + iomap_bio_readahead(rac, &erofs_iomap_ops); } static sector_t erofs_bmap(struct address_space *mapping, sector_t block) @@ -374,27 +400,14 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (IS_DAX(inode)) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); #endif - if (iocb->ki_flags & IOCB_DIRECT) { - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int blksize_mask; - - if (bdev) - blksize_mask = bdev_logical_block_size(bdev) - 1; - else - blksize_mask = (1 << inode->i_blkbits) - 1; - - if ((iocb->ki_pos | iov_iter_count(to) | - iov_iter_alignment(to)) & blksize_mask) - return -EINVAL; - + if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev) return iomap_dio_rw(iocb, to, &erofs_iomap_ops, NULL, 0, NULL, 0); - } return filemap_read(iocb, to, 0); } /* for uncompressed (aligned) files and raw access for other files */ -const struct address_space_operations erofs_raw_access_aops = { +const struct address_space_operations erofs_aops = { .read_folio = erofs_read_folio, .readahead = erofs_readahead, .bmap = erofs_bmap, @@ -405,14 +418,14 @@ const struct address_space_operations erofs_raw_access_aops = { #ifdef CONFIG_FS_DAX static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { - return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops); + return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops); } static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) { - return erofs_dax_huge_fault(vmf, PE_SIZE_PTE); + return erofs_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct erofs_dax_vm_ops = { @@ -420,25 +433,54 @@ static const struct vm_operations_struct erofs_dax_vm_ops = { .huge_fault = erofs_dax_huge_fault, }; -static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int erofs_file_mmap_prepare(struct vm_area_desc *desc) { - if (!IS_DAX(file_inode(file))) - return generic_file_readonly_mmap(file, vma); + if (!IS_DAX(file_inode(desc->file))) + return generic_file_readonly_mmap_prepare(desc); - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + if ((desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE)) return -EINVAL; - vma->vm_ops = &erofs_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + desc->vm_ops = &erofs_dax_vm_ops; + desc->vm_flags |= VM_HUGEPAGE; return 0; } #else -#define erofs_file_mmap generic_file_readonly_mmap +#define erofs_file_mmap_prepare generic_file_readonly_mmap_prepare +#endif + +static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + const struct iomap_ops *ops = &erofs_iomap_ops; + + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) +#ifdef CONFIG_EROFS_FS_ZIP + ops = &z_erofs_iomap_report_ops; +#else + return generic_file_llseek(file, offset, whence); #endif + if (whence == SEEK_HOLE) + offset = iomap_seek_hole(inode, offset, ops); + else if (whence == SEEK_DATA) + offset = iomap_seek_data(inode, offset, ops); + else + return generic_file_llseek(file, offset, whence); + + if (offset < 0) + return offset; + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); +} + const struct file_operations erofs_file_fops = { - .llseek = generic_file_llseek, + .llseek = erofs_file_llseek, .read_iter = erofs_file_read_iter, - .mmap = erofs_file_mmap, - .splice_read = generic_file_splice_read, + .unlocked_ioctl = erofs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = erofs_compat_ioctl, +#endif + .mmap_prepare = erofs_file_mmap_prepare, + .get_unmapped_area = thp_get_unmapped_area, + .splice_read = filemap_splice_read, }; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 51b7ac7166d9..d5d090276391 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -2,33 +2,18 @@ /* * Copyright (C) 2019 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2024 Alibaba Cloud */ #include "compress.h" -#include <linux/module.h> #include <linux/lz4.h> -#ifndef LZ4_DISTANCE_MAX /* history window size */ -#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ -#endif - #define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) -#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN -#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) -#endif - -struct z_erofs_lz4_decompress_ctx { - struct z_erofs_decompress_req *rq; - /* # of encoded, decoded pages */ - unsigned int inpages, outpages; - /* decoded block total length (used for in-place decompression) */ - unsigned int oend; -}; -int z_erofs_load_lz4_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lz4_cfgs *lz4, int size) +static int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size) { struct erofs_sb_info *sbi = EROFS_SB(sb); + struct z_erofs_lz4_cfgs *lz4 = data; u16 distance; if (lz4) { @@ -42,7 +27,7 @@ int z_erofs_load_lz4_config(struct super_block *sb, if (!sbi->lz4.max_pclusterblks) { sbi->lz4.max_pclusterblks = 1; /* reserved case */ } else if (sbi->lz4.max_pclusterblks > - Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) { + erofs_blknr(sb, Z_EROFS_PCLUSTER_MAX_SIZE)) { erofs_err(sb, "too large lz4 pclusterblks %u", sbi->lz4.max_pclusterblks); return -EINVAL; @@ -55,17 +40,16 @@ int z_erofs_load_lz4_config(struct super_block *sb, sbi->lz4.max_distance_pages = distance ? DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : LZ4_MAX_DISTANCE_PAGES; - return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); + return z_erofs_gbuf_growsize(sbi->lz4.max_pclusterblks); } /* * Fill all gaps with bounce pages if it's a sparse page list. Also check if * all physical pages are consecutive, which can be seen for moderate CR. */ -static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, struct page **pagepool) { - struct z_erofs_decompress_req *rq = ctx->rq; struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, BITS_PER_LONG)] = { 0 }; @@ -75,7 +59,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, unsigned int i, j, top; top = 0; - for (i = j = 0; i < ctx->outpages; ++i, ++j) { + for (i = j = 0; i < rq->outpages; ++i, ++j) { struct page *const page = rq->out[i]; struct page *victim; @@ -110,10 +94,10 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, if (top) { victim = availables[--top]; - get_page(victim); } else { - victim = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + victim = __erofs_allocpage(pagepool, rq->gfp, true); + if (!victim) + return -ENOMEM; set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; @@ -121,65 +105,72 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, return kaddr ? 1 : 0; } -static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, - void *inpage, unsigned int *inputmargin, int *maptype, - bool may_inplace) +static void *z_erofs_lz4_handle_overlap(const struct z_erofs_decompress_req *rq, + void *inpage, void *out, unsigned int *inputmargin, + int *maptype, bool may_inplace) { - struct z_erofs_decompress_req *rq = ctx->rq; - unsigned int omargin, total, i, j; + unsigned int oend, omargin, cnt, i; struct page **in; - void *src, *tmp; - - if (rq->inplace_io) { - omargin = PAGE_ALIGN(ctx->oend) - ctx->oend; - if (rq->partial_decoding || !may_inplace || - omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) - goto docopy; - - for (i = 0; i < ctx->inpages; ++i) { - DBG_BUGON(rq->in[i] == NULL); - for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j) - if (rq->out[j] == rq->in[i]) - goto docopy; + void *src; + + /* + * If in-place I/O isn't used, for example, the bounce compressed cache + * can hold data for incomplete read requests. Just map the compressed + * buffer as well and decompress directly. + */ + if (!rq->inplace_io) { + if (rq->inpages <= 1) { + *maptype = 0; + return inpage; } + kunmap_local(inpage); + src = erofs_vm_map_ram(rq->in, rq->inpages); + if (!src) + return ERR_PTR(-ENOMEM); + *maptype = 1; + return src; } - - if (ctx->inpages <= 1) { - *maptype = 0; - return inpage; + /* + * Then, deal with in-place I/Os. The reasons why in-place I/O is useful + * are: (1) It minimizes memory footprint during the I/O submission, + * which is useful for slow storage (including network devices and + * low-end HDDs/eMMCs) but with a lot inflight I/Os; (2) If in-place + * decompression can also be applied, it will reuse the unique buffer so + * that no extra CPU D-cache is polluted with temporary compressed data + * for extreme performance. + */ + oend = rq->pageofs_out + rq->outputsize; + omargin = PAGE_ALIGN(oend) - oend; + if (!rq->partial_decoding && may_inplace && + omargin >= LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) { + for (i = 0; i < rq->inpages; ++i) + if (rq->out[rq->outpages - rq->inpages + i] != + rq->in[i]) + break; + if (i >= rq->inpages) { + kunmap_local(inpage); + *maptype = 3; + return out + ((rq->outpages - rq->inpages) << PAGE_SHIFT); + } } - kunmap_atomic(inpage); - might_sleep(); - src = erofs_vm_map_ram(rq->in, ctx->inpages); - if (!src) - return ERR_PTR(-ENOMEM); - *maptype = 1; - return src; - -docopy: - /* Or copy compressed data which can be overlapped to per-CPU buffer */ - in = rq->in; - src = erofs_get_pcpubuf(ctx->inpages); + /* + * If in-place decompression can't be applied, copy compressed data that + * may potentially overlap during decompression to a per-CPU buffer. + */ + src = z_erofs_get_gbuf(rq->inpages); if (!src) { DBG_BUGON(1); - kunmap_atomic(inpage); + kunmap_local(inpage); return ERR_PTR(-EFAULT); } - tmp = src; - total = rq->inputsize; - while (total) { - unsigned int page_copycnt = - min_t(unsigned int, total, PAGE_SIZE - *inputmargin); - + for (i = 0, in = rq->in; i < rq->inputsize; i += cnt, ++in) { + cnt = min_t(u32, rq->inputsize - i, PAGE_SIZE - *inputmargin); if (!inpage) - inpage = kmap_atomic(*in); - memcpy(tmp, inpage + *inputmargin, page_copycnt); - kunmap_atomic(inpage); + inpage = kmap_local_page(*in); + memcpy(src + i, inpage + *inputmargin, cnt); + kunmap_local(inpage); inpage = NULL; - tmp += page_copycnt; - total -= page_copycnt; - ++in; *inputmargin = 0; } *maptype = 2; @@ -187,55 +178,55 @@ docopy: } /* - * Get the exact inputsize with zero_padding feature. - * - For LZ4, it should work if zero_padding feature is on (5.3+); - * - For MicroLZMA, it'd be enabled all the time. + * Get the exact on-disk size of the compressed data: + * - For LZ4, it should apply if the zero_padding feature is on (5.3+); + * - For others, zero_padding is enabled all the time. */ -int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, - unsigned int padbufsize) +const char *z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, + const char *padbuf, unsigned int padbufsize) { const char *padend; padend = memchr_inv(padbuf, 0, padbufsize); if (!padend) - return -EFSCORRUPTED; + return "compressed data start not found"; rq->inputsize -= padend - padbuf; rq->pageofs_in += padend - padbuf; - return 0; + return NULL; } -static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, - u8 *out) +static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst) { - struct z_erofs_decompress_req *rq = ctx->rq; bool support_0padding = false, may_inplace = false; unsigned int inputmargin; - u8 *headpage, *src; + u8 *out, *headpage, *src; + const char *reason; int ret, maptype; DBG_BUGON(*rq->in == NULL); - headpage = kmap_atomic(*rq->in); + headpage = kmap_local_page(*rq->in); /* LZ4 decompression inplace is only safe if zero_padding is enabled */ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { support_0padding = true; - ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, + reason = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, min_t(unsigned int, rq->inputsize, - EROFS_BLKSIZ - rq->pageofs_in)); - if (ret) { - kunmap_atomic(headpage); - return ret; + rq->sb->s_blocksize - rq->pageofs_in)); + if (reason) { + kunmap_local(headpage); + return IS_ERR(reason) ? PTR_ERR(reason) : -EFSCORRUPTED; } may_inplace = !((rq->pageofs_in + rq->inputsize) & - (EROFS_BLKSIZ - 1)); + (rq->sb->s_blocksize - 1)); } inputmargin = rq->pageofs_in; - src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin, + src = z_erofs_lz4_handle_overlap(rq, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); + out = dst + rq->pageofs_out; /* legacy format could compress extra data in a pcluster. */ if (rq->partial_decoding || !support_0padding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, @@ -245,147 +236,290 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, rq->inputsize, rq->outputsize); if (ret != rq->outputsize) { - erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", - ret, rq->inputsize, inputmargin, rq->outputsize); - - print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, - 16, 1, src + inputmargin, rq->inputsize, true); - print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, - 16, 1, out, rq->outputsize, true); - if (ret >= 0) memset(out + ret, 0, rq->outputsize - ret); - ret = -EIO; + ret = -EFSCORRUPTED; } else { ret = 0; } if (maptype == 0) { - kunmap_atomic(headpage); + kunmap_local(headpage); } else if (maptype == 1) { - vm_unmap_ram(src, ctx->inpages); + vm_unmap_ram(src, rq->inpages); } else if (maptype == 2) { - erofs_put_pcpubuf(src); - } else { + z_erofs_put_gbuf(src); + } else if (maptype != 3) { DBG_BUGON(1); return -EFAULT; } return ret; } -static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) +static const char *z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) { - struct z_erofs_lz4_decompress_ctx ctx; unsigned int dst_maptype; void *dst; int ret; - ctx.rq = rq; - ctx.oend = rq->pageofs_out + rq->outputsize; - ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT; - ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - /* one optimized fast path only for non bigpcluster cases yet */ - if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { + if (rq->inpages == 1 && rq->outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); + dst = kmap_local_page(*rq->out); dst_maptype = 0; - goto dstmap_out; - } - - /* general decoding path which can be used for all cases */ - ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool); - if (ret < 0) { - return ret; - } else if (ret > 0) { - dst = page_address(*rq->out); - dst_maptype = 1; } else { - dst = erofs_vm_map_ram(rq->out, ctx.outpages); - if (!dst) - return -ENOMEM; - dst_maptype = 2; + /* general decoding path which can be used for all cases */ + ret = z_erofs_lz4_prepare_dstpages(rq, pagepool); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + dst = page_address(*rq->out); + dst_maptype = 1; + } else { + dst = erofs_vm_map_ram(rq->out, rq->outpages); + if (!dst) + return ERR_PTR(-ENOMEM); + dst_maptype = 2; + } } - -dstmap_out: - ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(rq, dst); if (!dst_maptype) - kunmap_atomic(dst); + kunmap_local(dst); else if (dst_maptype == 2) - vm_unmap_ram(dst, ctx.outpages); - return ret; + vm_unmap_ram(dst, rq->outpages); + return ERR_PTR(ret); } -static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, - struct page **pagepool) +static const char *z_erofs_transform_plain(struct z_erofs_decompress_req *rq, + struct page **pagepool) { - const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - const unsigned int outpages = - PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int righthalf = min_t(unsigned int, rq->outputsize, - PAGE_SIZE - rq->pageofs_out); - const unsigned int lefthalf = rq->outputsize - righthalf; - const unsigned int interlaced_offset = - rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - unsigned char *src, *dst; - - if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { - DBG_BUGON(1); - return -EFSCORRUPTED; + const unsigned int nrpages_in = rq->inpages, nrpages_out = rq->outpages; + const unsigned int bs = rq->sb->s_blocksize; + unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; + u8 *kin; + + if (rq->outputsize > rq->inputsize) + return ERR_PTR(-EOPNOTSUPP); + if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { + cur = bs - (rq->pageofs_out & (bs - 1)); + pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; + cur = min(cur, rq->outputsize); + if (cur && rq->out[0]) { + kin = kmap_local_page(rq->in[nrpages_in - 1]); + if (rq->out[0] == rq->in[nrpages_in - 1]) + memmove(kin + rq->pageofs_out, kin + pi, cur); + else + memcpy_to_page(rq->out[0], rq->pageofs_out, + kin + pi, cur); + kunmap_local(kin); + } + rq->outputsize -= cur; } - if (rq->out[0] == *rq->in) { - DBG_BUGON(rq->pageofs_out); - return 0; + for (; rq->outputsize; rq->pageofs_in = 0, cur += insz, ni++) { + insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize); + rq->outputsize -= insz; + if (!rq->in[ni]) + continue; + kin = kmap_local_page(rq->in[ni]); + pi = 0; + do { + no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT; + po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; + DBG_BUGON(no >= nrpages_out); + cnt = min(insz - pi, PAGE_SIZE - po); + if (rq->out[no] == rq->in[ni]) + memmove(kin + po, + kin + rq->pageofs_in + pi, cnt); + else if (rq->out[no]) + memcpy_to_page(rq->out[no], po, + kin + rq->pageofs_in + pi, cnt); + pi += cnt; + } while (pi < insz); + kunmap_local(kin); } + DBG_BUGON(ni > nrpages_in); + return NULL; +} - src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) { - dst = kmap_local_page(rq->out[0]); - memcpy(dst + rq->pageofs_out, src + interlaced_offset, - righthalf); - kunmap_local(dst); +const char *z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, + void **dst, void **src, struct page **pgpl) +{ + struct z_erofs_decompress_req *rq = dctx->rq; + struct page **pgo, *tmppage; + unsigned int j; + + if (!dctx->avail_out) { + if (++dctx->no >= rq->outpages || !rq->outputsize) + return "insufficient space for decompressed data"; + + if (dctx->kout) + kunmap_local(dctx->kout); + dctx->avail_out = min(rq->outputsize, PAGE_SIZE - rq->pageofs_out); + rq->outputsize -= dctx->avail_out; + pgo = &rq->out[dctx->no]; + if (!*pgo && rq->fillgaps) { /* deduped */ + *pgo = erofs_allocpage(pgpl, rq->gfp); + if (!*pgo) { + dctx->kout = NULL; + return ERR_PTR(-ENOMEM); + } + set_page_private(*pgo, Z_EROFS_SHORTLIVED_PAGE); + } + if (*pgo) { + dctx->kout = kmap_local_page(*pgo); + *dst = dctx->kout + rq->pageofs_out; + } else { + *dst = dctx->kout = NULL; + } + rq->pageofs_out = 0; } - if (outpages > inpages) { - DBG_BUGON(!rq->out[outpages - 1]); - if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - dst = kmap_local_page(rq->out[outpages - 1]); - memcpy(dst, interlaced_offset ? src : - (src + righthalf), lefthalf); - kunmap_local(dst); - } else if (!interlaced_offset) { - memmove(src, src + righthalf, lefthalf); + if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) { + if (++dctx->ni >= rq->inpages) + return "invalid compressed data"; + if (dctx->kout) /* unlike kmap(), take care of the orders */ + kunmap_local(dctx->kout); + kunmap_local(dctx->kin); + + dctx->inbuf_sz = min_t(u32, rq->inputsize, PAGE_SIZE); + rq->inputsize -= dctx->inbuf_sz; + dctx->kin = kmap_local_page(rq->in[dctx->ni]); + *src = dctx->kin; + dctx->bounced = false; + if (dctx->kout) { + j = (u8 *)*dst - dctx->kout; + dctx->kout = kmap_local_page(rq->out[dctx->no]); + *dst = dctx->kout + j; } + dctx->inbuf_pos = 0; } - kunmap_local(src); - return 0; + + /* + * Handle overlapping: Use the given bounce buffer if the input data is + * under processing; Or utilize short-lived pages from the on-stack page + * pool, where pages are shared among the same request. Note that only + * a few inplace I/O pages need to be doubled. + */ + if (!dctx->bounced && rq->out[dctx->no] == rq->in[dctx->ni]) { + memcpy(dctx->bounce, *src, dctx->inbuf_sz); + *src = dctx->bounce; + dctx->bounced = true; + } + + for (j = dctx->ni + 1; j < rq->inpages; ++j) { + if (rq->out[dctx->no] != rq->in[j]) + continue; + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) + return ERR_PTR(-ENOMEM); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + return NULL; } -static struct z_erofs_decompressor decompressors[] = { - [Z_EROFS_COMPRESSION_SHIFTED] = { +const struct z_erofs_decompressor *z_erofs_decomp[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = &(const struct z_erofs_decompressor) { .decompress = z_erofs_transform_plain, .name = "shifted" }, - [Z_EROFS_COMPRESSION_INTERLACED] = { + [Z_EROFS_COMPRESSION_INTERLACED] = &(const struct z_erofs_decompressor) { .decompress = z_erofs_transform_plain, .name = "interlaced" }, - [Z_EROFS_COMPRESSION_LZ4] = { + [Z_EROFS_COMPRESSION_LZ4] = &(const struct z_erofs_decompressor) { + .config = z_erofs_load_lz4_config, .decompress = z_erofs_lz4_decompress, + .init = z_erofs_gbuf_init, + .exit = z_erofs_gbuf_exit, .name = "lz4" }, #ifdef CONFIG_EROFS_FS_ZIP_LZMA - [Z_EROFS_COMPRESSION_LZMA] = { - .decompress = z_erofs_lzma_decompress, - .name = "lzma" - }, + [Z_EROFS_COMPRESSION_LZMA] = &z_erofs_lzma_decomp, +#endif +#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE + [Z_EROFS_COMPRESSION_DEFLATE] = &z_erofs_deflate_decomp, +#endif +#ifdef CONFIG_EROFS_FS_ZIP_ZSTD + [Z_EROFS_COMPRESSION_ZSTD] = &z_erofs_zstd_decomp, #endif }; -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) +int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + unsigned int algs, alg; + erofs_off_t offset; + int size, ret = 0; + + if (!erofs_sb_has_compr_cfgs(sbi)) { + sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4; + return z_erofs_load_lz4_config(sb, dsb, NULL, 0); + } + + sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); + if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { + erofs_err(sb, "unidentified algorithms %x, please upgrade kernel", + sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); + return -EOPNOTSUPP; + } + + (void)erofs_init_metabuf(&buf, sb, false); + offset = EROFS_SUPER_OFFSET + sbi->sb_size; + alg = 0; + for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { + const struct z_erofs_decompressor *dec = z_erofs_decomp[alg]; + void *data; + + if (!(algs & 1)) + continue; + + data = erofs_read_metadata(sb, &buf, &offset, &size); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + + if (alg < Z_EROFS_COMPRESSION_MAX && dec && dec->config) { + ret = dec->config(sb, dsb, data, size); + } else { + erofs_err(sb, "algorithm %d isn't enabled on this kernel", + alg); + ret = -EOPNOTSUPP; + } + kfree(data); + if (ret) + break; + } + erofs_put_metabuf(&buf); + return ret; +} + +int __init z_erofs_init_decompressor(void) +{ + int i, err; + + for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) { + err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0; + if (err) { + while (i--) + if (z_erofs_decomp[i]) + z_erofs_decomp[i]->exit(); + return err; + } + } + return 0; +} + +void z_erofs_exit_decompressor(void) { - return decompressors[rq->alg].decompress(rq, pagepool); + int i; + + for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) + if (z_erofs_decomp[i]) + z_erofs_decomp[i]->exit(); } diff --git a/fs/erofs/decompressor_crypto.c b/fs/erofs/decompressor_crypto.c new file mode 100644 index 000000000000..5ef6f71d3b7f --- /dev/null +++ b/fs/erofs/decompressor_crypto.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/scatterlist.h> +#include <crypto/acompress.h> +#include "compress.h" + +static int __z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq, + struct crypto_acomp *tfm) +{ + struct sg_table st_src, st_dst; + struct acomp_req *req; + struct crypto_wait wait; + const char *reason; + u8 *headpage; + int ret; + + headpage = kmap_local_page(*rq->in); + reason = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + rq->sb->s_blocksize - rq->pageofs_in)); + kunmap_local(headpage); + if (reason) + return IS_ERR(reason) ? PTR_ERR(reason) : -EFSCORRUPTED; + + req = acomp_request_alloc(tfm); + if (!req) + return -ENOMEM; + + ret = sg_alloc_table_from_pages_segment(&st_src, rq->in, rq->inpages, + rq->pageofs_in, rq->inputsize, UINT_MAX, GFP_KERNEL); + if (ret < 0) + goto failed_src_alloc; + + ret = sg_alloc_table_from_pages_segment(&st_dst, rq->out, rq->outpages, + rq->pageofs_out, rq->outputsize, UINT_MAX, GFP_KERNEL); + if (ret < 0) + goto failed_dst_alloc; + + acomp_request_set_params(req, st_src.sgl, + st_dst.sgl, rq->inputsize, rq->outputsize); + + crypto_init_wait(&wait); + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + + ret = crypto_wait_req(crypto_acomp_decompress(req), &wait); + if (ret) { + erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", + ret, rq->inputsize, rq->pageofs_in, rq->outputsize); + ret = -EIO; + } + + sg_free_table(&st_dst); +failed_dst_alloc: + sg_free_table(&st_src); +failed_src_alloc: + acomp_request_free(req); + return ret; +} + +struct z_erofs_crypto_engine { + char *crypto_name; + struct crypto_acomp *tfm; +}; + +struct z_erofs_crypto_engine *z_erofs_crypto[Z_EROFS_COMPRESSION_MAX] = { + [Z_EROFS_COMPRESSION_LZ4] = (struct z_erofs_crypto_engine[]) { + {}, + }, + [Z_EROFS_COMPRESSION_LZMA] = (struct z_erofs_crypto_engine[]) { + {}, + }, + [Z_EROFS_COMPRESSION_DEFLATE] = (struct z_erofs_crypto_engine[]) { + { .crypto_name = "qat_deflate", }, + {}, + }, + [Z_EROFS_COMPRESSION_ZSTD] = (struct z_erofs_crypto_engine[]) { + {}, + }, +}; +static DECLARE_RWSEM(z_erofs_crypto_rwsem); + +static struct crypto_acomp *z_erofs_crypto_get_engine(int alg) +{ + struct z_erofs_crypto_engine *e; + + for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) + if (e->tfm) + return e->tfm; + return NULL; +} + +int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) +{ + struct crypto_acomp *tfm; + int i, err; + + down_read(&z_erofs_crypto_rwsem); + tfm = z_erofs_crypto_get_engine(rq->alg); + if (!tfm) { + err = -EOPNOTSUPP; + goto out; + } + + for (i = 0; i < rq->outpages; i++) { + struct page *const page = rq->out[i]; + struct page *victim; + + if (!page) { + victim = __erofs_allocpage(pgpl, rq->gfp, true); + if (!victim) { + err = -ENOMEM; + goto out; + } + set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); + rq->out[i] = victim; + } + } + err = __z_erofs_crypto_decompress(rq, tfm); +out: + up_read(&z_erofs_crypto_rwsem); + return err; +} + +int z_erofs_crypto_enable_engine(const char *name, int len) +{ + struct z_erofs_crypto_engine *e; + struct crypto_acomp *tfm; + int alg; + + down_write(&z_erofs_crypto_rwsem); + for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) { + for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) { + if (!strncmp(name, e->crypto_name, len)) { + if (e->tfm) + break; + tfm = crypto_alloc_acomp(e->crypto_name, 0, 0); + if (IS_ERR(tfm)) { + up_write(&z_erofs_crypto_rwsem); + return -EOPNOTSUPP; + } + e->tfm = tfm; + break; + } + } + } + up_write(&z_erofs_crypto_rwsem); + return 0; +} + +void z_erofs_crypto_disable_all_engines(void) +{ + struct z_erofs_crypto_engine *e; + int alg; + + down_write(&z_erofs_crypto_rwsem); + for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) { + for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) { + if (!e->tfm) + continue; + crypto_free_acomp(e->tfm); + e->tfm = NULL; + } + } + up_write(&z_erofs_crypto_rwsem); +} + +int z_erofs_crypto_show_engines(char *buf, int size, char sep) +{ + struct z_erofs_crypto_engine *e; + int alg, len = 0; + + for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) { + for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) { + if (!e->tfm) + continue; + len += scnprintf(buf + len, size - len, "%s%c", + e->crypto_name, sep); + } + } + return len; +} diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c new file mode 100644 index 000000000000..3fb73000ed27 --- /dev/null +++ b/fs/erofs/decompressor_deflate.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/zlib.h> +#include "compress.h" + +struct z_erofs_deflate { + struct z_erofs_deflate *next; + struct z_stream_s z; + u8 bounce[PAGE_SIZE]; +}; + +static DEFINE_SPINLOCK(z_erofs_deflate_lock); +static unsigned int z_erofs_deflate_nstrms, z_erofs_deflate_avail_strms; +static struct z_erofs_deflate *z_erofs_deflate_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq); + +module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444); + +static void z_erofs_deflate_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + continue; + } + z_erofs_deflate_head = NULL; + spin_unlock(&z_erofs_deflate_lock); + + while (strm) { + struct z_erofs_deflate *n = strm->next; + + vfree(strm->z.workspace); + kfree(strm); + --z_erofs_deflate_avail_strms; + strm = n; + } + } +} + +static int __init z_erofs_deflate_init(void) +{ + /* by default, use # of possible CPUs instead */ + if (!z_erofs_deflate_nstrms) + z_erofs_deflate_nstrms = num_possible_cpus(); + return 0; +} + +static int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size) +{ + struct z_erofs_deflate_cfgs *dfl = data; + static DEFINE_MUTEX(deflate_resize_mutex); + static bool inited; + + if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) { + erofs_err(sb, "invalid deflate cfgs, size=%u", size); + return -EINVAL; + } + + if (dfl->windowbits > MAX_WBITS) { + erofs_err(sb, "unsupported windowbits %u", dfl->windowbits); + return -EOPNOTSUPP; + } + mutex_lock(&deflate_resize_mutex); + if (!inited) { + for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms; + ++z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + strm = kzalloc(sizeof(*strm), GFP_KERNEL); + if (!strm) + goto failed; + /* XXX: in-kernel zlib cannot customize windowbits */ + strm->z.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!strm->z.workspace) { + kfree(strm); + goto failed; + } + + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + } + inited = true; + } + mutex_unlock(&deflate_resize_mutex); + erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!"); + return 0; +failed: + mutex_unlock(&deflate_resize_mutex); + z_erofs_deflate_exit(); + return -ENOMEM; +} + +static const char *__z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) +{ + struct super_block *sb = rq->sb; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; + struct z_erofs_deflate *strm; + const char *reason; + int zerr; + + /* 1. get the exact DEFLATE compressed size */ + dctx.kin = kmap_local_page(*rq->in); + reason = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in, + min(rq->inputsize, sb->s_blocksize - rq->pageofs_in)); + if (reason) { + kunmap_local(dctx.kin); + return reason; + } + + /* 2. get an available DEFLATE context */ +again: + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + wait_event(z_erofs_deflate_wq, READ_ONCE(z_erofs_deflate_head)); + goto again; + } + z_erofs_deflate_head = strm->next; + spin_unlock(&z_erofs_deflate_lock); + + /* 3. multi-call decompress */ + zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS); + if (zerr != Z_OK) { + reason = ERR_PTR(-EINVAL); + goto failed_zinit; + } + + rq->fillgaps = true; /* DEFLATE doesn't support NULL output buffer */ + strm->z.avail_in = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in); + rq->inputsize -= strm->z.avail_in; + strm->z.next_in = dctx.kin + rq->pageofs_in; + strm->z.avail_out = 0; + dctx.bounce = strm->bounce; + + while (1) { + dctx.avail_out = strm->z.avail_out; + dctx.inbuf_sz = strm->z.avail_in; + reason = z_erofs_stream_switch_bufs(&dctx, + (void **)&strm->z.next_out, + (void **)&strm->z.next_in, pgpl); + if (reason) + break; + strm->z.avail_out = dctx.avail_out; + strm->z.avail_in = dctx.inbuf_sz; + + zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH); + if (zerr != Z_OK || !(rq->outputsize + strm->z.avail_out)) { + if (zerr == Z_OK && rq->partial_decoding) + break; + if (zerr == Z_STREAM_END && !rq->outputsize) + break; + reason = (zerr == Z_DATA_ERROR ? + "corrupted compressed data" : + "unexpected end of stream"); + break; + } + } + if (zlib_inflateEnd(&strm->z) != Z_OK && !reason) + reason = ERR_PTR(-EIO); + if (dctx.kout) + kunmap_local(dctx.kout); +failed_zinit: + kunmap_local(dctx.kin); + /* 4. push back DEFLATE stream context to the global list */ + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + wake_up(&z_erofs_deflate_wq); + return reason; +} + +static const char *z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) +{ +#ifdef CONFIG_EROFS_FS_ZIP_ACCEL + int err; + + if (!rq->partial_decoding) { + err = z_erofs_crypto_decompress(rq, pgpl); + if (err != -EOPNOTSUPP) + return ERR_PTR(err); + + } +#endif + return __z_erofs_deflate_decompress(rq, pgpl); +} + +const struct z_erofs_decompressor z_erofs_deflate_decomp = { + .config = z_erofs_load_deflate_config, + .decompress = z_erofs_deflate_decompress, + .init = z_erofs_deflate_init, + .exit = z_erofs_deflate_exit, + .name = "deflate", +}; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 091fd5adf818..b4ea6978faae 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -1,12 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/xz.h> -#include <linux/module.h> #include "compress.h" struct z_erofs_lzma { struct z_erofs_lzma *next; struct xz_dec_microlzma *state; - struct xz_buf buf; u8 bounce[PAGE_SIZE]; }; @@ -19,7 +17,7 @@ static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq); module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444); -void z_erofs_lzma_exit(void) +static void z_erofs_lzma_exit(void) { /* there should be no running fs instance */ while (z_erofs_lzma_avail_strms) { @@ -47,7 +45,7 @@ void z_erofs_lzma_exit(void) } } -int z_erofs_lzma_init(void) +static int __init z_erofs_lzma_init(void) { unsigned int i; @@ -71,11 +69,11 @@ int z_erofs_lzma_init(void) return 0; } -int z_erofs_load_lzma_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lzma_cfgs *lzma, int size) +static int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size) { static DEFINE_MUTEX(lzma_resize_mutex); + struct z_erofs_lzma_cfgs *lzma = data; unsigned int dict_size, i; struct z_erofs_lzma *strm, *head = NULL; int err; @@ -96,8 +94,6 @@ int z_erofs_load_lzma_config(struct super_block *sb, return -EINVAL; } - erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!"); - /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */ mutex_lock(&lzma_resize_mutex); @@ -150,27 +146,23 @@ again: return err; } -int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) +static const char *z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) { - const unsigned int nrpages_out = - PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int nrpages_in = - PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - unsigned int inlen, outlen, pageofs; + struct super_block *sb = rq->sb; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; + struct xz_buf buf = {}; struct z_erofs_lzma *strm; - u8 *kin; - bool bounced = false; - int no, ni, j, err = 0; + enum xz_ret xz_err; + const char *reason; /* 1. get the exact LZMA compressed size */ - kin = kmap(*rq->in); - err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, - min_t(unsigned int, rq->inputsize, - EROFS_BLKSIZ - rq->pageofs_in)); - if (err) { - kunmap(*rq->in); - return err; + dctx.kin = kmap_local_page(*rq->in); + reason = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in, + min(rq->inputsize, sb->s_blocksize - rq->pageofs_in)); + if (reason) { + kunmap_local(dctx.kin); + return reason; } /* 2. get an available lzma context */ @@ -186,106 +178,58 @@ again: spin_unlock(&z_erofs_lzma_lock); /* 3. multi-call decompress */ - inlen = rq->inputsize; - outlen = rq->outputsize; - xz_dec_microlzma_reset(strm->state, inlen, outlen, + xz_dec_microlzma_reset(strm->state, rq->inputsize, rq->outputsize, !rq->partial_decoding); - pageofs = rq->pageofs_out; - strm->buf.in = kin + rq->pageofs_in; - strm->buf.in_pos = 0; - strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in); - inlen -= strm->buf.in_size; - strm->buf.out = NULL; - strm->buf.out_pos = 0; - strm->buf.out_size = 0; - - for (ni = 0, no = -1;;) { - enum xz_ret xz_err; - - if (strm->buf.out_pos == strm->buf.out_size) { - if (strm->buf.out) { - kunmap(rq->out[no]); - strm->buf.out = NULL; - } - - if (++no >= nrpages_out || !outlen) { - erofs_err(rq->sb, "decompressed buf out of bound"); - err = -EFSCORRUPTED; - break; - } - strm->buf.out_pos = 0; - strm->buf.out_size = min_t(u32, outlen, - PAGE_SIZE - pageofs); - outlen -= strm->buf.out_size; - if (!rq->out[no] && rq->fillgaps) /* deduped */ - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); - if (rq->out[no]) - strm->buf.out = kmap(rq->out[no]) + pageofs; - pageofs = 0; - } else if (strm->buf.in_pos == strm->buf.in_size) { - kunmap(rq->in[ni]); - - if (++ni >= nrpages_in || !inlen) { - erofs_err(rq->sb, "compressed buf out of bound"); - err = -EFSCORRUPTED; - break; - } - strm->buf.in_pos = 0; - strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE); - inlen -= strm->buf.in_size; - kin = kmap(rq->in[ni]); - strm->buf.in = kin; - bounced = false; - } + buf.in_size = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in); + rq->inputsize -= buf.in_size; + buf.in = dctx.kin + rq->pageofs_in; + dctx.bounce = strm->bounce; + do { + dctx.avail_out = buf.out_size - buf.out_pos; + dctx.inbuf_sz = buf.in_size; + dctx.inbuf_pos = buf.in_pos; + reason = z_erofs_stream_switch_bufs(&dctx, (void **)&buf.out, + (void **)&buf.in, pgpl); + if (reason) + break; - /* - * Handle overlapping: Use bounced buffer if the compressed - * data is under processing; Otherwise, Use short-lived pages - * from the on-stack pagepool where pages share with the same - * request. - */ - if (!bounced && rq->out[no] == rq->in[ni]) { - memcpy(strm->bounce, strm->buf.in, strm->buf.in_size); - strm->buf.in = strm->bounce; - bounced = true; + if (buf.out_size == buf.out_pos) { + buf.out_size = dctx.avail_out; + buf.out_pos = 0; } - for (j = ni + 1; j < nrpages_in; ++j) { - struct page *tmppage; - - if (rq->out[no] != rq->in[j]) - continue; + buf.in_size = dctx.inbuf_sz; + buf.in_pos = dctx.inbuf_pos; - DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), - rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); - set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); - copy_highpage(tmppage, rq->in[j]); - rq->in[j] = tmppage; - } - xz_err = xz_dec_microlzma_run(strm->state, &strm->buf); - DBG_BUGON(strm->buf.out_pos > strm->buf.out_size); - DBG_BUGON(strm->buf.in_pos > strm->buf.in_size); + xz_err = xz_dec_microlzma_run(strm->state, &buf); + DBG_BUGON(buf.out_pos > buf.out_size); + DBG_BUGON(buf.in_pos > buf.in_size); if (xz_err != XZ_OK) { - if (xz_err == XZ_STREAM_END && !outlen) + if (xz_err == XZ_STREAM_END && !rq->outputsize) break; - erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]", - xz_err, rq->inputsize, rq->outputsize); - err = -EFSCORRUPTED; + reason = (xz_err == XZ_DATA_ERROR ? + "corrupted compressed data" : + "unexpected end of stream"); break; } - } - if (no < nrpages_out && strm->buf.out) - kunmap(rq->in[no]); - if (ni < nrpages_in) - kunmap(rq->in[ni]); + } while (1); + + if (dctx.kout) + kunmap_local(dctx.kout); + kunmap_local(dctx.kin); /* 4. push back LZMA stream context to the global list */ spin_lock(&z_erofs_lzma_lock); strm->next = z_erofs_lzma_head; z_erofs_lzma_head = strm; spin_unlock(&z_erofs_lzma_lock); wake_up(&z_erofs_lzma_wq); - return err; + return reason; } + +const struct z_erofs_decompressor z_erofs_lzma_decomp = { + .config = z_erofs_load_lzma_config, + .decompress = z_erofs_lzma_decompress, + .init = z_erofs_lzma_init, + .exit = z_erofs_lzma_exit, + .name = "lzma" +}; diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c new file mode 100644 index 000000000000..beae49165c69 --- /dev/null +++ b/fs/erofs/decompressor_zstd.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/zstd.h> +#include "compress.h" + +struct z_erofs_zstd { + struct z_erofs_zstd *next; + u8 bounce[PAGE_SIZE]; + void *wksp; + unsigned int wkspsz; +}; + +static DEFINE_SPINLOCK(z_erofs_zstd_lock); +static unsigned int z_erofs_zstd_max_dictsize; +static unsigned int z_erofs_zstd_nstrms, z_erofs_zstd_avail_strms; +static struct z_erofs_zstd *z_erofs_zstd_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_zstd_wq); + +module_param_named(zstd_streams, z_erofs_zstd_nstrms, uint, 0444); + +static struct z_erofs_zstd *z_erofs_isolate_strms(bool all) +{ + struct z_erofs_zstd *strm; + +again: + spin_lock(&z_erofs_zstd_lock); + strm = z_erofs_zstd_head; + if (!strm) { + spin_unlock(&z_erofs_zstd_lock); + wait_event(z_erofs_zstd_wq, READ_ONCE(z_erofs_zstd_head)); + goto again; + } + z_erofs_zstd_head = all ? NULL : strm->next; + spin_unlock(&z_erofs_zstd_lock); + return strm; +} + +static void z_erofs_zstd_exit(void) +{ + while (z_erofs_zstd_avail_strms) { + struct z_erofs_zstd *strm, *n; + + for (strm = z_erofs_isolate_strms(true); strm; strm = n) { + n = strm->next; + + kvfree(strm->wksp); + kfree(strm); + --z_erofs_zstd_avail_strms; + } + } +} + +static int __init z_erofs_zstd_init(void) +{ + /* by default, use # of possible CPUs instead */ + if (!z_erofs_zstd_nstrms) + z_erofs_zstd_nstrms = num_possible_cpus(); + + for (; z_erofs_zstd_avail_strms < z_erofs_zstd_nstrms; + ++z_erofs_zstd_avail_strms) { + struct z_erofs_zstd *strm; + + strm = kzalloc(sizeof(*strm), GFP_KERNEL); + if (!strm) { + z_erofs_zstd_exit(); + return -ENOMEM; + } + spin_lock(&z_erofs_zstd_lock); + strm->next = z_erofs_zstd_head; + z_erofs_zstd_head = strm; + spin_unlock(&z_erofs_zstd_lock); + } + return 0; +} + +static int z_erofs_load_zstd_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size) +{ + static DEFINE_MUTEX(zstd_resize_mutex); + struct z_erofs_zstd_cfgs *zstd = data; + unsigned int dict_size, wkspsz; + struct z_erofs_zstd *strm, *head = NULL; + void *wksp; + + if (!zstd || size < sizeof(struct z_erofs_zstd_cfgs) || zstd->format) { + erofs_err(sb, "unsupported zstd format, size=%u", size); + return -EINVAL; + } + + if (zstd->windowlog > ilog2(Z_EROFS_ZSTD_MAX_DICT_SIZE) - 10) { + erofs_err(sb, "unsupported zstd window log %u", zstd->windowlog); + return -EINVAL; + } + dict_size = 1U << (zstd->windowlog + 10); + + /* in case 2 z_erofs_load_zstd_config() race to avoid deadlock */ + mutex_lock(&zstd_resize_mutex); + if (z_erofs_zstd_max_dictsize >= dict_size) { + mutex_unlock(&zstd_resize_mutex); + return 0; + } + + /* 1. collect/isolate all streams for the following check */ + while (z_erofs_zstd_avail_strms) { + struct z_erofs_zstd *n; + + for (strm = z_erofs_isolate_strms(true); strm; strm = n) { + n = strm->next; + strm->next = head; + head = strm; + --z_erofs_zstd_avail_strms; + } + } + + /* 2. walk each isolated stream and grow max dict_size if needed */ + wkspsz = zstd_dstream_workspace_bound(dict_size); + for (strm = head; strm; strm = strm->next) { + wksp = kvmalloc(wkspsz, GFP_KERNEL); + if (!wksp) + break; + kvfree(strm->wksp); + strm->wksp = wksp; + strm->wkspsz = wkspsz; + } + + /* 3. push back all to the global list and update max dict_size */ + spin_lock(&z_erofs_zstd_lock); + DBG_BUGON(z_erofs_zstd_head); + z_erofs_zstd_head = head; + spin_unlock(&z_erofs_zstd_lock); + z_erofs_zstd_avail_strms = z_erofs_zstd_nstrms; + wake_up_all(&z_erofs_zstd_wq); + if (!strm) + z_erofs_zstd_max_dictsize = dict_size; + mutex_unlock(&zstd_resize_mutex); + return strm ? -ENOMEM : 0; +} + +static const char *z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) +{ + struct super_block *sb = rq->sb; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; + zstd_in_buffer in_buf = { NULL, 0, 0 }; + zstd_out_buffer out_buf = { NULL, 0, 0 }; + struct z_erofs_zstd *strm; + zstd_dstream *stream; + const char *reason; + int zerr; + + /* 1. get the exact compressed size */ + dctx.kin = kmap_local_page(*rq->in); + reason = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in, + min(rq->inputsize, sb->s_blocksize - rq->pageofs_in)); + if (reason) { + kunmap_local(dctx.kin); + return reason; + } + + /* 2. get an available ZSTD context */ + strm = z_erofs_isolate_strms(false); + + /* 3. multi-call decompress */ + stream = zstd_init_dstream(z_erofs_zstd_max_dictsize, strm->wksp, strm->wkspsz); + if (!stream) { + reason = ERR_PTR(-ENOMEM); + goto failed_zinit; + } + + rq->fillgaps = true; /* ZSTD doesn't support NULL output buffer */ + in_buf.size = min_t(u32, rq->inputsize, PAGE_SIZE - rq->pageofs_in); + rq->inputsize -= in_buf.size; + in_buf.src = dctx.kin + rq->pageofs_in; + dctx.bounce = strm->bounce; + + do { + dctx.inbuf_sz = in_buf.size; + dctx.inbuf_pos = in_buf.pos; + reason = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst, + (void **)&in_buf.src, pgpl); + if (reason) + break; + + if (out_buf.size == out_buf.pos) { + out_buf.size = dctx.avail_out; + out_buf.pos = 0; + } + in_buf.size = dctx.inbuf_sz; + in_buf.pos = dctx.inbuf_pos; + + zerr = zstd_decompress_stream(stream, &out_buf, &in_buf); + dctx.avail_out = out_buf.size - out_buf.pos; + if (zstd_is_error(zerr) || + ((rq->outputsize + dctx.avail_out) && (!zerr || (zerr > 0 && + !(rq->inputsize + in_buf.size - in_buf.pos))))) { + reason = zstd_is_error(zerr) ? zstd_get_error_name(zerr) : + "unexpected end of stream"; + break; + } + } while (rq->outputsize + dctx.avail_out); + + if (dctx.kout) + kunmap_local(dctx.kout); +failed_zinit: + kunmap_local(dctx.kin); + /* 4. push back ZSTD stream context to the global list */ + spin_lock(&z_erofs_zstd_lock); + strm->next = z_erofs_zstd_head; + z_erofs_zstd_head = strm; + spin_unlock(&z_erofs_zstd_lock); + wake_up(&z_erofs_zstd_wq); + return reason; +} + +const struct z_erofs_decompressor z_erofs_zstd_decomp = { + .config = z_erofs_load_zstd_config, + .decompress = z_erofs_zstd_decompress, + .init = z_erofs_zstd_init, + .exit = z_erofs_zstd_exit, + .name = "zstd", +}; diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index ecf28f66b97d..32b4f5aa60c9 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -6,36 +6,17 @@ */ #include "internal.h" -static void debug_one_dentry(unsigned char d_type, const char *de_name, - unsigned int de_namelen) -{ -#ifdef CONFIG_EROFS_FS_DEBUG - /* since the on-disk name could not have the trailing '\0' */ - unsigned char dbg_namebuf[EROFS_NAME_LEN + 1]; - - memcpy(dbg_namebuf, de_name, de_namelen); - dbg_namebuf[de_namelen] = '\0'; - - erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf, - de_namelen, d_type); -#endif -} - static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, void *dentry_blk, struct erofs_dirent *de, - unsigned int nameoff, unsigned int maxsize) + unsigned int nameoff0, unsigned int maxsize) { - const struct erofs_dirent *end = dentry_blk + nameoff; + const struct erofs_dirent *end = dentry_blk + nameoff0; while (de < end) { - const char *de_name; + unsigned char d_type = fs_ftype_to_dtype(de->file_type); + unsigned int nameoff = le16_to_cpu(de->nameoff); + const char *de_name = (char *)dentry_blk + nameoff; unsigned int de_namelen; - unsigned char d_type; - - d_type = fs_ftype_to_dtype(de->file_type); - - nameoff = le16_to_cpu(de->nameoff); - de_name = (char *)dentry_blk + nameoff; /* the last dirent in the block? */ if (de + 1 >= end) @@ -52,10 +33,9 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, return -EFSCORRUPTED; } - debug_one_dentry(d_type, de_name, de_namelen); if (!dir_emit(ctx, de_name, de_namelen, - le64_to_cpu(de->nid), d_type)) - /* stopped by some reason */ + erofs_nid_to_ino64(EROFS_SB(dir->i_sb), + le64_to_cpu(de->nid)), d_type)) return 1; ++de; ctx->pos += sizeof(struct erofs_dirent); @@ -67,58 +47,75 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) { struct inode *dir = file_inode(f); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - const size_t dirsize = i_size_read(dir); - unsigned int i = ctx->pos / EROFS_BLKSIZ; - unsigned int ofs = ctx->pos % EROFS_BLKSIZ; + struct super_block *sb = dir->i_sb; + struct file_ra_state *ra = &f->f_ra; + unsigned long bsz = sb->s_blocksize; + unsigned int ofs = erofs_blkoff(sb, ctx->pos); + pgoff_t ra_pages = DIV_ROUND_UP_POW2( + EROFS_I_SB(dir)->dir_ra_bytes, PAGE_SIZE); + pgoff_t nr_pages = DIV_ROUND_UP_POW2(dir->i_size, PAGE_SIZE); int err = 0; bool initial = true; - while (ctx->pos < dirsize) { + buf.mapping = dir->i_mapping; + while (ctx->pos < dir->i_size) { + erofs_off_t dbstart = ctx->pos - ofs; struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, dir, i, EROFS_KMAP); + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + /* readahead blocks to enhance performance for large directories */ + if (ra_pages) { + pgoff_t idx = DIV_ROUND_UP_POW2(ctx->pos, PAGE_SIZE); + pgoff_t pages = min(nr_pages - idx, ra_pages); + + if (pages > 1 && !ra_has_index(ra, idx)) + page_cache_sync_readahead(dir->i_mapping, ra, + f, idx, pages); + } + + de = erofs_bread(&buf, dbstart, true); if (IS_ERR(de)) { - erofs_err(dir->i_sb, - "fail to readdir of logical block %u of nid %llu", - i, EROFS_I(dir)->nid); + erofs_err(sb, "failed to readdir of logical block %llu of nid %llu", + erofs_blknr(sb, dbstart), EROFS_I(dir)->nid); err = PTR_ERR(de); break; } nameoff = le16_to_cpu(de->nameoff); - if (nameoff < sizeof(struct erofs_dirent) || - nameoff >= EROFS_BLKSIZ) { - erofs_err(dir->i_sb, - "invalid de[0].nameoff %u @ nid %llu", + if (nameoff < sizeof(struct erofs_dirent) || nameoff >= bsz) { + erofs_err(sb, "invalid de[0].nameoff %u @ nid %llu", nameoff, EROFS_I(dir)->nid); err = -EFSCORRUPTED; break; } - maxsize = min_t(unsigned int, - dirsize - ctx->pos + ofs, EROFS_BLKSIZ); - + maxsize = min_t(unsigned int, dir->i_size - dbstart, bsz); /* search dirents at the arbitrary position */ if (initial) { initial = false; - ofs = roundup(ofs, sizeof(struct erofs_dirent)); - ctx->pos = blknr_to_addr(i) + ofs; - if (ofs >= nameoff) - goto skip_this; + ctx->pos = dbstart + ofs; } err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); if (err) break; -skip_this: - ctx->pos = blknr_to_addr(i) + maxsize; - ++i; + ctx->pos = dbstart + maxsize; ofs = 0; + cond_resched(); } erofs_put_metabuf(&buf); + if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) { + if (!dir_emit_dot(f, ctx)) + return 0; + ++ctx->pos; + } return err < 0 ? err : 0; } @@ -126,4 +123,8 @@ const struct file_operations erofs_dir_fops = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = erofs_readdir, + .unlocked_ioctl = erofs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = erofs_compat_ioctl, +#endif }; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index dbcd24371002..e24268acdd62 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */ +/* SPDX-License-Identifier: MIT */ /* * EROFS (Enhanced ROM File System) on-disk format definition * @@ -9,10 +9,15 @@ #ifndef __EROFS_FS_H #define __EROFS_FS_H +/* to allow for x86 boot sectors and other oddities. */ #define EROFS_SUPER_OFFSET 1024 -#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 -#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 +#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 +#define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008 +#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010 + /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should @@ -27,41 +32,39 @@ #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 #define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 +#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 +#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080 +#define EROFS_FEATURE_INCOMPAT_METABOX 0x00000100 #define EROFS_ALL_FEATURE_INCOMPAT \ - (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ - EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ - EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ - EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ - EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ - EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ - EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ - EROFS_FEATURE_INCOMPAT_DEDUPE) + ((EROFS_FEATURE_INCOMPAT_METABOX << 1) - 1) #define EROFS_SB_EXTSLOT_SIZE 16 struct erofs_deviceslot { u8 tag[64]; /* digest(sha256), etc. */ - __le32 blocks; /* total fs blocks of this device */ - __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ - u8 reserved[56]; + __le32 blocks_lo; /* total blocks count of this device */ + __le32 uniaddr_lo; /* unified starting block of this device */ + __le32 blocks_hi; /* total blocks count MSB */ + __le16 uniaddr_hi; /* unified starting block MSB */ + u8 reserved[50]; }; #define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) -/* erofs on-disk super block (currently 128 bytes) */ +/* erofs on-disk super block (currently 144 bytes at maximum) */ struct erofs_super_block { __le32 magic; /* file system magic number */ - __le32 checksum; /* crc32c(super_block) */ + __le32 checksum; /* crc32c to avoid unexpected on-disk overlap */ __le32 feature_compat; - __u8 blkszbits; /* support block_size == PAGE_SIZE only */ + __u8 blkszbits; /* filesystem block size in bit shift */ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ - - __le16 root_nid; /* nid of root directory */ + union { + __le16 rootnid_2b; /* nid of root directory */ + __le16 blocks_hi; /* (48BIT on) blocks count MSB */ + } __packed rb; __le64 inos; /* total valid ino # (== f_files - f_favail) */ - - __le64 build_time; /* compact inode time derivation */ - __le32 build_time_nsec; /* compact inode time derivation in ns scale */ - __le32 blocks; /* used for statfs */ + __le64 epoch; /* base seconds used for compact inodes */ + __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */ + __le32 blocks_lo; /* blocks count LSB */ __le32 meta_blkaddr; /* start block address of metadata area */ __le32 xattr_blkaddr; /* start block address of shared xattr area */ __u8 uuid[16]; /* 128-bit uuid for volume */ @@ -75,121 +78,110 @@ struct erofs_super_block { } __packed u1; __le16 extra_devices; /* # of devices besides the primary device */ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ - __u8 reserved[6]; + __u8 dirblkbits; /* directory block size in bit shift */ + __u8 xattr_prefix_count; /* # of long xattr name prefixes */ + __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ - __u8 reserved2[24]; + __u8 xattr_filter_reserved; /* reserved for xattr name filter */ + __u8 reserved[3]; + __le32 build_time; /* seconds added to epoch for mkfs time */ + __le64 rootnid_8b; /* (48BIT on) nid of root directory */ + __le64 reserved2; + __le64 metabox_nid; /* (METABOX on) nid of the metabox inode */ + __le64 reserved3; /* [align to extslot 1] */ }; /* - * erofs inode datalayout (i_format in on-disk inode): + * EROFS inode datalayout (i_format in on-disk inode): * 0 - uncompressed flat inode without tail-packing inline data: - * inode, [xattrs], ... | ... | no-holed data * 1 - compressed inode with non-compact indexes: - * inode, [xattrs], [map_header], extents ... | ... * 2 - uncompressed flat inode with tail-packing inline data: - * inode, [xattrs], tailpacking data, ... | ... | no-holed data * 3 - compressed inode with compact indexes: - * inode, [xattrs], map_header, extents ... | ... * 4 - chunk-based inode with (optional) multi-device support: - * inode, [xattrs], chunk indexes ... | ... * 5~7 - reserved */ enum { EROFS_INODE_FLAT_PLAIN = 0, - EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1, + EROFS_INODE_COMPRESSED_FULL = 1, EROFS_INODE_FLAT_INLINE = 2, - EROFS_INODE_FLAT_COMPRESSION = 3, + EROFS_INODE_COMPRESSED_COMPACT = 3, EROFS_INODE_CHUNK_BASED = 4, EROFS_INODE_DATALAYOUT_MAX }; static inline bool erofs_inode_is_data_compressed(unsigned int datamode) { - return datamode == EROFS_INODE_FLAT_COMPRESSION || - datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY; + return datamode == EROFS_INODE_COMPRESSED_COMPACT || + datamode == EROFS_INODE_COMPRESSED_FULL; } /* bit definitions of inode i_format */ -#define EROFS_I_VERSION_BITS 1 -#define EROFS_I_DATALAYOUT_BITS 3 - -#define EROFS_I_VERSION_BIT 0 -#define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_VERSION_MASK 0x01 +#define EROFS_I_DATALAYOUT_MASK 0x07 -#define EROFS_I_ALL \ - ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1) +#define EROFS_I_VERSION_BIT 0 +#define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */ +#define EROFS_I_DOT_OMITTED_BIT 4 /* (directories) omit the `.` dirent */ +#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1) /* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ #define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F -/* with chunk indexes or just a 4-byte blkaddr array */ +/* with chunk indexes or just a 4-byte block array */ #define EROFS_CHUNK_FORMAT_INDEXES 0x0020 +#define EROFS_CHUNK_FORMAT_48BIT 0x0040 -#define EROFS_CHUNK_FORMAT_ALL \ - (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) +#define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1) + +/* 32-byte on-disk inode */ +#define EROFS_INODE_LAYOUT_COMPACT 0 +/* 64-byte on-disk inode */ +#define EROFS_INODE_LAYOUT_EXTENDED 1 struct erofs_inode_chunk_info { __le16 format; /* chunk blkbits, etc. */ __le16 reserved; }; +union erofs_inode_i_u { + __le32 blocks_lo; /* total blocks count (if compressed inodes) */ + __le32 startblk_lo; /* starting block number (if flat inodes) */ + __le32 rdev; /* device ID (if special inodes) */ + struct erofs_inode_chunk_info c; +}; + +union erofs_inode_i_nb { + __le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */ + __le16 blocks_hi; /* total blocks count MSB */ + __le16 startblk_hi; /* starting block number MSB */ +} __packed; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_nlink; + union erofs_inode_i_nb i_nb; __le32 i_size; - __le32 i_reserved; - union { - /* total compressed blocks for compressed inodes */ - __le32 compressed_blocks; - /* block address for uncompressed flat inodes */ - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - - /* for chunk-based files, it contains the summary info */ - struct erofs_inode_chunk_info c; - } i_u; - __le32 i_ino; /* only used for 32-bit stat compatibility */ + __le32 i_mtime; + union erofs_inode_i_u i_u; + + __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; __le16 i_gid; - __le32 i_reserved2; + __le32 i_reserved; }; -/* 32-byte on-disk inode */ -#define EROFS_INODE_LAYOUT_COMPACT 0 -/* 64-byte on-disk inode */ -#define EROFS_INODE_LAYOUT_EXTENDED 1 - /* 64-byte complete form of an ondisk inode */ struct erofs_inode_extended { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_reserved; + union erofs_inode_i_nb i_nb; __le64 i_size; - union { - /* total compressed blocks for compressed inodes */ - __le32 compressed_blocks; - /* block address for uncompressed flat inodes */ - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - - /* for chunk-based files, it contains the summary info */ - struct erofs_inode_chunk_info c; - } i_u; - - /* only used for 32-bit stat compatibility */ - __le32 i_ino; + union erofs_inode_i_u i_u; + __le32 i_ino; /* only used for 32-bit stat compatibility */ __le32 i_uid; __le32 i_gid; __le64 i_mtime; @@ -198,10 +190,6 @@ struct erofs_inode_extended { __u8 i_reserved2[16]; }; -#define EROFS_MAX_SHARED_XATTRS (128) -/* h_shared_count between 129 ... 255 are special # */ -#define EROFS_SHARED_XATTR_EXTENT (255) - /* * inline xattrs (n == i_xattr_icount): * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes @@ -214,7 +202,7 @@ struct erofs_inode_extended { * for read-only fs, no need to introduce h_refcount */ struct erofs_xattr_ibody_header { - __le32 h_reserved; + __le32 h_name_filter; /* bit value 1 indicates not-present */ __u8 h_shared_count; __u8 h_reserved2[7]; __le32 h_shared_xattrs[]; /* shared xattr id array */ @@ -228,6 +216,17 @@ struct erofs_xattr_ibody_header { #define EROFS_XATTR_INDEX_LUSTRE 5 #define EROFS_XATTR_INDEX_SECURITY 6 +/* + * bit 7 of e_name_index is set when it refers to a long xattr name prefix, + * while the remained lower bits represent the index of the prefix. + */ +#define EROFS_XATTR_LONG_PREFIX 0x80 +#define EROFS_XATTR_LONG_PREFIX_MASK 0x7f + +#define EROFS_XATTR_FILTER_BITS 32 +#define EROFS_XATTR_FILTER_DEFAULT UINT32_MAX +#define EROFS_XATTR_FILTER_SEED 0x25BBE08F + /* xattr entry (for both inline & shared xattrs) */ struct erofs_xattr_entry { __u8 e_name_len; /* length of name */ @@ -237,11 +236,18 @@ struct erofs_xattr_entry { char e_name[]; /* attribute name */ }; +/* long xattr name prefix */ +struct erofs_xattr_long_prefix { + __u8 base_index; /* short xattr name prefix index */ + char infix[]; /* infix apart from short prefix */ +}; + static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) { if (!i_xattr_icount) return 0; + /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ return sizeof(struct erofs_xattr_ibody_header) + sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1); } @@ -260,20 +266,44 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) /* 4-byte block address array */ #define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) -/* 8-byte inode chunk indexes */ +/* 8-byte inode chunk index */ struct erofs_inode_chunk_index { - __le16 advise; /* always 0, don't care for now */ + __le16 startblk_hi; /* starting block number MSB */ __le16 device_id; /* back-end storage id (with bits masked) */ - __le32 blkaddr; /* start block address of this inode chunk */ + __le32 startblk_lo; /* starting block number of this chunk */ }; -/* maximum supported size of a physical compression cluster */ +#define EROFS_DIRENT_NID_METABOX_BIT 63 +#define EROFS_DIRENT_NID_MASK (BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT) - 1) + +/* dirent sorts in alphabet order, thus we can do binary search */ +struct erofs_dirent { + __le64 nid; /* node number */ + __le16 nameoff; /* start offset of file name */ + __u8 file_type; /* file type */ + __u8 reserved; /* reserved */ +} __packed; + +/* + * EROFS file types should match generic FT_* types and + * it seems no need to add BUILD_BUG_ONs since potential + * unmatchness will break other fses as well... + */ + +#define EROFS_NAME_LEN 255 + +/* maximum supported encoded size of a physical compressed cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) +/* maximum supported decoded size of a physical compressed cluster */ +#define Z_EROFS_PCLUSTER_MAX_DSIZE (12 * 1024 * 1024) + /* available compression algorithm types (for h_algorithmtype) */ enum { Z_EROFS_COMPRESSION_LZ4 = 0, Z_EROFS_COMPRESSION_LZMA = 1, + Z_EROFS_COMPRESSION_DEFLATE = 2, + Z_EROFS_COMPRESSION_ZSTD = 3, Z_EROFS_COMPRESSION_MAX }; #define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) @@ -294,22 +324,36 @@ struct z_erofs_lzma_cfgs { #define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE) +/* 6 bytes (+ length field = 8 bytes) */ +struct z_erofs_deflate_cfgs { + u8 windowbits; /* 8..15 for DEFLATE */ + u8 reserved[5]; +} __packed; + +/* 6 bytes (+ length field = 8 bytes) */ +struct z_erofs_zstd_cfgs { + u8 format; + u8 windowlog; /* windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN(10) */ + u8 reserved[4]; +} __packed; + +#define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE + /* - * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) - * e.g. for 4k logical cluster size, 4B if compacted 2B is off; - * (4B) + 2B + (4B) if compacted 2B is on. - * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) - * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) - * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) - * bit 4 : interlaced plain pcluster (0 - off; 1 - on) - * bit 5 : fragment pcluster (0 - off; 1 - on) + * Enable COMPACTED_2B for EROFS_INODE_COMPRESSED_COMPACT inodes: + * 4B (disabled) vs 4B+2B+4B (enabled) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 +/* Enable extent metadata for EROFS_INODE_COMPRESSED_FULL inodes */ +#define Z_EROFS_ADVISE_EXTENTS 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 #define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010 #define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020 +/* Indicate the record size for each extent if extent metadata is used */ +#define Z_EROFS_ADVISE_EXTRECSZ_BIT 1 +#define Z_EROFS_ADVISE_EXTRECSZ_MASK 0x3 #define Z_EROFS_FRAGMENT_INODE_BIT 7 struct z_erofs_map_header { @@ -321,104 +365,76 @@ struct z_erofs_map_header { /* indicates the encoded size of tailpacking data */ __le16 h_idata_size; }; + __le32 h_extents_lo; /* extent count LSB */ }; __le16 h_advise; - /* - * bit 0-3 : algorithm type of head 1 (logical cluster type 01); - * bit 4-7 : algorithm type of head 2 (logical cluster type 11). - */ - __u8 h_algorithmtype; - /* - * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-6 : reserved; - * bit 7 : move the whole file into packed inode or not. - */ - __u8 h_clusterbits; + union { + struct { + /* algorithm type (bit 0-3: HEAD1; bit 4-7: HEAD2) */ + __u8 h_algorithmtype; + /* + * bit 0-3 : logical cluster bits - blkszbits + * bit 4-6 : reserved + * bit 7 : pack the whole file into packed inode + */ + __u8 h_clusterbits; + } __packed; + __le16 h_extents_hi; /* extent count MSB */ + } __packed; }; -#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 - -/* - * Fixed-sized output compression on-disk logical cluster type: - * 0 - literal (uncompressed) lcluster - * 1,3 - compressed lcluster (for HEAD lclusters) - * 2 - compressed lcluster (for NONHEAD lclusters) - * - * In detail, - * 0 - literal (uncompressed) lcluster, - * di_advise = 0 - * di_clusterofs = the literal data offset of the lcluster - * di_blkaddr = the blkaddr of the literal pcluster - * - * 1,3 - compressed lcluster (for HEAD lclusters) - * di_advise = 1 or 3 - * di_clusterofs = the decompressed data offset of the lcluster - * di_blkaddr = the blkaddr of the compressed pcluster - * - * 2 - compressed lcluster (for NONHEAD lclusters) - * di_advise = 2 - * di_clusterofs = - * the decompressed data offset in its own HEAD lcluster - * di_u.delta[0] = distance to this HEAD lcluster - * di_u.delta[1] = distance to the next HEAD lcluster - */ enum { - Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1, - Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3, - Z_EROFS_VLE_CLUSTER_TYPE_MAX + Z_EROFS_LCLUSTER_TYPE_PLAIN = 0, + Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1, + Z_EROFS_LCLUSTER_TYPE_NONHEAD = 2, + Z_EROFS_LCLUSTER_TYPE_HEAD2 = 3, + Z_EROFS_LCLUSTER_TYPE_MAX }; -#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 -#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 +#define Z_EROFS_LI_LCLUSTER_TYPE_MASK (Z_EROFS_LCLUSTER_TYPE_MAX - 1) /* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ -#define Z_EROFS_VLE_DI_PARTIAL_REF (1 << 15) +#define Z_EROFS_LI_PARTIAL_REF (1 << 15) -/* - * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the - * compressed block count of a compressed extent (in logical clusters, aka. - * block count of a pcluster). - */ -#define Z_EROFS_VLE_DI_D0_CBLKCNT (1 << 11) +/* Set on 1st non-head lcluster to store compressed block counti (in blocks) */ +#define Z_EROFS_LI_D0_CBLKCNT (1 << 11) -struct z_erofs_vle_decompressed_index { +struct z_erofs_lcluster_index { __le16 di_advise; /* where to decompress in the head lcluster */ __le16 di_clusterofs; union { - /* for the HEAD lclusters */ - __le32 blkaddr; + __le32 blkaddr; /* for the HEAD lclusters */ /* - * for the NONHEAD lclusters * [0] - distance to its HEAD lcluster * [1] - distance to the next HEAD lcluster */ - __le16 delta[2]; + __le16 delta[2]; /* for the NONHEAD lclusters */ } di_u; }; -#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \ - (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \ - sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING) - -/* dirent sorts in alphabet order, thus we can do binary search */ -struct erofs_dirent { - __le64 nid; /* node number */ - __le16 nameoff; /* start offset of file name */ - __u8 file_type; /* file type */ - __u8 reserved; /* reserved */ -} __packed; - -/* - * EROFS file types should match generic FT_* types and - * it seems no need to add BUILD_BUG_ONs since potential - * unmatchness will break other fses as well... - */ +#define Z_EROFS_MAP_HEADER_END(end) \ + (ALIGN(end, 8) + sizeof(struct z_erofs_map_header)) +#define Z_EROFS_FULL_INDEX_START(end) (Z_EROFS_MAP_HEADER_END(end) + 8) + +#define Z_EROFS_EXTENT_PLEN_PARTIAL BIT(27) +#define Z_EROFS_EXTENT_PLEN_FMT_BIT 28 +#define Z_EROFS_EXTENT_PLEN_MASK ((Z_EROFS_PCLUSTER_MAX_SIZE << 1) - 1) +struct z_erofs_extent { + __le32 plen; /* encoded length */ + __le32 pstart_lo; /* physical offset */ + __le32 pstart_hi; /* physical offset MSB */ + __le32 lstart_lo; /* logical offset */ + __le32 lstart_hi; /* logical offset MSB (>= 4GiB inodes) */ + __u8 reserved[12]; /* for future use */ +}; -#define EROFS_NAME_LEN 255 +static inline int z_erofs_extent_recsize(unsigned int advise) +{ + return 4 << ((advise >> Z_EROFS_ADVISE_EXTRECSZ_BIT) & + Z_EROFS_ADVISE_EXTRECSZ_MASK); +} /* check the EROFS on-disk layout strictly at compile time */ static inline void erofs_check_ondisk_layout_definitions(void) @@ -427,7 +443,7 @@ static inline void erofs_check_ondisk_layout_definitions(void) .h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT }; - BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 144); BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32); BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); @@ -435,15 +451,13 @@ static inline void erofs_check_ondisk_layout_definitions(void) BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4); BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); - BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); + BUILD_BUG_ON(sizeof(struct z_erofs_lcluster_index) != 8); BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); /* keep in sync between 2 index structures for better extendibility */ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != - sizeof(struct z_erofs_vle_decompressed_index)); + sizeof(struct z_erofs_lcluster_index)); BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); - BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < - Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); /* exclude old compiler versions like gcc 7.5.0 */ BUILD_BUG_ON(__builtin_constant_p(fmh) ? fmh != cpu_to_le64(1ULL << 63) : 0); diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c new file mode 100644 index 000000000000..932e8b353ba1 --- /dev/null +++ b/fs/erofs/fileio.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#include "internal.h" +#include <trace/events/erofs.h> + +struct erofs_fileio_rq { + struct bio_vec bvecs[16]; + struct bio bio; + struct kiocb iocb; + struct super_block *sb; +}; + +struct erofs_fileio { + struct erofs_map_blocks map; + struct erofs_map_dev dev; + struct erofs_fileio_rq *rq; +}; + +static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) +{ + struct erofs_fileio_rq *rq = + container_of(iocb, struct erofs_fileio_rq, iocb); + struct folio_iter fi; + + if (ret > 0) { + if (ret != rq->bio.bi_iter.bi_size) { + bio_advance(&rq->bio, ret); + zero_fill_bio(&rq->bio); + } + ret = 0; + } + if (rq->bio.bi_end_io) { + if (ret < 0 && !rq->bio.bi_status) + rq->bio.bi_status = errno_to_blk_status(ret); + } else { + bio_for_each_folio_all(fi, &rq->bio) { + DBG_BUGON(folio_test_uptodate(fi.folio)); + erofs_onlinefolio_end(fi.folio, ret, false); + } + } + bio_endio(&rq->bio); + bio_uninit(&rq->bio); + kfree(rq); +} + +static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) +{ + struct iov_iter iter; + int ret; + + if (!rq) + return; + rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; + rq->iocb.ki_ioprio = get_current_ioprio(); + rq->iocb.ki_complete = erofs_fileio_ki_complete; + if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) && + rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) + rq->iocb.ki_flags = IOCB_DIRECT; + iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, + rq->bio.bi_iter.bi_size); + scoped_with_creds(rq->iocb.ki_filp->f_cred) + ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); + if (ret != -EIOCBQUEUED) + erofs_fileio_ki_complete(&rq->iocb, ret); +} + +static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) +{ + struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq), + GFP_KERNEL | __GFP_NOFAIL); + + bio_init(&rq->bio, NULL, rq->bvecs, ARRAY_SIZE(rq->bvecs), REQ_OP_READ); + rq->iocb.ki_filp = mdev->m_dif->file; + rq->sb = mdev->m_sb; + return rq; +} + +struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) +{ + return &erofs_fileio_rq_alloc(mdev)->bio; +} + +void erofs_fileio_submit_bio(struct bio *bio) +{ + return erofs_fileio_rq_submit(container_of(bio, struct erofs_fileio_rq, + bio)); +} + +static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) +{ + struct inode *inode = folio_inode(folio); + struct erofs_map_blocks *map = &io->map; + unsigned int cur = 0, end = folio_size(folio), len, attached = 0; + loff_t pos = folio_pos(folio), ofs; + int err = 0; + + erofs_onlinefolio_init(folio); + while (cur < end) { + if (!in_range(pos + cur, map->m_la, map->m_llen)) { + map->m_la = pos + cur; + map->m_llen = end - cur; + err = erofs_map_blocks(inode, map); + if (err) + break; + } + + ofs = folio_pos(folio) + cur - map->m_la; + len = min_t(loff_t, map->m_llen - ofs, end - cur); + if (map->m_flags & EROFS_MAP_META) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *src; + + src = erofs_read_metabuf(&buf, inode->i_sb, + map->m_pa + ofs, erofs_inode_in_metabox(inode)); + if (IS_ERR(src)) { + err = PTR_ERR(src); + break; + } + memcpy_to_folio(folio, cur, src, len); + erofs_put_metabuf(&buf); + } else if (!(map->m_flags & EROFS_MAP_MAPPED)) { + folio_zero_segment(folio, cur, cur + len); + attached = 0; + } else { + if (io->rq && (map->m_pa + ofs != io->dev.m_pa || + map->m_deviceid != io->dev.m_deviceid)) { +io_retry: + erofs_fileio_rq_submit(io->rq); + io->rq = NULL; + } + + if (!io->rq) { + io->dev = (struct erofs_map_dev) { + .m_pa = io->map.m_pa + ofs, + .m_deviceid = io->map.m_deviceid, + }; + err = erofs_map_dev(inode->i_sb, &io->dev); + if (err) + break; + io->rq = erofs_fileio_rq_alloc(&io->dev); + io->rq->bio.bi_iter.bi_sector = + (io->dev.m_dif->fsoff + io->dev.m_pa) >> 9; + attached = 0; + } + if (!bio_add_folio(&io->rq->bio, folio, len, cur)) + goto io_retry; + if (!attached++) + erofs_onlinefolio_split(folio); + io->dev.m_pa += len; + } + cur += len; + } + erofs_onlinefolio_end(folio, err, false); + return err; +} + +static int erofs_fileio_read_folio(struct file *file, struct folio *folio) +{ + struct erofs_fileio io = {}; + int err; + + trace_erofs_read_folio(folio, true); + err = erofs_fileio_scan_folio(&io, folio); + erofs_fileio_rq_submit(io.rq); + return err; +} + +static void erofs_fileio_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct erofs_fileio io = {}; + struct folio *folio; + int err; + + trace_erofs_readahead(inode, readahead_index(rac), + readahead_count(rac), true); + while ((folio = readahead_folio(rac))) { + err = erofs_fileio_scan_folio(&io, folio); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(inode)->nid); + } + erofs_fileio_rq_submit(io.rq); +} + +const struct address_space_operations erofs_fileio_aops = { + .read_folio = erofs_fileio_read_folio, + .readahead = erofs_fileio_readahead, +}; diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 014e20962376..7a346e20f7b7 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -3,17 +3,37 @@ * Copyright (C) 2022, Alibaba Cloud * Copyright (C) 2022, Bytedance Inc. All rights reserved. */ +#include <linux/pseudo_fs.h> #include <linux/fscache.h> #include "internal.h" static DEFINE_MUTEX(erofs_domain_list_lock); static DEFINE_MUTEX(erofs_domain_cookies_lock); static LIST_HEAD(erofs_domain_list); +static LIST_HEAD(erofs_domain_cookies_list); static struct vfsmount *erofs_pseudo_mnt; -struct erofs_fscache_request { - struct erofs_fscache_request *primary; - struct netfs_cache_resources cache_resources; +static int erofs_anon_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type erofs_anon_fs_type = { + .owner = THIS_MODULE, + .name = "pseudo_erofs", + .init_fs_context = erofs_anon_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct erofs_fscache_io { + struct netfs_cache_resources cres; + struct iov_iter iter; + netfs_io_terminated_t end_io; + void *private; + refcount_t ref; +}; + +struct erofs_fscache_rq { struct address_space *mapping; /* The mapping being accessed */ loff_t start; /* Start position */ size_t len; /* Length of the request */ @@ -22,44 +42,17 @@ struct erofs_fscache_request { refcount_t ref; }; -static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping, - loff_t start, size_t len) +static bool erofs_fscache_io_put(struct erofs_fscache_io *io) { - struct erofs_fscache_request *req; - - req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL); - if (!req) - return ERR_PTR(-ENOMEM); - - req->mapping = mapping; - req->start = start; - req->len = len; - refcount_set(&req->ref, 1); - - return req; -} - -static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary, - size_t len) -{ - struct erofs_fscache_request *req; - - /* use primary request for the first submission */ - if (!primary->submitted) { - refcount_inc(&primary->ref); - return primary; - } - - req = erofs_fscache_req_alloc(primary->mapping, - primary->start + primary->submitted, len); - if (!IS_ERR(req)) { - req->primary = primary; - refcount_inc(&primary->ref); - } - return req; + if (!refcount_dec_and_test(&io->ref)) + return false; + if (io->cres.ops) + io->cres.ops->end_operation(&io->cres); + kfree(io); + return true; } -static void erofs_fscache_req_complete(struct erofs_fscache_request *req) +static void erofs_fscache_req_complete(struct erofs_fscache_rq *req) { struct folio *folio; bool failed = req->error; @@ -79,169 +72,231 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req) rcu_read_unlock(); } -static void erofs_fscache_req_put(struct erofs_fscache_request *req) +static void erofs_fscache_req_put(struct erofs_fscache_rq *req) { - if (refcount_dec_and_test(&req->ref)) { - if (req->cache_resources.ops) - req->cache_resources.ops->end_operation(&req->cache_resources); - if (!req->primary) - erofs_fscache_req_complete(req); - else - erofs_fscache_req_put(req->primary); - kfree(req); - } + if (!refcount_dec_and_test(&req->ref)) + return; + erofs_fscache_req_complete(req); + kfree(req); } -static void erofs_fscache_subreq_complete(void *priv, - ssize_t transferred_or_error, bool was_async) +static struct erofs_fscache_rq *erofs_fscache_req_alloc(struct address_space *mapping, + loff_t start, size_t len) { - struct erofs_fscache_request *req = priv; + struct erofs_fscache_rq *req = kzalloc(sizeof(*req), GFP_KERNEL); - if (IS_ERR_VALUE(transferred_or_error)) { - if (req->primary) - req->primary->error = transferred_or_error; - else - req->error = transferred_or_error; - } - erofs_fscache_req_put(req); + if (!req) + return NULL; + req->mapping = mapping; + req->start = start; + req->len = len; + refcount_set(&req->ref, 1); + return req; +} + +static void erofs_fscache_req_io_put(struct erofs_fscache_io *io) +{ + struct erofs_fscache_rq *req = io->private; + + if (erofs_fscache_io_put(io)) + erofs_fscache_req_put(req); +} + +static void erofs_fscache_req_end_io(void *priv, ssize_t transferred_or_error) +{ + struct erofs_fscache_io *io = priv; + struct erofs_fscache_rq *req = io->private; + + if (IS_ERR_VALUE(transferred_or_error)) + req->error = transferred_or_error; + erofs_fscache_req_io_put(io); +} + +static struct erofs_fscache_io *erofs_fscache_req_io_alloc(struct erofs_fscache_rq *req) +{ + struct erofs_fscache_io *io = kzalloc(sizeof(*io), GFP_KERNEL); + + if (!io) + return NULL; + io->end_io = erofs_fscache_req_end_io; + io->private = req; + refcount_inc(&req->ref); + refcount_set(&io->ref, 1); + return io; } /* - * Read data from fscache (cookie, pstart, len), and fill the read data into - * page cache described by (req->mapping, lstart, len). @pstart describeis the - * start physical address in the cache file. + * Read data from fscache described by cookie at pstart physical address + * offset, and fill the read data into buffer described by io->iter. */ -static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, - struct erofs_fscache_request *req, loff_t pstart, size_t len) +static int erofs_fscache_read_io_async(struct fscache_cookie *cookie, + loff_t pstart, struct erofs_fscache_io *io) { enum netfs_io_source source; - struct super_block *sb = req->mapping->host->i_sb; - struct netfs_cache_resources *cres = &req->cache_resources; - struct iov_iter iter; - loff_t lstart = req->start + req->submitted; - size_t done = 0; + struct netfs_cache_resources *cres = &io->cres; + struct iov_iter *iter = &io->iter; int ret; - DBG_BUGON(len > req->len - req->submitted); - ret = fscache_begin_read_operation(cres, cookie); if (ret) return ret; - while (done < len) { - loff_t sstart = pstart + done; - size_t slen = len - done; + while (iov_iter_count(iter)) { + size_t orig_count = iov_iter_count(iter), len = orig_count; unsigned long flags = 1 << NETFS_SREQ_ONDEMAND; source = cres->ops->prepare_ondemand_read(cres, - sstart, &slen, LLONG_MAX, &flags, 0); - if (WARN_ON(slen == 0)) + pstart, &len, LLONG_MAX, &flags, 0); + if (WARN_ON(len == 0)) source = NETFS_INVALID_READ; if (source != NETFS_READ_FROM_CACHE) { - erofs_err(sb, "failed to fscache prepare_read (source %d)", source); + erofs_err(NULL, "prepare_ondemand_read failed (source %d)", source); return -EIO; } - refcount_inc(&req->ref); - iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages, - lstart + done, slen); - - ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL, - erofs_fscache_subreq_complete, req); + iov_iter_truncate(iter, len); + refcount_inc(&io->ref); + ret = fscache_read(cres, pstart, iter, NETFS_READ_HOLE_FAIL, + io->end_io, io); if (ret == -EIOCBQUEUED) ret = 0; if (ret) { - erofs_err(sb, "failed to fscache_read (ret %d)", ret); + erofs_err(NULL, "fscache_read failed (ret %d)", ret); return ret; } + if (WARN_ON(iov_iter_count(iter))) + return -EIO; - done += slen; + iov_iter_reexpand(iter, orig_count - len); + pstart += len; } - DBG_BUGON(done != len); return 0; } -static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) +struct erofs_fscache_bio { + struct erofs_fscache_io io; + struct bio bio; /* w/o bdev to share bio_add_page/endio() */ + struct bio_vec bvecs[BIO_MAX_VECS]; +}; + +static void erofs_fscache_bio_endio(void *priv, ssize_t transferred_or_error) +{ + struct erofs_fscache_bio *io = priv; + + if (IS_ERR_VALUE(transferred_or_error)) + io->bio.bi_status = errno_to_blk_status(transferred_or_error); + bio_endio(&io->bio); + BUILD_BUG_ON(offsetof(struct erofs_fscache_bio, io) != 0); + erofs_fscache_io_put(&io->io); +} + +struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { + struct erofs_fscache_bio *io; + + io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL); + bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ); + io->io.private = mdev->m_dif->fscache->cookie; + io->io.end_io = erofs_fscache_bio_endio; + refcount_set(&io->io.ref, 1); + return &io->bio; +} + +void erofs_fscache_submit_bio(struct bio *bio) +{ + struct erofs_fscache_bio *io = container_of(bio, + struct erofs_fscache_bio, bio); int ret; - struct super_block *sb = folio_mapping(folio)->host->i_sb; - struct erofs_fscache_request *req; - struct erofs_map_dev mdev = { - .m_deviceid = 0, - .m_pa = folio_pos(folio), - }; - ret = erofs_map_dev(sb, &mdev); - if (ret) { + iov_iter_bvec(&io->io.iter, ITER_DEST, io->bvecs, bio->bi_vcnt, + bio->bi_iter.bi_size); + ret = erofs_fscache_read_io_async(io->io.private, + bio->bi_iter.bi_sector << 9, &io->io); + erofs_fscache_io_put(&io->io); + if (!ret) + return; + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); +} + +static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) +{ + struct erofs_fscache *ctx = folio->mapping->host->i_private; + int ret = -ENOMEM; + struct erofs_fscache_rq *req; + struct erofs_fscache_io *io; + + req = erofs_fscache_req_alloc(folio->mapping, + folio_pos(folio), folio_size(folio)); + if (!req) { folio_unlock(folio); return ret; } - req = erofs_fscache_req_alloc(folio_mapping(folio), - folio_pos(folio), folio_size(folio)); - if (IS_ERR(req)) { - folio_unlock(folio); - return PTR_ERR(req); + io = erofs_fscache_req_io_alloc(req); + if (!io) { + req->error = ret; + goto out; } + iov_iter_xarray(&io->iter, ITER_DEST, &folio->mapping->i_pages, + folio_pos(folio), folio_size(folio)); - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - req, mdev.m_pa, folio_size(folio)); + ret = erofs_fscache_read_io_async(ctx->cookie, folio_pos(folio), io); if (ret) req->error = ret; + erofs_fscache_req_io_put(io); +out: erofs_fscache_req_put(req); return ret; } -static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) +static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) { - struct address_space *mapping = primary->mapping; + struct address_space *mapping = req->mapping; struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - struct erofs_fscache_request *req; + struct erofs_fscache_io *io; struct erofs_map_blocks map; struct erofs_map_dev mdev; - struct iov_iter iter; - loff_t pos = primary->start + primary->submitted; + loff_t pos = req->start + req->submitted; size_t count; int ret; map.m_la = pos; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + ret = erofs_map_blocks(inode, &map); if (ret) return ret; if (map.m_flags & EROFS_MAP_META) { struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - erofs_blk_t blknr; - size_t offset, size; + struct iov_iter iter; + size_t size = map.m_llen; void *src; - /* For tail packing layout, the offset may be non-zero. */ - offset = erofs_blkoff(map.m_pa); - blknr = erofs_blknr(map.m_pa); - size = map.m_llen; - - src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); + src = erofs_read_metabuf(&buf, sb, map.m_pa, + erofs_inode_in_metabox(inode)); if (IS_ERR(src)) return PTR_ERR(src); iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE); - if (copy_to_iter(src + offset, size, &iter) != size) { + if (copy_to_iter(src, size, &iter) != size) { erofs_put_metabuf(&buf); return -EFAULT; } iov_iter_zero(PAGE_SIZE - size, &iter); erofs_put_metabuf(&buf); - primary->submitted += PAGE_SIZE; + req->submitted += PAGE_SIZE; return 0; } - count = primary->len - primary->submitted; + count = req->len - req->submitted; if (!(map.m_flags & EROFS_MAP_MAPPED)) { + struct iov_iter iter; + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count); iov_iter_zero(count, &iter); - primary->submitted += count; + req->submitted += count; return 0; } @@ -256,18 +311,19 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) if (ret) return ret; - req = erofs_fscache_req_chain(primary, count); - if (IS_ERR(req)) - return PTR_ERR(req); + io = erofs_fscache_req_io_alloc(req); + if (!io) + return -ENOMEM; + iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count); + ret = erofs_fscache_read_io_async(mdev.m_dif->fscache->cookie, + mdev.m_pa + (pos - map.m_la), io); + erofs_fscache_req_io_put(io); - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - req, mdev.m_pa + (pos - map.m_la), count); - erofs_fscache_req_put(req); - primary->submitted += count; + req->submitted += count; return ret; } -static int erofs_fscache_data_read(struct erofs_fscache_request *req) +static int erofs_fscache_data_read(struct erofs_fscache_rq *req) { int ret; @@ -276,20 +332,19 @@ static int erofs_fscache_data_read(struct erofs_fscache_request *req) if (ret) req->error = ret; } while (!ret && req->submitted < req->len); - return ret; } static int erofs_fscache_read_folio(struct file *file, struct folio *folio) { - struct erofs_fscache_request *req; + struct erofs_fscache_rq *req; int ret; - req = erofs_fscache_req_alloc(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); - if (IS_ERR(req)) { + if (!req) { folio_unlock(folio); - return PTR_ERR(req); + return -ENOMEM; } ret = erofs_fscache_data_read(req); @@ -299,14 +354,14 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio) static void erofs_fscache_readahead(struct readahead_control *rac) { - struct erofs_fscache_request *req; + struct erofs_fscache_rq *req; if (!readahead_count(rac)) return; req = erofs_fscache_req_alloc(rac->mapping, readahead_pos(rac), readahead_length(rac)); - if (IS_ERR(req)) + if (!req) return; /* The request completion will drop refs on the folios. */ @@ -328,8 +383,6 @@ const struct address_space_operations erofs_fscache_access_aops = { static void erofs_fscache_domain_put(struct erofs_domain *domain) { - if (!domain) - return; mutex_lock(&erofs_domain_list_lock); if (refcount_dec_and_test(&domain->ref)) { list_del(&domain->list); @@ -337,8 +390,8 @@ static void erofs_fscache_domain_put(struct erofs_domain *domain) kern_unmount(erofs_pseudo_mnt); erofs_pseudo_mnt = NULL; } - mutex_unlock(&erofs_domain_list_lock); fscache_relinquish_volume(domain->volume, NULL, false); + mutex_unlock(&erofs_domain_list_lock); kfree(domain->domain_id); kfree(domain); return; @@ -392,11 +445,12 @@ static int erofs_fscache_init_domain(struct super_block *sb) goto out; if (!erofs_pseudo_mnt) { - erofs_pseudo_mnt = kern_mount(&erofs_fs_type); - if (IS_ERR(erofs_pseudo_mnt)) { - err = PTR_ERR(erofs_pseudo_mnt); + struct vfsmount *mnt = kern_mount(&erofs_anon_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); goto out; } + erofs_pseudo_mnt = mnt; } domain->volume = sbi->volume; @@ -431,19 +485,21 @@ static int erofs_fscache_register_domain(struct super_block *sb) return err; } -static -struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, + char *name, unsigned int flags) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; struct fscache_cookie *cookie; + struct super_block *isb; + struct inode *inode; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&ctx->node); + refcount_set(&ctx->ref, 1); cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE, name, strlen(name), NULL, 0, 0); @@ -452,32 +508,33 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, ret = -EINVAL; goto err; } - fscache_use_cookie(cookie, false); - ctx->cookie = cookie; - - if (flags & EROFS_REG_COOKIE_NEED_INODE) { - struct inode *const inode = new_inode(sb); - - if (!inode) { - erofs_err(sb, "failed to get anon inode for %s", name); - ret = -ENOMEM; - goto err_cookie; - } - - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - inode->i_mapping->a_ops = &erofs_fscache_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - ctx->inode = inode; + /* + * Allocate anonymous inode in global pseudo mount for shareable blobs, + * so that they are accessible among erofs fs instances. + */ + isb = flags & EROFS_REG_COOKIE_SHARE ? erofs_pseudo_mnt->mnt_sb : sb; + inode = new_inode(isb); + if (!inode) { + erofs_err(sb, "failed to get anon inode for %s", name); + ret = -ENOMEM; + goto err_cookie; } + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &erofs_fscache_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); + inode->i_blkbits = EROFS_SB(sb)->blkszbits; + inode->i_private = ctx; + + ctx->cookie = cookie; + ctx->inode = inode; return ctx; err_cookie: - fscache_unuse_cookie(ctx->cookie, NULL, NULL); - fscache_relinquish_cookie(ctx->cookie, false); + fscache_unuse_cookie(cookie, NULL, NULL); + fscache_relinquish_cookie(cookie, false); err: kfree(ctx); return ERR_PTR(ret); @@ -492,13 +549,9 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) kfree(ctx); } -static -struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_domain_init_cookie(struct super_block *sb, + char *name, unsigned int flags) { - int err; - struct inode *inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; @@ -508,55 +561,38 @@ struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, ctx->name = kstrdup(name, GFP_KERNEL); if (!ctx->name) { - err = -ENOMEM; - goto out; - } - - inode = new_inode(erofs_pseudo_mnt->mnt_sb); - if (!inode) { - err = -ENOMEM; - goto out; + erofs_fscache_relinquish_cookie(ctx); + return ERR_PTR(-ENOMEM); } - ctx->domain = domain; - ctx->anon_inode = inode; - inode->i_private = ctx; refcount_inc(&domain->ref); + ctx->domain = domain; + list_add(&ctx->node, &erofs_domain_cookies_list); return ctx; -out: - erofs_fscache_relinquish_cookie(ctx); - return ERR_PTR(err); } -static -struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, + char *name, unsigned int flags) { - struct inode *inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; - struct super_block *psb = erofs_pseudo_mnt->mnt_sb; + flags |= EROFS_REG_COOKIE_SHARE; mutex_lock(&erofs_domain_cookies_lock); - spin_lock(&psb->s_inode_list_lock); - list_for_each_entry(inode, &psb->s_inodes, i_sb_list) { - ctx = inode->i_private; - if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) + list_for_each_entry(ctx, &erofs_domain_cookies_list, node) { + if (ctx->domain != domain || strcmp(ctx->name, name)) continue; if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { - igrab(inode); + refcount_inc(&ctx->ref); } else { erofs_err(sb, "%s already exists in domain %s", name, domain->domain_id); ctx = ERR_PTR(-EEXIST); } - spin_unlock(&psb->s_inode_list_lock); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } - spin_unlock(&psb->s_inode_list_lock); - ctx = erofs_fscache_domain_init_cookie(sb, name, flags); + ctx = erofs_domain_init_cookie(sb, name, flags); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } @@ -572,23 +608,22 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) { - bool drop; - struct erofs_domain *domain; + struct erofs_domain *domain = NULL; if (!ctx) return; - domain = ctx->domain; - if (domain) { - mutex_lock(&erofs_domain_cookies_lock); - drop = atomic_read(&ctx->anon_inode->i_count) == 1; - iput(ctx->anon_inode); - mutex_unlock(&erofs_domain_cookies_lock); - if (!drop) - return; - } + if (!ctx->domain) + return erofs_fscache_relinquish_cookie(ctx); - erofs_fscache_relinquish_cookie(ctx); - erofs_fscache_domain_put(domain); + mutex_lock(&erofs_domain_cookies_lock); + if (refcount_dec_and_test(&ctx->ref)) { + domain = ctx->domain; + list_del(&ctx->node); + erofs_fscache_relinquish_cookie(ctx); + } + mutex_unlock(&erofs_domain_cookies_lock); + if (domain) + erofs_fscache_domain_put(domain); } int erofs_fscache_register_fs(struct super_block *sb) @@ -596,7 +631,7 @@ int erofs_fscache_register_fs(struct super_block *sb) int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; - unsigned int flags; + unsigned int flags = 0; if (sbi->domain_id) ret = erofs_fscache_register_domain(sb); @@ -615,14 +650,13 @@ int erofs_fscache_register_fs(struct super_block *sb) * * Acquired domain/volume will be relinquished in kill_sb() on error. */ - flags = EROFS_REG_COOKIE_NEED_INODE; if (sbi->domain_id) flags |= EROFS_REG_COOKIE_NEED_NOEXIST; fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); if (IS_ERR(fscache)) return PTR_ERR(fscache); - sbi->s_fscache = fscache; + sbi->dif0.fscache = fscache; return 0; } @@ -630,14 +664,14 @@ void erofs_fscache_unregister_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - erofs_fscache_unregister_cookie(sbi->s_fscache); + erofs_fscache_unregister_cookie(sbi->dif0.fscache); if (sbi->domain) erofs_fscache_domain_put(sbi->domain); else fscache_relinquish_volume(sbi->volume, NULL, false); - sbi->s_fscache = NULL; + sbi->dif0.fscache = NULL; sbi->volume = NULL; sbi->domain = NULL; } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d3b8736fa124..bce98c845a18 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -5,42 +5,54 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" - +#include <linux/compat.h> #include <trace/events/erofs.h> -static void *erofs_read_inode(struct erofs_buf *buf, - struct inode *inode, unsigned int *ofs) +static int erofs_fill_symlink(struct inode *inode, void *kaddr, + unsigned int m_pofs) +{ + struct erofs_inode *vi = EROFS_I(inode); + loff_t off; + + m_pofs += vi->xattr_isize; + /* check if it cannot be handled with fast symlink scheme */ + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || + check_add_overflow(m_pofs, inode->i_size, &off) || + off > i_blocksize(inode)) + return 0; + + inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL); + return inode->i_link ? 0 : -ENOMEM; +} + +static int erofs_read_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; + erofs_blk_t blkaddr = erofs_blknr(sb, erofs_iloc(inode)); + unsigned int ofs = erofs_blkoff(sb, erofs_iloc(inode)); + bool in_mbox = erofs_inode_in_metabox(inode); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_sb_info *sbi = EROFS_SB(sb); + erofs_blk_t addrmask = BIT_ULL(48) - 1; struct erofs_inode *vi = EROFS_I(inode); - const erofs_off_t inode_loc = iloc(sbi, vi->nid); - - erofs_blk_t blkaddr, nblks = 0; - void *kaddr; + struct erofs_inode_extended *die, copied; struct erofs_inode_compact *dic; - struct erofs_inode_extended *die, *copied = NULL; unsigned int ifmt; - int err; - - blkaddr = erofs_blknr(inode_loc); - *ofs = erofs_blkoff(inode_loc); - - erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u", - __func__, vi->nid, *ofs, blkaddr); + void *ptr; + int err = 0; - kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP); - if (IS_ERR(kaddr)) { - erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", - vi->nid, PTR_ERR(kaddr)); - return kaddr; + ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), in_mbox); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + erofs_err(sb, "failed to read inode meta block (nid: %llu): %d", + vi->nid, err); + goto err_out; } - dic = kaddr + *ofs; + dic = ptr + ofs; ifmt = le16_to_cpu(dic->i_format); - if (ifmt & ~EROFS_I_ALL) { - erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu", + erofs_err(sb, "unsupported i_format %u of nid %llu", ifmt, vi->nid); err = -EOPNOTSUPP; goto err_out; @@ -48,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->datalayout = erofs_inode_datalayout(ifmt); if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) { - erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu", + erofs_err(sb, "unsupported datalayout %u of nid %llu", vi->datalayout, vi->nid); err = -EOPNOTSUPP; goto err_out; @@ -58,212 +70,151 @@ static void *erofs_read_inode(struct erofs_buf *buf, case EROFS_INODE_LAYOUT_EXTENDED: vi->inode_isize = sizeof(struct erofs_inode_extended); /* check if the extended inode acrosses block boundary */ - if (*ofs + vi->inode_isize <= EROFS_BLKSIZ) { - *ofs += vi->inode_isize; + if (ofs + vi->inode_isize <= sb->s_blocksize) { + ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; + copied.i_u = die->i_u; + copied.i_nb = die->i_nb; } else { - const unsigned int gotten = EROFS_BLKSIZ - *ofs; - - copied = kmalloc(vi->inode_isize, GFP_NOFS); - if (!copied) { - err = -ENOMEM; + const unsigned int gotten = sb->s_blocksize - ofs; + + memcpy(&copied, dic, gotten); + ptr = erofs_read_metabuf(&buf, sb, + erofs_pos(sb, blkaddr + 1), in_mbox); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + erofs_err(sb, "failed to read inode payload block (nid: %llu): %d", + vi->nid, err); goto err_out; } - memcpy(copied, dic, gotten); - kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1, - EROFS_KMAP); - if (IS_ERR(kaddr)) { - erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", - vi->nid, PTR_ERR(kaddr)); - kfree(copied); - return kaddr; - } - *ofs = vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, kaddr, *ofs); - die = copied; + ofs = vi->inode_isize - gotten; + memcpy((u8 *)&copied + gotten, ptr, ofs); + die = &copied; } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); inode->i_mode = le16_to_cpu(die->i_mode); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr); - break; - case S_IFCHR: - case S_IFBLK: - inode->i_rdev = - new_decode_dev(le32_to_cpu(die->i_u.rdev)); - break; - case S_IFIFO: - case S_IFSOCK: - inode->i_rdev = 0; - break; - default: - goto bogusimode; - } i_uid_write(inode, le32_to_cpu(die->i_uid)); i_gid_write(inode, le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); - - /* extended inode has its own timestamp */ - inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime); - inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec); + inode_set_mtime(inode, le64_to_cpu(die->i_mtime), + le32_to_cpu(die->i_mtime_nsec)); inode->i_size = le64_to_cpu(die->i_size); - - /* total blocks for compressed files */ - if (erofs_inode_is_data_compressed(vi->datalayout)) - nblks = le32_to_cpu(die->i_u.compressed_blocks); - else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) - /* fill chunked inode summary info */ - vi->chunkformat = le16_to_cpu(die->i_u.c.format); - kfree(copied); - copied = NULL; break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); - *ofs += vi->inode_isize; + ofs += vi->inode_isize; vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); inode->i_mode = le16_to_cpu(dic->i_mode); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr); - break; - case S_IFCHR: - case S_IFBLK: - inode->i_rdev = - new_decode_dev(le32_to_cpu(dic->i_u.rdev)); - break; - case S_IFIFO: - case S_IFSOCK: - inode->i_rdev = 0; - break; - default: - goto bogusimode; - } + copied.i_u = dic->i_u; i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_gid_write(inode, le16_to_cpu(dic->i_gid)); - set_nlink(inode, le16_to_cpu(dic->i_nlink)); - - /* use build time for compact inodes */ - inode->i_ctime.tv_sec = sbi->build_time; - inode->i_ctime.tv_nsec = sbi->build_time_nsec; + if (!S_ISDIR(inode->i_mode) && + ((ifmt >> EROFS_I_NLINK_1_BIT) & 1)) { + set_nlink(inode, 1); + copied.i_nb = dic->i_nb; + } else { + set_nlink(inode, le16_to_cpu(dic->i_nb.nlink)); + copied.i_nb.startblk_hi = 0; + addrmask = BIT_ULL(32) - 1; + } + inode_set_mtime(inode, sbi->epoch + le32_to_cpu(dic->i_mtime), + sbi->fixed_nsec); inode->i_size = le32_to_cpu(dic->i_size); - if (erofs_inode_is_data_compressed(vi->datalayout)) - nblks = le32_to_cpu(dic->i_u.compressed_blocks); - else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) - vi->chunkformat = le16_to_cpu(dic->i_u.c.format); break; default: - erofs_err(inode->i_sb, - "unsupported on-disk inode version %u of nid %llu", + erofs_err(sb, "unsupported on-disk inode version %u of nid %llu", erofs_inode_version(ifmt), vi->nid); err = -EOPNOTSUPP; goto err_out; } + if (unlikely(inode->i_size < 0)) { + erofs_err(sb, "negative i_size @ nid %llu", vi->nid); + err = -EFSCORRUPTED; + goto err_out; + } + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + vi->dot_omitted = (ifmt >> EROFS_I_DOT_OMITTED_BIT) & 1; + fallthrough; + case S_IFREG: + case S_IFLNK: + vi->startblk = le32_to_cpu(copied.i_u.startblk_lo) | + ((u64)le16_to_cpu(copied.i_nb.startblk_hi) << 32); + if (vi->datalayout == EROFS_INODE_FLAT_PLAIN && + !((vi->startblk ^ EROFS_NULL_ADDR) & addrmask)) + vi->startblk = EROFS_NULL_ADDR; + + if(S_ISLNK(inode->i_mode)) { + err = erofs_fill_symlink(inode, ptr, ofs); + if (err) + goto err_out; + } + break; + case S_IFCHR: + case S_IFBLK: + inode->i_rdev = new_decode_dev(le32_to_cpu(copied.i_u.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: + inode->i_rdev = 0; + break; + default: + erofs_err(sb, "bogus i_mode (%o) @ nid %llu", inode->i_mode, + vi->nid); + err = -EFSCORRUPTED; + goto err_out; + } + + if (erofs_inode_is_data_compressed(vi->datalayout)) + inode->i_blocks = le32_to_cpu(copied.i_u.blocks_lo) << + (sb->s_blocksize_bits - 9); + else + inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; + if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + /* fill chunked inode summary info */ + vi->chunkformat = le16_to_cpu(copied.i_u.c.format); if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { - erofs_err(inode->i_sb, - "unsupported chunk format %x of nid %llu", + erofs_err(sb, "unsupported chunk format %x of nid %llu", vi->chunkformat, vi->nid); err = -EOPNOTSUPP; goto err_out; } - vi->chunkbits = LOG_BLOCK_SIZE + + vi->chunkbits = sb->s_blocksize_bits + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); } - inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; - inode->i_atime.tv_sec = inode->i_ctime.tv_sec; - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; - inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; + inode_set_atime_to_ts(inode, + inode_set_ctime_to_ts(inode, inode_get_mtime(inode))); inode->i_flags &= ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && - vi->datalayout == EROFS_INODE_FLAT_PLAIN) + (vi->datalayout == EROFS_INODE_FLAT_PLAIN || + vi->datalayout == EROFS_INODE_CHUNK_BASED)) inode->i_flags |= S_DAX; - if (!nblks) - /* measure inode.i_blocks as generic filesystems */ - inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; - else - inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; - return kaddr; - -bogusimode: - erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", - inode->i_mode, vi->nid); - err = -EFSCORRUPTED; err_out: - DBG_BUGON(1); - kfree(copied); - erofs_put_metabuf(buf); - return ERR_PTR(err); -} - -static int erofs_fill_symlink(struct inode *inode, void *kaddr, - unsigned int m_pofs) -{ - struct erofs_inode *vi = EROFS_I(inode); - char *lnk; - - /* if it cannot be handled with fast symlink scheme */ - if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) { - inode->i_op = &erofs_symlink_iops; - return 0; - } - - lnk = kmalloc(inode->i_size + 1, GFP_KERNEL); - if (!lnk) - return -ENOMEM; - - m_pofs += vi->xattr_isize; - /* inline symlink data shouldn't cross block boundary */ - if (m_pofs + inode->i_size > EROFS_BLKSIZ) { - kfree(lnk); - erofs_err(inode->i_sb, - "inline data cross block boundary @ nid %llu", - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } - memcpy(lnk, kaddr + m_pofs, inode->i_size); - lnk[inode->i_size] = '\0'; - - inode->i_link = lnk; - inode->i_op = &erofs_fast_symlink_iops; - return 0; + erofs_put_metabuf(&buf); + return err; } static int erofs_fill_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - void *kaddr; - unsigned int ofs; - int err = 0; + int err; trace_erofs_fill_inode(inode); + err = erofs_read_inode(inode); + if (err) + return err; - /* read inode base data from disk */ - kaddr = erofs_read_inode(&buf, inode, &ofs); - if (IS_ERR(kaddr)) - return PTR_ERR(kaddr); - - /* setup the new inode */ switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &erofs_generic_iops; - if (erofs_inode_is_data_compressed(vi->datalayout)) - inode->i_fop = &generic_ro_fops; - else - inode->i_fop = &erofs_file_fops; + inode->i_fop = &erofs_file_fops; break; case S_IFDIR: inode->i_op = &erofs_dir_iops; @@ -271,105 +222,157 @@ static int erofs_fill_inode(struct inode *inode) inode_nohighmem(inode); break; case S_IFLNK: - err = erofs_fill_symlink(inode, kaddr, ofs); - if (err) - goto out_unlock; + if (inode->i_link) + inode->i_op = &erofs_fast_symlink_iops; + else + inode->i_op = &erofs_symlink_iops; inode_nohighmem(inode); break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: + default: inode->i_op = &erofs_generic_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); - goto out_unlock; - default: - err = -EFSCORRUPTED; - goto out_unlock; + return 0; } - if (erofs_inode_is_data_compressed(vi->datalayout)) { - if (!erofs_is_fscache_mode(inode->i_sb)) - err = z_erofs_fill_inode(inode); - else - err = -EOPNOTSUPP; - goto out_unlock; - } - inode->i_mapping->a_ops = &erofs_raw_access_aops; mapping_set_large_folios(inode->i_mapping); + if (erofs_inode_is_data_compressed(vi->datalayout)) { +#ifdef CONFIG_EROFS_FS_ZIP + DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, + erofs_info, inode->i_sb, + "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); + inode->i_mapping->a_ops = &z_erofs_aops; +#else + err = -EOPNOTSUPP; +#endif + } else { + inode->i_mapping->a_ops = &erofs_aops; #ifdef CONFIG_EROFS_FS_ONDEMAND - if (erofs_is_fscache_mode(inode->i_sb)) - inode->i_mapping->a_ops = &erofs_fscache_access_aops; + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; +#endif +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) + inode->i_mapping->a_ops = &erofs_fileio_aops; #endif + } -out_unlock: - erofs_put_metabuf(&buf); return err; } /* - * erofs nid is 64bits, but i_ino is 'unsigned long', therefore - * we should do more for 32-bit platform to find the right inode. + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. */ -static int erofs_ilookup_test_actor(struct inode *inode, void *opaque) +static ino_t erofs_squash_ino(struct super_block *sb, erofs_nid_t nid) { - const erofs_nid_t nid = *(erofs_nid_t *)opaque; + u64 ino64 = erofs_nid_to_ino64(EROFS_SB(sb), nid); - return EROFS_I(inode)->nid == nid; + if (sizeof(ino_t) < sizeof(erofs_nid_t)) + ino64 ^= ino64 >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8; + return (ino_t)ino64; } -static int erofs_iget_set_actor(struct inode *inode, void *opaque) +static int erofs_iget5_eq(struct inode *inode, void *opaque) +{ + return EROFS_I(inode)->nid == *(erofs_nid_t *)opaque; +} + +static int erofs_iget5_set(struct inode *inode, void *opaque) { const erofs_nid_t nid = *(erofs_nid_t *)opaque; - inode->i_ino = erofs_inode_hash(nid); + inode->i_ino = erofs_squash_ino(inode->i_sb, nid); + EROFS_I(inode)->nid = nid; return 0; } struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) { - const unsigned long hashval = erofs_inode_hash(nid); struct inode *inode; - inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor, - erofs_iget_set_actor, &nid); + inode = iget5_locked(sb, erofs_squash_ino(sb, nid), erofs_iget5_eq, + erofs_iget5_set, &nid); if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { - int err; - struct erofs_inode *vi = EROFS_I(inode); + if (inode_state_read_once(inode) & I_NEW) { + int err = erofs_fill_inode(inode); - vi->nid = nid; - - err = erofs_fill_inode(inode); - if (!err) { - unlock_new_inode(inode); - } else { + if (err) { iget_failed(inode); - inode = ERR_PTR(err); + return ERR_PTR(err); } + unlock_new_inode(inode); } return inode; } -int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *const inode = d_inode(path->dentry); + struct block_device *bdev = inode->i_sb->s_bdev; + bool compressed = + erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout); - if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) + if (compressed) stat->attributes |= STATX_ATTR_COMPRESSED; - stat->attributes |= STATX_ATTR_IMMUTABLE; stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); - generic_fillattr(mnt_userns, inode, stat); + /* + * Return the DIO alignment restrictions if requested. + * + * In EROFS, STATX_DIOALIGN is only supported in bdev-based mode + * and uncompressed inodes, otherwise we report no DIO support. + */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + stat->result_mask |= STATX_DIOALIGN; + if (bdev && !compressed) { + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + } + } + generic_fillattr(idmap, request_mask, inode, stat); return 0; } +static int erofs_ioctl_get_volume_label(struct inode *inode, void __user *arg) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + int ret; + + if (!sbi->volume_name) + ret = clear_user(arg, 1); + else + ret = copy_to_user(arg, sbi->volume_name, + strlen(sbi->volume_name)); + return ret ? -EFAULT : 0; +} + +long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETFSLABEL: + return erofs_ioctl_get_volume_label(inode, argp); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long erofs_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + return erofs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + const struct inode_operations erofs_generic_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index bb8501c0ff5b..f7f622836198 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -8,34 +8,27 @@ #define __EROFS_INTERNAL_H #include <linux/fs.h> +#include <linux/dax.h> #include <linux/dcache.h> #include <linux/mm.h> +#include <linux/module.h> #include <linux/pagemap.h> #include <linux/bio.h> -#include <linux/buffer_head.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/iomap.h> #include "erofs_fs.h" -/* redefine pr_fmt "erofs: " */ -#undef pr_fmt -#define pr_fmt(fmt) "erofs: " fmt - -__printf(3, 4) void _erofs_err(struct super_block *sb, - const char *function, const char *fmt, ...); +__printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...); #define erofs_err(sb, fmt, ...) \ - _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__) -__printf(3, 4) void _erofs_info(struct super_block *sb, - const char *function, const char *fmt, ...); + _erofs_printk(sb, KERN_ERR fmt "\n", ##__VA_ARGS__) #define erofs_info(sb, fmt, ...) \ - _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__) + _erofs_printk(sb, KERN_INFO fmt "\n", ##__VA_ARGS__) + #ifdef CONFIG_EROFS_FS_DEBUG -#define erofs_dbg(x, ...) pr_debug(x "\n", ##__VA_ARGS__) #define DBG_BUGON BUG_ON #else -#define erofs_dbg(x, ...) ((void)0) #define DBG_BUGON(x) ((void)(x)) #endif /* !CONFIG_EROFS_FS_DEBUG */ @@ -44,18 +37,17 @@ __printf(3, 4) void _erofs_info(struct super_block *sb, typedef u64 erofs_nid_t; typedef u64 erofs_off_t; -/* data type for filesystem-wide blocks number */ -typedef u32 erofs_blk_t; +typedef u64 erofs_blk_t; struct erofs_device_info { char *path; struct erofs_fscache *fscache; - struct block_device *bdev; + struct file *file; struct dax_device *dax_dev; - u64 dax_part_off; + u64 fsoff, dax_part_off; - u32 blocks; - u32 mapped_blkaddr; + erofs_blk_t blocks; + erofs_blk_t uniaddr; }; enum { @@ -65,15 +57,12 @@ enum { }; struct erofs_mount_opts { -#ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ unsigned char cache_strategy; /* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */ unsigned int sync_decompress; - /* threshold for decompression synchronously */ unsigned int max_sync_decompress_pages; -#endif unsigned int mount_opt; }; @@ -82,13 +71,7 @@ struct erofs_dev_context { struct rw_semaphore rwsem; unsigned int extra_devices; -}; - -struct erofs_fs_context { - struct erofs_mount_opts opt; - struct erofs_dev_context *devs; - char *fsid; - char *domain_id; + bool flatdev; }; /* all filesystem-wide lz4 configurations */ @@ -108,13 +91,22 @@ struct erofs_domain { struct erofs_fscache { struct fscache_cookie *cookie; - struct inode *inode; - struct inode *anon_inode; + struct inode *inode; /* anonymous inode for the blob */ + + /* used for share domain mode */ struct erofs_domain *domain; + struct list_head node; + refcount_t ref; char *name; }; +struct erofs_xattr_prefix_item { + struct erofs_xattr_long_prefix *prefix; + u8 infix_len; +}; + struct erofs_sb_info { + struct erofs_device_info dif0; struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ @@ -131,44 +123,47 @@ struct erofs_sb_info { struct inode *managed_cache; struct erofs_sb_lz4_info lz4; - struct inode *packed_inode; #endif /* CONFIG_EROFS_FS_ZIP */ + struct inode *packed_inode; + struct inode *metabox_inode; struct erofs_dev_context *devs; - struct dax_device *dax_dev; - u64 dax_part_off; u64 total_blocks; - u32 primarydevice_blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR u32 xattr_blkaddr; + u32 xattr_prefix_start; + u8 xattr_prefix_count; + struct erofs_xattr_prefix_item *xattr_prefixes; + unsigned int xattr_filter_reserved; #endif u16 device_id_mask; /* valid bits of device id to be used */ - /* inode slot unit size in bit shift */ - unsigned char islotbits; + unsigned char islotbits; /* inode slot unit size in bit shift */ + unsigned char blkszbits; /* filesystem block size in bit shift */ u32 sb_size; /* total superblock size */ - u32 build_time_nsec; - u64 build_time; + u32 fixed_nsec; + s64 epoch; /* what we really care is nid, rather than ino.. */ erofs_nid_t root_nid; + erofs_nid_t packed_nid; + erofs_nid_t metabox_nid; /* used for statfs, f_files - f_favail */ u64 inos; - u8 uuid[16]; /* 128-bit uuid for volume */ - u8 volume_name[16]; /* volume name */ + char *volume_name; u32 feature_compat; u32 feature_incompat; /* sysfs support */ struct kobject s_kobj; /* /sys/fs/erofs/<devname> */ struct completion s_kobj_unregister; + erofs_off_t dir_ra_bytes; /* fscache support */ struct fscache_volume *volume; - struct erofs_fscache *s_fscache; struct erofs_domain *domain; char *fsid; char *domain_id; @@ -182,14 +177,21 @@ struct erofs_sb_info { #define EROFS_MOUNT_POSIX_ACL 0x00000020 #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 +#define EROFS_MOUNT_DIRECT_IO 0x00000100 #define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) #define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) +static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) +{ + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file; +} + static inline bool erofs_is_fscache_mode(struct super_block *sb) { - return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev; + return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && + !erofs_is_fileio_mode(EROFS_SB(sb)) && !sb->s_bdev; } enum { @@ -198,83 +200,19 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) - -/* basic unit of the workstation of a super_block */ -struct erofs_workgroup { - /* the workgroup index in the workstation */ - pgoff_t index; - - /* overall workgroup reference count */ - atomic_t refcount; -}; - -static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, - int val) -{ - preempt_disable(); - if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) { - preempt_enable(); - return false; - } - return true; -} - -static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, - int orig_val) -{ - /* - * other observers should notice all modifications - * in the freezing period. - */ - smp_mb(); - atomic_set(&grp->refcount, orig_val); - preempt_enable(); -} - -static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) -{ - return atomic_cond_read_relaxed(&grp->refcount, - VAL != EROFS_LOCKED_MAGIC); -} - -/* we strictly follow PAGE_SIZE and no buffer head yet */ -#define LOG_BLOCK_SIZE PAGE_SHIFT - -#undef LOG_SECTORS_PER_BLOCK -#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) - -#undef SECTORS_PER_BLOCK -#define SECTORS_PER_BLOCK (1 << SECTORS_PER_BLOCK) - -#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE) - -#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ) -#error erofs cannot be used in this platform -#endif - -enum erofs_kmap_type { - EROFS_NO_KMAP, /* don't map the buffer */ - EROFS_KMAP, /* use kmap_local_page() to map the buffer */ -}; - struct erofs_buf { + struct address_space *mapping; + struct file *file; + u64 off; struct page *page; void *base; - enum erofs_kmap_type kmap_type; }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) -#define ROOT_NID(sb) ((sb)->root_nid) - -#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) -#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) -#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) - -static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) -{ - return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); -} +#define erofs_blknr(sb, pos) ((erofs_blk_t)((pos) >> (sb)->s_blocksize_bits)) +#define erofs_blkoff(sb, pos) ((pos) & ((sb)->s_blocksize - 1)) +#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) +#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) #define EROFS_FEATURE_FUNCS(name, compat, feature) \ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ @@ -291,7 +229,30 @@ EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) +EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) +EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT) +EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) +EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) +EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX) +EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX) + +static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid) +{ + if (!erofs_sb_has_metabox(sbi)) + return nid; + + /* + * When metadata compression is enabled, avoid generating excessively + * large inode numbers for metadata-compressed inodes. Shift NIDs in + * the 31-62 bit range left by one and move the metabox flag to bit 31. + * + * Note: on-disk NIDs remain unchanged as they are primarily used for + * compatibility with non-LFS 32-bit applications. + */ + return ((nid << 1) & GENMASK_ULL(63, 32)) | (nid & GENMASK(30, 0)) | + ((nid >> EROFS_DIRENT_NID_METABOX_BIT) << 31); +} /* atomic flag definitions */ #define EROFS_I_EA_INITED_BIT 0 @@ -301,6 +262,9 @@ EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) #define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1) #define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2) +/* default readahead size of directories */ +#define EROFS_DIR_RA_BYTES 16384 + struct erofs_inode { erofs_nid_t nid; @@ -309,13 +273,15 @@ struct erofs_inode { unsigned char datalayout; unsigned char inode_isize; - unsigned short xattr_isize; + bool dot_omitted; + unsigned int xattr_isize; + unsigned int xattr_name_filter; unsigned int xattr_shared_count; unsigned int *xattr_shared_xattrs; union { - erofs_blk_t raw_blkaddr; + erofs_blk_t startblk; struct { unsigned short chunkformat; unsigned char chunkbits; @@ -324,15 +290,13 @@ struct erofs_inode { struct { unsigned short z_advise; unsigned char z_algorithmtype[2]; - unsigned char z_logical_clusterbits; - unsigned long z_tailextent_headlcn; + unsigned char z_lclusterbits; union { - struct { - erofs_off_t z_idataoff; - unsigned short z_idata_size; - }; - erofs_off_t z_fragmentoff; + u64 z_tailextent_headlcn; + u64 z_extents; }; + erofs_off_t z_fragmentoff; + unsigned short z_idata_size; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -340,73 +304,57 @@ struct erofs_inode { struct inode vfs_inode; }; -#define EROFS_I(ptr) \ - container_of(ptr, struct erofs_inode, vfs_inode) +#define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode) -static inline unsigned long erofs_inode_datablocks(struct inode *inode) +static inline bool erofs_inode_in_metabox(struct inode *inode) { - /* since i_size cannot be changed */ - return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + return EROFS_I(inode)->nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT); } -static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit, - unsigned int bits) +static inline erofs_off_t erofs_iloc(struct inode *inode) { + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + erofs_nid_t nid_lo = EROFS_I(inode)->nid & EROFS_DIRENT_NID_MASK; - return (value >> bit) & ((1 << bits) - 1); + if (erofs_inode_in_metabox(inode)) + return nid_lo << sbi->islotbits; + return erofs_pos(inode->i_sb, sbi->meta_blkaddr) + + (nid_lo << sbi->islotbits); } - -static inline unsigned int erofs_inode_version(unsigned int value) +static inline unsigned int erofs_inode_version(unsigned int ifmt) { - return erofs_bitrange(value, EROFS_I_VERSION_BIT, - EROFS_I_VERSION_BITS); + return (ifmt >> EROFS_I_VERSION_BIT) & EROFS_I_VERSION_MASK; } -static inline unsigned int erofs_inode_datalayout(unsigned int value) +static inline unsigned int erofs_inode_datalayout(unsigned int ifmt) { - return erofs_bitrange(value, EROFS_I_DATALAYOUT_BIT, - EROFS_I_DATALAYOUT_BITS); + return (ifmt >> EROFS_I_DATALAYOUT_BIT) & EROFS_I_DATALAYOUT_MASK; } -/* - * Different from grab_cache_page_nowait(), reclaiming is never triggered - * when allocating new pages. - */ -static inline -struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, - pgoff_t index) +/* reclaiming is never triggered when allocating new folios. */ +static inline struct folio *erofs_grab_folio_nowait(struct address_space *as, + pgoff_t index) { - return pagecache_get_page(mapping, index, + return __filemap_get_folio(as, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, - readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); + readahead_gfp_mask(as) & ~__GFP_RECLAIM); } -extern const struct super_operations erofs_sops; -extern struct file_system_type erofs_fs_type; - -extern const struct address_space_operations erofs_raw_access_aops; -extern const struct address_space_operations z_erofs_aops; - -enum { - BH_Encoded = BH_PrivateStart, - BH_FullMapped, - BH_Fragment, - BH_Partialref, -}; - /* Has a disk mapping */ -#define EROFS_MAP_MAPPED (1 << BH_Mapped) +#define EROFS_MAP_MAPPED 0x0001 /* Located in metadata (could be copied from bd_inode) */ -#define EROFS_MAP_META (1 << BH_Meta) +#define EROFS_MAP_META 0x0002 /* The extent is encoded */ -#define EROFS_MAP_ENCODED (1 << BH_Encoded) +#define EROFS_MAP_ENCODED 0x0004 /* The length of extent is full */ -#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) +#define EROFS_MAP_FULL_MAPPED 0x0008 /* Located in the special packed inode */ -#define EROFS_MAP_FRAGMENT (1 << BH_Fragment) +#define __EROFS_MAP_FRAGMENT 0x0010 /* The extent refers to partial decompressed data */ -#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref) +#define EROFS_MAP_PARTIAL_REF 0x0020 + +#define EROFS_MAP_FRAGMENT (EROFS_MAP_MAPPED | __EROFS_MAP_FRAGMENT) struct erofs_map_blocks { struct erofs_buf buf; @@ -419,17 +367,15 @@ struct erofs_map_blocks { unsigned int m_flags; }; -/* Flags used by erofs_map_blocks_flatmode() */ -#define EROFS_GET_BLOCKS_RAW 0x0001 /* * Used to get the exact decompressed length, e.g. fiemap (consider lookback * approach instead if possible since it's more metadata lightweight.) */ -#define EROFS_GET_BLOCKS_FIEMAP 0x0002 +#define EROFS_GET_BLOCKS_FIEMAP 0x0001 /* Used to map the whole extent if non-negligible data is requested for LZMA */ -#define EROFS_GET_BLOCKS_READMORE 0x0004 +#define EROFS_GET_BLOCKS_READMORE 0x0002 /* Used to map tail extent for tailpacking inline or fragment pcluster */ -#define EROFS_GET_BLOCKS_FINDTAIL 0x0008 +#define EROFS_GET_BLOCKS_FINDTAIL 0x0004 enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, @@ -437,76 +383,59 @@ enum { Z_EROFS_COMPRESSION_RUNTIME_MAX }; -/* zmap.c */ -extern const struct iomap_ops z_erofs_iomap_report_ops; - -#ifdef CONFIG_EROFS_FS_ZIP -int z_erofs_fill_inode(struct inode *inode); -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags); -#else -static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } -static inline int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags) -{ - return -EOPNOTSUPP; -} -#endif /* !CONFIG_EROFS_FS_ZIP */ - struct erofs_map_dev { - struct erofs_fscache *m_fscache; + struct super_block *m_sb; + struct erofs_device_info *m_dif; struct block_device *m_bdev; - struct dax_device *m_daxdev; - u64 m_dax_part_off; erofs_off_t m_pa; unsigned int m_deviceid; }; -/* data.c */ +extern const struct super_operations erofs_sops; + +extern const struct address_space_operations erofs_aops; +extern const struct address_space_operations erofs_fileio_aops; +extern const struct address_space_operations z_erofs_aops; +extern const struct address_space_operations erofs_fscache_access_aops; + +extern const struct inode_operations erofs_generic_iops; +extern const struct inode_operations erofs_symlink_iops; +extern const struct inode_operations erofs_fast_symlink_iops; +extern const struct inode_operations erofs_dir_iops; + extern const struct file_operations erofs_file_fops; +extern const struct file_operations erofs_dir_fops; + +extern const struct iomap_ops z_erofs_iomap_report_ops; + +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_SHARE 0x0001 +#define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002 + +void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, + erofs_off_t *offset, int *lengthp); void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); -void *erofs_bread(struct erofs_buf *buf, struct inode *inode, - erofs_blk_t blkaddr, enum erofs_kmap_type type); +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap); +int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb, + bool in_metabox); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_blk_t blkaddr, enum erofs_kmap_type type); + erofs_off_t offset, bool in_metabox); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags); - -/* inode.c */ -static inline unsigned long erofs_inode_hash(erofs_nid_t nid) -{ -#if BITS_PER_LONG == 32 - return (nid >> 32) ^ (nid & 0xffffffff); -#else - return nid; -#endif -} - -extern const struct inode_operations erofs_generic_iops; -extern const struct inode_operations erofs_symlink_iops; -extern const struct inode_operations erofs_fast_symlink_iops; - +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); +void erofs_onlinefolio_init(struct folio *folio); +void erofs_onlinefolio_split(struct folio *folio); +void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); -int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); - -/* namei.c */ -extern const struct inode_operations erofs_dir_iops; - int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, unsigned int *d_type); -/* dir.c */ -extern const struct file_operations erofs_dir_fops; - static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) { int retried = 0; @@ -522,23 +451,17 @@ static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) return NULL; } -/* pcpubuf.c */ -void *erofs_get_pcpubuf(unsigned int requiredpages); -void erofs_put_pcpubuf(void *ptr); -int erofs_pcpubuf_growsize(unsigned int nrpages); -void erofs_pcpubuf_init(void); -void erofs_pcpubuf_exit(void); - -/* sysfs.c */ int erofs_register_sysfs(struct super_block *sb); void erofs_unregister_sysfs(struct super_block *sb); int __init erofs_init_sysfs(void); void erofs_exit_sysfs(void); -/* utils.c / zdata.c */ -struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); -static inline void erofs_pagepool_add(struct page **pagepool, - struct page *page) +struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv); +static inline struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) +{ + return __erofs_allocpage(pagepool, gfp, false); +} +static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) { set_page_private(page, (unsigned long)*pagepool); *pagepool = page; @@ -546,78 +469,53 @@ static inline void erofs_pagepool_add(struct page **pagepool, void erofs_release_pages(struct page **pagepool); #ifdef CONFIG_EROFS_FS_ZIP -int erofs_workgroup_put(struct erofs_workgroup *grp); -struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, - pgoff_t index); -struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, - struct erofs_workgroup *grp); -void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) + +extern atomic_long_t erofs_global_shrink_cnt; void erofs_shrinker_register(struct super_block *sb); void erofs_shrinker_unregister(struct super_block *sb); int __init erofs_init_shrinker(void); void erofs_exit_shrinker(void); -int __init z_erofs_init_zip_subsystem(void); -void z_erofs_exit_zip_subsystem(void); -int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, - struct erofs_workgroup *egrp); -int erofs_try_to_free_cached_page(struct page *page); -int z_erofs_load_lz4_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lz4_cfgs *lz4, int len); +int __init z_erofs_init_subsystem(void); +void z_erofs_exit_subsystem(void); +int z_erofs_init_super(struct super_block *sb); +unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, + unsigned long nr_shrink); +int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, + int flags); +void *z_erofs_get_gbuf(unsigned int requiredpages); +void z_erofs_put_gbuf(void *ptr); +int z_erofs_gbuf_growsize(unsigned int nrpages); +int __init z_erofs_gbuf_init(void); +void z_erofs_gbuf_exit(void); +int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} static inline int erofs_init_shrinker(void) { return 0; } static inline void erofs_exit_shrinker(void) {} -static inline int z_erofs_init_zip_subsystem(void) { return 0; } -static inline void z_erofs_exit_zip_subsystem(void) {} -static inline int z_erofs_load_lz4_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lz4_cfgs *lz4, int len) -{ - if (lz4 || dsb->u1.lz4_max_distance) { - erofs_err(sb, "lz4 algorithm isn't enabled"); - return -EINVAL; - } - return 0; -} +static inline int z_erofs_init_subsystem(void) { return 0; } +static inline void z_erofs_exit_subsystem(void) {} +static inline int z_erofs_init_super(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ -#ifdef CONFIG_EROFS_FS_ZIP_LZMA -int z_erofs_lzma_init(void); -void z_erofs_lzma_exit(void); -int z_erofs_load_lzma_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lzma_cfgs *lzma, int size); +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE +struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev); +void erofs_fileio_submit_bio(struct bio *bio); #else -static inline int z_erofs_lzma_init(void) { return 0; } -static inline int z_erofs_lzma_exit(void) { return 0; } -static inline int z_erofs_load_lzma_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lzma_cfgs *lzma, int size) { - if (lzma) { - erofs_err(sb, "lzma algorithm isn't enabled"); - return -EINVAL; - } - return 0; -} -#endif /* !CONFIG_EROFS_FS_ZIP */ - -/* flags for erofs_fscache_register_cookie() */ -#define EROFS_REG_COOKIE_NEED_INODE 1 -#define EROFS_REG_COOKIE_NEED_NOEXIST 2 +static inline struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) { return NULL; } +static inline void erofs_fileio_submit_bio(struct bio *bio) {} +#endif -/* fscache.c */ #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, - unsigned int flags); + char *name, unsigned int flags); void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); - -extern const struct address_space_operations erofs_fscache_access_aops; +struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev); +void erofs_fscache_submit_bio(struct bio *bio); #else static inline int erofs_fscache_register_fs(struct super_block *sb) { @@ -627,8 +525,7 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} static inline struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, - unsigned int flags) + char *name, unsigned int flags) { return ERR_PTR(-EOPNOTSUPP); } @@ -636,8 +533,14 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache) { } +static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { return NULL; } +static inline void erofs_fscache_submit_bio(struct bio *bio) {} #endif +long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long erofs_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index b64a108fac92..f7cf4f41af28 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -5,7 +5,6 @@ * Copyright (C) 2022, Alibaba Cloud */ #include "xattr.h" - #include <trace/events/erofs.h> struct erofs_qstr { @@ -87,28 +86,23 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, return ERR_PTR(-ENOENT); } -static void *find_target_block_classic(struct erofs_buf *target, - struct inode *dir, - struct erofs_qstr *name, - int *_ndirents) +static void *erofs_find_target_block(struct erofs_buf *target, + struct inode *dir, struct erofs_qstr *name, int *_ndirents) { - unsigned int startprfx, endprfx; - int head, back; + unsigned int bsz = i_blocksize(dir); + int head = 0, back = erofs_iblks(dir) - 1; + unsigned int startprfx = 0, endprfx = 0; void *candidate = ERR_PTR(-ENOENT); - startprfx = endprfx = 0; - head = 0; - back = erofs_inode_datablocks(dir) - 1; - while (head <= back) { const int mid = head + (back - head) / 2; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; - de = erofs_bread(&buf, dir, mid, EROFS_KMAP); + buf.mapping = dir->i_mapping; + de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), true); if (!IS_ERR(de)) { - const int nameoff = nameoff_from_disk(de->nameoff, - EROFS_BLKSIZ); + const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); int diff; unsigned int matched; @@ -128,33 +122,32 @@ static void *find_target_block_classic(struct erofs_buf *target, dname.name = (u8 *)de + nameoff; if (ndirents == 1) - dname.end = (u8 *)de + EROFS_BLKSIZ; + dname.end = (u8 *)de + bsz; else dname.end = (u8 *)de + - nameoff_from_disk(de[1].nameoff, - EROFS_BLKSIZ); + nameoff_from_disk(de[1].nameoff, bsz); /* string comparison without already matched prefix */ diff = erofs_dirnamecmp(name, &dname, &matched); - if (!diff) { - *_ndirents = 0; - goto out; - } else if (diff > 0) { - head = mid + 1; - startprfx = matched; - - if (!IS_ERR(candidate)) - erofs_put_metabuf(target); - *target = buf; - candidate = de; - *_ndirents = ndirents; - } else { + if (diff < 0) { erofs_put_metabuf(&buf); - back = mid - 1; endprfx = matched; + continue; } + + if (!IS_ERR(candidate)) + erofs_put_metabuf(target); + *target = buf; + if (!diff) { + *_ndirents = 0; + return de; + } + head = mid + 1; + startprfx = matched; + candidate = de; + *_ndirents = ndirents; continue; } out: /* free if the candidate is valid */ @@ -178,15 +171,16 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, qn.name = name->name; qn.end = name->name + name->len; + buf.mapping = dir->i_mapping; ndirents = 0; - - de = find_target_block_classic(&buf, dir, &qn, &ndirents); + de = erofs_find_target_block(&buf, dir, &qn, &ndirents); if (IS_ERR(de)) return PTR_ERR(de); if (ndirents) - de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents); + de = find_target_dirent(&qn, (u8 *)de, i_blocksize(dir), + ndirents); if (!IS_ERR(de)) { *nid = le64_to_cpu(de->nid); @@ -211,16 +205,13 @@ static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry, err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); - if (err == -ENOENT) { + if (err == -ENOENT) /* negative dentry */ inode = NULL; - } else if (err) { + else if (err) inode = ERR_PTR(err); - } else { - erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__, - dentry, nid, d_type); + else inode = erofs_iget(dir->i_sb, nid); - } return d_splice_alias(inode, dentry); } diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c deleted file mode 100644 index a2efd833d1b6..000000000000 --- a/fs/erofs/pcpubuf.c +++ /dev/null @@ -1,148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) Gao Xiang <xiang@kernel.org> - * - * For low-latency decompression algorithms (e.g. lz4), reserve consecutive - * per-CPU virtual memory (in pages) in advance to store such inplace I/O - * data if inplace decompression is failed (due to unmet inplace margin for - * example). - */ -#include "internal.h" - -struct erofs_pcpubuf { - raw_spinlock_t lock; - void *ptr; - struct page **pages; - unsigned int nrpages; -}; - -static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb); - -void *erofs_get_pcpubuf(unsigned int requiredpages) - __acquires(pcb->lock) -{ - struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb); - - raw_spin_lock(&pcb->lock); - /* check if the per-CPU buffer is too small */ - if (requiredpages > pcb->nrpages) { - raw_spin_unlock(&pcb->lock); - put_cpu_var(erofs_pcb); - /* (for sparse checker) pretend pcb->lock is still taken */ - __acquire(pcb->lock); - return NULL; - } - return pcb->ptr; -} - -void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock) -{ - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id()); - - DBG_BUGON(pcb->ptr != ptr); - raw_spin_unlock(&pcb->lock); - put_cpu_var(erofs_pcb); -} - -/* the next step: support per-CPU page buffers hotplug */ -int erofs_pcpubuf_growsize(unsigned int nrpages) -{ - static DEFINE_MUTEX(pcb_resize_mutex); - static unsigned int pcb_nrpages; - struct page *pagepool = NULL; - int delta, cpu, ret, i; - - mutex_lock(&pcb_resize_mutex); - delta = nrpages - pcb_nrpages; - ret = 0; - /* avoid shrinking pcpubuf, since no idea how many fses rely on */ - if (delta <= 0) - goto out; - - for_each_possible_cpu(cpu) { - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); - struct page **pages, **oldpages; - void *ptr, *old_ptr; - - pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - break; - } - - for (i = 0; i < nrpages; ++i) { - pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL); - if (!pages[i]) { - ret = -ENOMEM; - oldpages = pages; - goto free_pagearray; - } - } - ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL); - if (!ptr) { - ret = -ENOMEM; - oldpages = pages; - goto free_pagearray; - } - raw_spin_lock(&pcb->lock); - old_ptr = pcb->ptr; - pcb->ptr = ptr; - oldpages = pcb->pages; - pcb->pages = pages; - i = pcb->nrpages; - pcb->nrpages = nrpages; - raw_spin_unlock(&pcb->lock); - - if (!oldpages) { - DBG_BUGON(old_ptr); - continue; - } - - if (old_ptr) - vunmap(old_ptr); -free_pagearray: - while (i) - erofs_pagepool_add(&pagepool, oldpages[--i]); - kfree(oldpages); - if (ret) - break; - } - pcb_nrpages = nrpages; - erofs_release_pages(&pagepool); -out: - mutex_unlock(&pcb_resize_mutex); - return ret; -} - -void erofs_pcpubuf_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); - - raw_spin_lock_init(&pcb->lock); - } -} - -void erofs_pcpubuf_exit(void) -{ - int cpu, i; - - for_each_possible_cpu(cpu) { - struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); - - if (pcb->ptr) { - vunmap(pcb->ptr); - pcb->ptr = NULL; - } - if (!pcb->pages) - continue; - - for (i = 0; i < pcb->nrpages; ++i) - if (pcb->pages[i]) - put_page(pcb->pages[i]); - kfree(pcb->pages); - pcb->pages = NULL; - } -} diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 626a615dafc2..937a215f626c 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -4,16 +4,13 @@ * https://www.huawei.com/ * Copyright (C) 2021, Alibaba Cloud */ -#include <linux/module.h> -#include <linux/buffer_head.h> #include <linux/statfs.h> -#include <linux/parser.h> #include <linux/seq_file.h> #include <linux/crc32c.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> -#include <linux/dax.h> #include <linux/exportfs.h> +#include <linux/backing-dev.h> #include "xattr.h" #define CREATE_TRACE_POINTS @@ -21,58 +18,42 @@ static struct kmem_cache *erofs_inode_cachep __read_mostly; -void _erofs_err(struct super_block *sb, const char *function, - const char *fmt, ...) +void _erofs_printk(struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; + int level; va_start(args, fmt); - vaf.fmt = fmt; + level = printk_get_level(fmt); + vaf.fmt = printk_skip_level(fmt); vaf.va = &args; - - pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf); - va_end(args); -} - -void _erofs_info(struct super_block *sb, const char *function, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - pr_info("(device %s): %pV", sb->s_id, &vaf); + if (sb) + printk("%c%cerofs (device %s): %pV", + KERN_SOH_ASCII, level, sb->s_id, &vaf); + else + printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf); va_end(args); } static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) { - struct erofs_super_block *dsb; - u32 expected_crc, crc; - - dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, - EROFS_BLKSIZ - EROFS_SUPER_OFFSET, GFP_KERNEL); - if (!dsb) - return -ENOMEM; + struct erofs_super_block *dsb = sbdata + EROFS_SUPER_OFFSET; + u32 len = 1 << EROFS_SB(sb)->blkszbits, crc; - expected_crc = le32_to_cpu(dsb->checksum); - dsb->checksum = 0; - /* to allow for x86 boot sectors and other oddities. */ - crc = crc32c(~0, dsb, EROFS_BLKSIZ - EROFS_SUPER_OFFSET); - kfree(dsb); + if (len > EROFS_SUPER_OFFSET) + len -= EROFS_SUPER_OFFSET; + len -= offsetof(struct erofs_super_block, checksum) + + sizeof(dsb->checksum); - if (crc != expected_crc) { - erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", - crc, expected_crc); - return -EBADMSG; - } - return 0; + /* skip .magic(pre-verified) and .checksum(0) fields */ + crc = crc32c(0x5045B54A, (&dsb->checksum) + 1, len); + if (crc == le32_to_cpu(dsb->checksum)) + return 0; + erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", + crc, le32_to_cpu(dsb->checksum)); + return -EBADMSG; } static void erofs_inode_init_once(void *ptr) @@ -99,45 +80,25 @@ static void erofs_free_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); - /* be careful of RCU symlink path */ if (inode->i_op == &erofs_fast_symlink_iops) kfree(inode->i_link); kfree(vi->xattr_shared_xattrs); - kmem_cache_free(erofs_inode_cachep, vi); } -static bool check_layout_compatibility(struct super_block *sb, - struct erofs_super_block *dsb) -{ - const unsigned int feature = le32_to_cpu(dsb->feature_incompat); - - EROFS_SB(sb)->feature_incompat = feature; - - /* check if current kernel meets all mandatory requirements */ - if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { - erofs_err(sb, - "unidentified incompatible feature %x, please upgrade kernel version", - feature & ~EROFS_ALL_FEATURE_INCOMPAT); - return false; - } - return true; -} - -#ifdef CONFIG_EROFS_FS_ZIP /* read variable-sized metadata, offset will be aligned by 4-byte */ -static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, - erofs_off_t *offset, int *lengthp) +void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, + erofs_off_t *offset, int *lengthp) { u8 *buffer, *ptr; int len, i, cnt; *offset = round_up(*offset, 4); - ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), EROFS_KMAP); + ptr = erofs_bread(buf, *offset, true); if (IS_ERR(ptr)) return ptr; - len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]); + len = le16_to_cpu(*(__le16 *)ptr); if (!len) len = U16_MAX + 1; buffer = kmalloc(len, GFP_KERNEL); @@ -147,76 +108,28 @@ static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, *lengthp = len; for (i = 0; i < len; i += cnt) { - cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i); - ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), - EROFS_KMAP); + cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), + len - i); + ptr = erofs_bread(buf, *offset, true); if (IS_ERR(ptr)) { kfree(buffer); return ptr; } - memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt); + memcpy(buffer + i, ptr, cnt); *offset += cnt; } return buffer; } -static int erofs_load_compr_cfgs(struct super_block *sb, - struct erofs_super_block *dsb) +#ifndef CONFIG_EROFS_FS_ZIP +static int z_erofs_parse_cfgs(struct super_block *sb, + struct erofs_super_block *dsb) { - struct erofs_sb_info *sbi = EROFS_SB(sb); - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - unsigned int algs, alg; - erofs_off_t offset; - int size, ret = 0; - - sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); - if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { - erofs_err(sb, "try to load compressed fs with unsupported algorithms %x", - sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); - return -EINVAL; - } - - offset = EROFS_SUPER_OFFSET + sbi->sb_size; - alg = 0; - for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { - void *data; - - if (!(algs & 1)) - continue; - - data = erofs_read_metadata(sb, &buf, &offset, &size); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } + if (!dsb->u1.available_compr_algs) + return 0; - switch (alg) { - case Z_EROFS_COMPRESSION_LZ4: - ret = z_erofs_load_lz4_config(sb, dsb, data, size); - break; - case Z_EROFS_COMPRESSION_LZMA: - ret = z_erofs_load_lzma_config(sb, dsb, data, size); - break; - default: - DBG_BUGON(1); - ret = -EFAULT; - } - kfree(data); - if (ret) - break; - } - erofs_put_metabuf(&buf); - return ret; -} -#else -static int erofs_load_compr_cfgs(struct super_block *sb, - struct erofs_super_block *dsb) -{ - if (dsb->u1.available_compr_algs) { - erofs_err(sb, "try to load compressed fs when compression is disabled"); - return -EINVAL; - } - return 0; + erofs_err(sb, "compression disabled, unable to mount compressed EROFS"); + return -EOPNOTSUPP; } #endif @@ -226,15 +139,13 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; - struct block_device *bdev; - void *ptr; + struct file *file; - ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - dis = ptr + erofs_blkoff(*pos); + dis = erofs_read_metabuf(buf, sb, *pos, false); + if (IS_ERR(dis)) + return PTR_ERR(dis); - if (!dif->path) { + if (!sbi->devs->flatdev && !dif->path) { if (!dis->tag[0]) { erofs_err(sb, "empty device tag @ pos %llu", *pos); return -EINVAL; @@ -249,18 +160,34 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(fscache)) return PTR_ERR(fscache); dif->fscache = fscache; - } else { - bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, - sb->s_type); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - dif->bdev = bdev; - dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off, - NULL, NULL); + } else if (!sbi->devs->flatdev) { + file = erofs_is_fileio_mode(sbi) ? + filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) : + bdev_file_open_by_path(dif->path, + BLK_OPEN_READ, sb->s_type, NULL); + if (IS_ERR(file)) { + if (file == ERR_PTR(-ENOTBLK)) + return -EINVAL; + return PTR_ERR(file); + } + + if (!erofs_is_fileio_mode(sbi)) { + dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), + &dif->dax_part_off, NULL, NULL); + } else if (!S_ISREG(file_inode(file)->i_mode)) { + fput(file); + return -EINVAL; + } + if (!dif->dax_dev && test_opt(&sbi->opt, DAX_ALWAYS)) { + erofs_info(sb, "DAX unsupported by %s. Turning off DAX.", + dif->path); + clear_opt(&sbi->opt, DAX_ALWAYS); + } + dif->file = file; } - dif->blocks = le32_to_cpu(dis->blocks); - dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + dif->blocks = le32_to_cpu(dis->blocks_lo); + dif->uniaddr = le32_to_cpu(dis->uniaddr_lo); sbi->total_blocks += dif->blocks; *pos += EROFS_DEVT_SLOT_SIZE; return 0; @@ -276,7 +203,7 @@ static int erofs_scan_devices(struct super_block *sb, struct erofs_device_info *dif; int id, err = 0; - sbi->total_blocks = sbi->primarydevice_blocks; + sbi->total_blocks = sbi->dif0.blocks; if (!erofs_sb_has_device_table(sbi)) ondisk_extradevs = 0; else @@ -288,9 +215,17 @@ static int erofs_scan_devices(struct super_block *sb, ondisk_extradevs, sbi->devs->extra_devices); return -EINVAL; } + + if (test_opt(&sbi->opt, DAX_ALWAYS) && !sbi->dif0.dax_dev) { + erofs_info(sb, "DAX unsupported by block device. Turning off DAX."); + clear_opt(&sbi->opt, DAX_ALWAYS); + } if (!ondisk_extradevs) return 0; + if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb)) + sbi->devs->flatdev = true; + sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; down_read(&sbi->devs->rwsem); @@ -327,28 +262,35 @@ static int erofs_scan_devices(struct super_block *sb, static int erofs_read_superblock(struct super_block *sb) { - struct erofs_sb_info *sbi; + struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; - unsigned int blkszbits; void *data; int ret; - data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP); + data = erofs_read_metabuf(&buf, sb, 0, false); if (IS_ERR(data)) { erofs_err(sb, "cannot read erofs superblock"); return PTR_ERR(data); } - sbi = EROFS_SB(sb); dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); - ret = -EINVAL; if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { erofs_err(sb, "cannot find valid erofs superblock"); goto out; } + sbi->blkszbits = dsb->blkszbits; + if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) { + erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits); + goto out; + } + if (dsb->dirblkbits) { + erofs_err(sb, "dirblkbits %u isn't supported", dsb->dirblkbits); + goto out; + } + sbi->feature_compat = le32_to_cpu(dsb->feature_compat); if (erofs_sb_has_sb_chksum(sbi)) { ret = erofs_superblock_csum_verify(sb, data); @@ -357,106 +299,93 @@ static int erofs_read_superblock(struct super_block *sb) } ret = -EINVAL; - blkszbits = dsb->blkszbits; - /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ - if (blkszbits != LOG_BLOCK_SIZE) { - erofs_err(sb, "blkszbits %u isn't supported on this platform", - blkszbits); + sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat); + if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) { + erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", + sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT); goto out; } - if (!check_layout_compatibility(sb, dsb)) - goto out; - sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; - if (sbi->sb_size > EROFS_BLKSIZ) { + if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { erofs_err(sb, "invalid sb_extslots %u (more than a fs block)", sbi->sb_size); goto out; } - sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); + sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); + sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); + sbi->xattr_prefix_count = dsb->xattr_prefix_count; + sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); - sbi->root_nid = le16_to_cpu(dsb->root_nid); -#ifdef CONFIG_EROFS_FS_ZIP - sbi->packed_inode = NULL; - if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) { - sbi->packed_inode = - erofs_iget(sb, le64_to_cpu(dsb->packed_nid)); - if (IS_ERR(sbi->packed_inode)) { - ret = PTR_ERR(sbi->packed_inode); - goto out; - } + if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) { + sbi->root_nid = le64_to_cpu(dsb->rootnid_8b); + sbi->dif0.blocks = sbi->dif0.blocks | + ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32); + } else { + sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b); + } + sbi->packed_nid = le64_to_cpu(dsb->packed_nid); + if (erofs_sb_has_metabox(sbi)) { + if (sbi->sb_size <= offsetof(struct erofs_super_block, + metabox_nid)) + return -EFSCORRUPTED; + sbi->metabox_nid = le64_to_cpu(dsb->metabox_nid); + if (sbi->metabox_nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT)) + return -EFSCORRUPTED; /* self-loop detection */ } -#endif sbi->inos = le64_to_cpu(dsb->inos); - sbi->build_time = le64_to_cpu(dsb->build_time); - sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); - - memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid)); + sbi->epoch = (s64)le64_to_cpu(dsb->epoch); + sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec); + super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); - ret = strscpy(sbi->volume_name, dsb->volume_name, - sizeof(dsb->volume_name)); - if (ret < 0) { /* -E2BIG */ - erofs_err(sb, "bad volume name without NIL terminator"); - ret = -EFSCORRUPTED; - goto out; + if (dsb->volume_name[0]) { + sbi->volume_name = kstrndup(dsb->volume_name, + sizeof(dsb->volume_name), GFP_KERNEL); + if (!sbi->volume_name) + return -ENOMEM; } /* parse on-disk compression configurations */ - if (erofs_sb_has_compr_cfgs(sbi)) - ret = erofs_load_compr_cfgs(sb, dsb); - else - ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); + ret = z_erofs_parse_cfgs(sb, dsb); if (ret < 0) goto out; - /* handle multiple devices */ ret = erofs_scan_devices(sb, dsb); - if (erofs_sb_has_ztailpacking(sbi)) - erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); + if (erofs_sb_has_48bit(sbi)) + erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!"); + if (erofs_sb_has_metabox(sbi)) + erofs_info(sb, "EXPERIMENTAL metadata compression support in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) - erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); - if (erofs_sb_has_fragments(sbi)) - erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!"); - if (erofs_sb_has_dedupe(sbi)) - erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!"); + erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; } -/* set up default EROFS parameters */ -static void erofs_default_options(struct erofs_fs_context *ctx) +static void erofs_default_options(struct erofs_sb_info *sbi) { #ifdef CONFIG_EROFS_FS_ZIP - ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; - ctx->opt.max_sync_decompress_pages = 3; - ctx->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO; + sbi->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; + sbi->opt.max_sync_decompress_pages = 3; + sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO; #endif #ifdef CONFIG_EROFS_FS_XATTR - set_opt(&ctx->opt, XATTR_USER); + set_opt(&sbi->opt, XATTR_USER); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - set_opt(&ctx->opt, POSIX_ACL); + set_opt(&sbi->opt, POSIX_ACL); #endif } enum { - Opt_user_xattr, - Opt_acl, - Opt_cache_strategy, - Opt_dax, - Opt_dax_enum, - Opt_device, - Opt_fsid, - Opt_domain_id, - Opt_err + Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_fsoffset, }; static const struct constant_table erofs_param_cache_strategy[] = { @@ -482,23 +411,24 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), fsparam_string("domain_id", Opt_domain_id), + fsparam_flag_no("directio", Opt_directio), + fsparam_u64("fsoffset", Opt_fsoffset), {} }; static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) { #ifdef CONFIG_FS_DAX - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: - warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - set_opt(&ctx->opt, DAX_ALWAYS); - clear_opt(&ctx->opt, DAX_NEVER); + set_opt(&sbi->opt, DAX_ALWAYS); + clear_opt(&sbi->opt, DAX_NEVER); return true; case EROFS_MOUNT_DAX_NEVER: - set_opt(&ctx->opt, DAX_NEVER); - clear_opt(&ctx->opt, DAX_ALWAYS); + set_opt(&sbi->opt, DAX_NEVER); + clear_opt(&sbi->opt, DAX_ALWAYS); return true; default: DBG_BUGON(1); @@ -513,7 +443,7 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; struct erofs_device_info *dif; int opt, ret; @@ -526,9 +456,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_user_xattr: #ifdef CONFIG_EROFS_FS_XATTR if (result.boolean) - set_opt(&ctx->opt, XATTR_USER); + set_opt(&sbi->opt, XATTR_USER); else - clear_opt(&ctx->opt, XATTR_USER); + clear_opt(&sbi->opt, XATTR_USER); #else errorfc(fc, "{,no}user_xattr options not supported"); #endif @@ -536,16 +466,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_acl: #ifdef CONFIG_EROFS_FS_POSIX_ACL if (result.boolean) - set_opt(&ctx->opt, POSIX_ACL); + set_opt(&sbi->opt, POSIX_ACL); else - clear_opt(&ctx->opt, POSIX_ACL); + clear_opt(&sbi->opt, POSIX_ACL); #else errorfc(fc, "{,no}acl options not supported"); #endif break; case Opt_cache_strategy: #ifdef CONFIG_EROFS_FS_ZIP - ctx->opt.cache_strategy = result.uint_32; + sbi->opt.cache_strategy = result.uint_32; #else errorfc(fc, "compression not supported, cache_strategy ignored"); #endif @@ -567,27 +497,27 @@ static int erofs_fc_parse_param(struct fs_context *fc, kfree(dif); return -ENOMEM; } - down_write(&ctx->devs->rwsem); - ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL); - up_write(&ctx->devs->rwsem); + down_write(&sbi->devs->rwsem); + ret = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); + up_write(&sbi->devs->rwsem); if (ret < 0) { kfree(dif->path); kfree(dif); return ret; } - ++ctx->devs->extra_devices; + ++sbi->devs->extra_devices; break; #ifdef CONFIG_EROFS_FS_ONDEMAND case Opt_fsid: - kfree(ctx->fsid); - ctx->fsid = kstrdup(param->string, GFP_KERNEL); - if (!ctx->fsid) + kfree(sbi->fsid); + sbi->fsid = kstrdup(param->string, GFP_KERNEL); + if (!sbi->fsid) return -ENOMEM; break; case Opt_domain_id: - kfree(ctx->domain_id); - ctx->domain_id = kstrdup(param->string, GFP_KERNEL); - if (!ctx->domain_id) + kfree(sbi->domain_id); + sbi->domain_id = kstrdup(param->string, GFP_KERNEL); + if (!sbi->domain_id) return -ENOMEM; break; #else @@ -596,92 +526,69 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); break; #endif - default: - return -ENOPARAM; + case Opt_directio: +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (result.boolean) + set_opt(&sbi->opt, DIRECT_IO); + else + clear_opt(&sbi->opt, DIRECT_IO); +#else + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); +#endif + break; + case Opt_fsoffset: + sbi->dif0.fsoff = result.uint_64; + break; } return 0; } -#ifdef CONFIG_EROFS_FS_ZIP -static const struct address_space_operations managed_cache_aops; - -static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp) +static int erofs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) { - bool ret = true; - struct address_space *const mapping = folio->mapping; - - DBG_BUGON(!folio_test_locked(folio)); - DBG_BUGON(mapping->a_ops != &managed_cache_aops); - - if (folio_test_private(folio)) - ret = erofs_try_to_free_cached_page(&folio->page); - - return ret; -} + erofs_nid_t nid = EROFS_I(inode)->nid; + int len = parent ? 6 : 3; -/* - * It will be called only on inode eviction. In case that there are still some - * decompression requests in progress, wait with rescheduling for a bit here. - * We could introduce an extra locking instead but it seems unnecessary. - */ -static void erofs_managed_cache_invalidate_folio(struct folio *folio, - size_t offset, size_t length) -{ - const size_t stop = length + offset; - - DBG_BUGON(!folio_test_locked(folio)); - - /* Check for potential overflow in debug mode */ - DBG_BUGON(stop > folio_size(folio) || stop < length); - - if (offset == 0 && stop == folio_size(folio)) - while (!erofs_managed_cache_release_folio(folio, GFP_NOFS)) - cond_resched(); -} - -static const struct address_space_operations managed_cache_aops = { - .release_folio = erofs_managed_cache_release_folio, - .invalidate_folio = erofs_managed_cache_invalidate_folio, -}; - -static int erofs_init_managed_cache(struct super_block *sb) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct inode *const inode = new_inode(sb); + if (*max_len < len) { + *max_len = len; + return FILEID_INVALID; + } - if (!inode) - return -ENOMEM; + fh[0] = (u32)(nid >> 32); + fh[1] = (u32)(nid & 0xffffffff); + fh[2] = inode->i_generation; - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; + if (parent) { + nid = EROFS_I(parent)->nid; - inode->i_mapping->a_ops = &managed_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - sbi->managed_cache = inode; - return 0; -} -#else -static int erofs_init_managed_cache(struct super_block *sb) { return 0; } -#endif + fh[3] = (u32)(nid >> 32); + fh[4] = (u32)(nid & 0xffffffff); + fh[5] = parent->i_generation; + } -static struct inode *erofs_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) -{ - return erofs_iget(sb, ino); + *max_len = len; + return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN; } static struct dentry *erofs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - erofs_nfs_get_inode); + if ((fh_type != FILEID_INO64_GEN && + fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3) + return NULL; + + return d_obtain_alias(erofs_iget(sb, + ((u64)fid->raw[0] << 32) | fid->raw[1])); } static struct dentry *erofs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - erofs_nfs_get_inode); + if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6) + return NULL; + + return d_obtain_alias(erofs_iget(sb, + ((u64)fid->raw[3] << 32) | fid->raw[4])); } static struct dentry *erofs_get_parent(struct dentry *child) @@ -697,23 +604,32 @@ static struct dentry *erofs_get_parent(struct dentry *child) } static const struct export_operations erofs_export_ops = { + .encode_fh = erofs_encode_fh, .fh_to_dentry = erofs_fh_to_dentry, .fh_to_parent = erofs_fh_to_parent, .get_parent = erofs_get_parent, }; -static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc) +static void erofs_set_sysfs_name(struct super_block *sb) { - static const struct tree_descr empty_descr = {""}; + struct erofs_sb_info *sbi = EROFS_SB(sb); - return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr); + if (sbi->domain_id) + super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id, + sbi->fsid); + else if (sbi->fsid) + super_set_sysfs_name_generic(sb, "%s", sbi->fsid); + else if (erofs_is_fileio_mode(sbi)) + super_set_sysfs_name_generic(sb, "%s", + bdi_dev_name(sb->s_bdi)); + else + super_set_sysfs_name_id(sb); } static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; - struct erofs_sb_info *sbi; - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = EROFS_SB(sb); int err; sb->s_magic = EROFS_SUPER_MAGIC; @@ -721,54 +637,77 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &erofs_sops; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - - sb->s_fs_info = sbi; - sbi->opt = ctx->opt; - sbi->devs = ctx->devs; - ctx->devs = NULL; - sbi->fsid = ctx->fsid; - ctx->fsid = NULL; - sbi->domain_id = ctx->domain_id; - ctx->domain_id = NULL; - - if (erofs_is_fscache_mode(sb)) { - sb->s_blocksize = EROFS_BLKSIZ; - sb->s_blocksize_bits = LOG_BLOCK_SIZE; - - err = erofs_fscache_register_fs(sb); - if (err) - return err; + sbi->blkszbits = PAGE_SHIFT; + if (!sb->s_bdev) { + /* + * (File-backed mounts) EROFS claims it's safe to nest other + * fs contexts (including its own) due to self-controlled RO + * accesses/contexts and no side-effect changes that need to + * context save & restore so it can reuse the current thread + * context. However, it still needs to bump `s_stack_depth` to + * avoid kernel stack overflow from nested filesystems. + */ + if (erofs_is_fileio_mode(sbi)) { + sb->s_stack_depth = + file_inode(sbi->dif0.file)->i_sb->s_stack_depth + 1; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + erofs_err(sb, "maximum fs stacking depth exceeded"); + return -ENOTBLK; + } + } + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + if (erofs_is_fscache_mode(sb)) { + err = erofs_fscache_register_fs(sb); + if (err) + return err; + } err = super_setup_bdi(sb); if (err) return err; } else { - if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) { - erofs_err(sb, "failed to set erofs blksize"); + if (!sb_set_blocksize(sb, PAGE_SIZE)) { + errorfc(fc, "failed to set initial blksize"); return -EINVAL; } - sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off, - NULL, NULL); + sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev, + &sbi->dif0.dax_part_off, NULL, NULL); } err = erofs_read_superblock(sb); if (err) return err; - if (test_opt(&sbi->opt, DAX_ALWAYS)) { - BUILD_BUG_ON(EROFS_BLKSIZ != PAGE_SIZE); + if (sb->s_blocksize_bits != sbi->blkszbits) { + if (erofs_is_fscache_mode(sb)) { + errorfc(fc, "unsupported blksize for fscache mode"); + return -EINVAL; + } - if (!sbi->dax_dev) { - errorfc(fc, "DAX unsupported by block device. Turning off DAX."); - clear_opt(&sbi->opt, DAX_ALWAYS); + if (erofs_is_fileio_mode(sbi)) { + sb->s_blocksize = 1 << sbi->blkszbits; + sb->s_blocksize_bits = sbi->blkszbits; + } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { + errorfc(fc, "failed to set erofs blksize"); + return -EINVAL; } } + if (sbi->dif0.fsoff) { + if (sbi->dif0.fsoff & (sb->s_blocksize - 1)) + return invalfc(fc, "fsoffset %llu is not aligned to block size %lu", + sbi->dif0.fsoff, sb->s_blocksize); + if (erofs_is_fscache_mode(sb)) + return invalfc(fc, "cannot use fsoffset in fscache mode"); + } + + if (test_opt(&sbi->opt, DAX_ALWAYS) && sbi->blkszbits != PAGE_SHIFT) { + erofs_info(sb, "unsupported blocksize for DAX"); + clear_opt(&sbi->opt, DAX_ALWAYS); + } + sb->s_time_gran = 1; sb->s_xattr = erofs_xattr_handlers; sb->s_export_op = &erofs_export_ops; @@ -778,72 +717,99 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) else sb->s_flags &= ~SB_POSIXACL; -#ifdef CONFIG_EROFS_FS_ZIP - xa_init(&sbi->managed_pslots); -#endif + err = z_erofs_init_super(sb); + if (err) + return err; + + if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { + inode = erofs_iget(sb, sbi->packed_nid); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->packed_inode = inode; + } + if (erofs_sb_has_metabox(sbi)) { + inode = erofs_iget(sb, sbi->metabox_nid); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->metabox_inode = inode; + } - /* get the root inode */ - inode = erofs_iget(sb, ROOT_NID(sbi)); + inode = erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) return PTR_ERR(inode); if (!S_ISDIR(inode->i_mode)) { erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)", - ROOT_NID(sbi), inode->i_mode); + sbi->root_nid, inode->i_mode); iput(inode); return -EINVAL; } - sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; erofs_shrinker_register(sb); - /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ - err = erofs_init_managed_cache(sb); + err = erofs_xattr_prefixes_init(sb); if (err) return err; + erofs_set_sysfs_name(sb); err = erofs_register_sysfs(sb); if (err) return err; - erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi)); + sbi->dir_ra_bytes = EROFS_DIR_RA_BYTES; + erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid); return 0; } -static int erofs_fc_anon_get_tree(struct fs_context *fc) -{ - return get_tree_nodev(fc, erofs_fc_fill_pseudo_super); -} - static int erofs_fc_get_tree(struct fs_context *fc) { - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; + int ret; - if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid) + if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); - return get_tree_bdev(fc, erofs_fc_fill_super); + ret = get_tree_bdev_flags(fc, erofs_fc_fill_super, + IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ? + GET_TREE_BDEV_QUIET_LOOKUP : 0); +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (ret == -ENOTBLK) { + struct file *file; + + if (!fc->source) + return invalf(fc, "No source specified"); + file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + sbi->dif0.file = file; + + if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) && + sbi->dif0.file->f_mapping->a_ops->read_folio) + return get_tree_nodev(fc, erofs_fc_fill_super); + } +#endif + return ret; } static int erofs_fc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *new_sbi = fc->s_fs_info; DBG_BUGON(!sb_rdonly(sb)); - if (ctx->fsid || ctx->domain_id) + if (new_sbi->fsid || new_sbi->domain_id) erofs_info(sb, "ignoring reconfiguration for fsid|domain_id."); - if (test_opt(&ctx->opt, POSIX_ACL)) + if (test_opt(&new_sbi->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; - sbi->opt = ctx->opt; + sbi->opt = new_sbi->opt; fc->sb_flags |= SB_RDONLY; return 0; @@ -854,8 +820,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->bdev) - blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); + if (dif->file) + fput(dif->file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -872,14 +838,23 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs) kfree(devs); } +static void erofs_sb_free(struct erofs_sb_info *sbi) +{ + erofs_free_dev_context(sbi->devs); + kfree(sbi->fsid); + kfree(sbi->domain_id); + if (sbi->dif0.file) + fput(sbi->dif0.file); + kfree(sbi->volume_name); + kfree(sbi); +} + static void erofs_fc_free(struct fs_context *fc) { - struct erofs_fs_context *ctx = fc->fs_private; + struct erofs_sb_info *sbi = fc->s_fs_info; - erofs_free_dev_context(ctx->devs); - kfree(ctx->fsid); - kfree(ctx->domain_id); - kfree(ctx); + if (sbi) /* free here if an error occurs before transferring to sb */ + erofs_sb_free(sbi); } static const struct fs_context_operations erofs_context_ops = { @@ -889,90 +864,70 @@ static const struct fs_context_operations erofs_context_ops = { .free = erofs_fc_free, }; -static const struct fs_context_operations erofs_anon_context_ops = { - .get_tree = erofs_fc_anon_get_tree, -}; - static int erofs_init_fs_context(struct fs_context *fc) { - struct erofs_fs_context *ctx; - - /* pseudo mount for anon inodes */ - if (fc->sb_flags & SB_KERNMOUNT) { - fc->ops = &erofs_anon_context_ops; - return 0; - } + struct erofs_sb_info *sbi; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) return -ENOMEM; - ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); - if (!ctx->devs) { - kfree(ctx); + + sbi->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); + if (!sbi->devs) { + kfree(sbi); return -ENOMEM; } - fc->fs_private = ctx; + fc->s_fs_info = sbi; - idr_init(&ctx->devs->tree); - init_rwsem(&ctx->devs->rwsem); - erofs_default_options(ctx); + idr_init(&sbi->devs->tree); + init_rwsem(&sbi->devs->rwsem); + erofs_default_options(sbi); fc->ops = &erofs_context_ops; return 0; } -/* - * could be triggered after deactivate_locked_super() - * is called, thus including umount and failed to initialize. - */ -static void erofs_kill_sb(struct super_block *sb) +static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi) { - struct erofs_sb_info *sbi; - - WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); + iput(sbi->packed_inode); + sbi->packed_inode = NULL; + iput(sbi->metabox_inode); + sbi->metabox_inode = NULL; +#ifdef CONFIG_EROFS_FS_ZIP + iput(sbi->managed_cache); + sbi->managed_cache = NULL; +#endif +} - /* pseudo mount for anon inodes */ - if (sb->s_flags & SB_KERNMOUNT) { - kill_anon_super(sb); - return; - } +static void erofs_kill_sb(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); - if (erofs_is_fscache_mode(sb)) + if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || + sbi->dif0.file) kill_anon_super(sb); else kill_block_super(sb); - - sbi = EROFS_SB(sb); - if (!sbi) - return; - - erofs_free_dev_context(sbi->devs); - fs_put_dax(sbi->dax_dev, NULL); + erofs_drop_internal_inodes(sbi); + fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); - kfree(sbi->fsid); - kfree(sbi->domain_id); - kfree(sbi); + erofs_sb_free(sbi); sb->s_fs_info = NULL; } -/* called when ->s_root is non-NULL */ static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); - DBG_BUGON(!sbi); - erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); -#ifdef CONFIG_EROFS_FS_ZIP - iput(sbi->managed_cache); - sbi->managed_cache = NULL; - iput(sbi->packed_inode); - sbi->packed_inode = NULL; -#endif + erofs_xattr_prefixes_cleanup(sb); + erofs_drop_internal_inodes(sbi); + erofs_free_dev_context(sbi->devs); + sbi->devs = NULL; erofs_fscache_unregister_fs(sb); } -struct file_system_type erofs_fs_type = { +static struct file_system_type erofs_fs_type = { .owner = THIS_MODULE, .name = "erofs", .init_fs_context = erofs_init_fs_context, @@ -988,24 +943,17 @@ static int __init erofs_module_init(void) erofs_check_ondisk_layout_definitions(); erofs_inode_cachep = kmem_cache_create("erofs_inode", - sizeof(struct erofs_inode), 0, - SLAB_RECLAIM_ACCOUNT, - erofs_inode_init_once); - if (!erofs_inode_cachep) { - err = -ENOMEM; - goto icache_err; - } + sizeof(struct erofs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, + erofs_inode_init_once); + if (!erofs_inode_cachep) + return -ENOMEM; err = erofs_init_shrinker(); if (err) goto shrinker_err; - err = z_erofs_lzma_init(); - if (err) - goto lzma_err; - - erofs_pcpubuf_init(); - err = z_erofs_init_zip_subsystem(); + err = z_erofs_init_subsystem(); if (err) goto zip_err; @@ -1022,14 +970,11 @@ static int __init erofs_module_init(void) fs_err: erofs_exit_sysfs(); sysfs_err: - z_erofs_exit_zip_subsystem(); + z_erofs_exit_subsystem(); zip_err: - z_erofs_lzma_exit(); -lzma_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); -icache_err: return err; } @@ -1041,34 +986,29 @@ static void __exit erofs_module_exit(void) rcu_barrier(); erofs_exit_sysfs(); - z_erofs_exit_zip_subsystem(); - z_erofs_lzma_exit(); + z_erofs_exit_subsystem(); erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); - erofs_pcpubuf_exit(); } -/* get filesystem statistics */ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - u64 id = 0; - - if (!erofs_is_fscache_mode(sb)) - id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; - buf->f_bsize = EROFS_BLKSIZ; + buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; - buf->f_files = ULLONG_MAX; buf->f_ffree = ULLONG_MAX - sbi->inos; - buf->f_namelen = EROFS_NAME_LEN; - buf->f_fsid = u64_to_fsid(id); + if (uuid_is_null(&sb->s_uuid)) + buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 : + huge_encode_dev(sb->s_bdev->bd_dev)); + else + buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); return 0; } @@ -1077,43 +1017,47 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); struct erofs_mount_opts *opt = &sbi->opt; -#ifdef CONFIG_EROFS_FS_XATTR - if (test_opt(opt, XATTR_USER)) - seq_puts(seq, ",user_xattr"); - else - seq_puts(seq, ",nouser_xattr"); -#endif -#ifdef CONFIG_EROFS_FS_POSIX_ACL - if (test_opt(opt, POSIX_ACL)) - seq_puts(seq, ",acl"); - else - seq_puts(seq, ",noacl"); -#endif -#ifdef CONFIG_EROFS_FS_ZIP - if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED) - seq_puts(seq, ",cache_strategy=disabled"); - else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) - seq_puts(seq, ",cache_strategy=readahead"); - else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND) - seq_puts(seq, ",cache_strategy=readaround"); -#endif + if (IS_ENABLED(CONFIG_EROFS_FS_XATTR)) + seq_puts(seq, test_opt(opt, XATTR_USER) ? + ",user_xattr" : ",nouser_xattr"); + if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL)) + seq_puts(seq, test_opt(opt, POSIX_ACL) ? ",acl" : ",noacl"); + if (IS_ENABLED(CONFIG_EROFS_FS_ZIP)) + seq_printf(seq, ",cache_strategy=%s", + erofs_param_cache_strategy[opt->cache_strategy].name); if (test_opt(opt, DAX_ALWAYS)) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); + if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO)) + seq_puts(seq, ",directio"); #ifdef CONFIG_EROFS_FS_ONDEMAND if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); if (sbi->domain_id) seq_printf(seq, ",domain_id=%s", sbi->domain_id); #endif + if (sbi->dif0.fsoff) + seq_printf(seq, ",fsoffset=%llu", sbi->dif0.fsoff); return 0; } +static void erofs_evict_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + dax_break_layout_final(inode); +#endif + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); +} + const struct super_operations erofs_sops = { .put_super = erofs_put_super, .alloc_inode = erofs_alloc_inode, .free_inode = erofs_free_inode, + .evict_inode = erofs_evict_inode, .statfs = erofs_statfs, .show_options = erofs_show_options, }; diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index fd476961f742..1e0658a1d95b 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -7,11 +7,14 @@ #include <linux/kobject.h> #include "internal.h" +#include "compress.h" enum { attr_feature, + attr_drop_caches, attr_pointer_ui, attr_pointer_bool, + attr_accel, }; enum { @@ -57,11 +60,26 @@ static struct erofs_attr erofs_attr_##_name = { \ #ifdef CONFIG_EROFS_FS_ZIP EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts); +EROFS_ATTR_FUNC(drop_caches, 0200); #endif +#ifdef CONFIG_EROFS_FS_ZIP_ACCEL +EROFS_ATTR_FUNC(accel, 0644); +#endif +EROFS_ATTR_RW_UI(dir_ra_bytes, erofs_sb_info); -static struct attribute *erofs_attrs[] = { +static struct attribute *erofs_sb_attrs[] = { #ifdef CONFIG_EROFS_FS_ZIP ATTR_LIST(sync_decompress), + ATTR_LIST(drop_caches), +#endif + ATTR_LIST(dir_ra_bytes), + NULL, +}; +ATTRIBUTE_GROUPS(erofs_sb); + +static struct attribute *erofs_attrs[] = { +#ifdef CONFIG_EROFS_FS_ZIP_ACCEL + ATTR_LIST(accel), #endif NULL, }; @@ -78,6 +96,8 @@ EROFS_ATTR_FEATURE(sb_chksum); EROFS_ATTR_FEATURE(ztailpacking); EROFS_ATTR_FEATURE(fragments); EROFS_ATTR_FEATURE(dedupe); +EROFS_ATTR_FEATURE(48bit); +EROFS_ATTR_FEATURE(metabox); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), @@ -90,6 +110,8 @@ static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(ztailpacking), ATTR_LIST(fragments), ATTR_LIST(dedupe), + ATTR_LIST(48bit), + ATTR_LIST(metabox), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); @@ -123,12 +145,14 @@ static ssize_t erofs_attr_show(struct kobject *kobj, if (!ptr) return 0; return sysfs_emit(buf, "%d\n", *(bool *)ptr); + case attr_accel: + return z_erofs_crypto_show_engines(buf, PAGE_SIZE, '\n'); } return 0; } static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) + const char *buf, size_t len) { struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, s_kobj); @@ -163,6 +187,33 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, return -EINVAL; *(bool *)ptr = !!t; return len; +#ifdef CONFIG_EROFS_FS_ZIP + case attr_drop_caches: + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t < 1 || t > 3) + return -EINVAL; + + if (t & 2) + z_erofs_shrink_scan(sbi, ~0UL); + if (t & 1) + invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1); + return len; +#endif +#ifdef CONFIG_EROFS_FS_ZIP_ACCEL + case attr_accel: + buf = skip_spaces(buf); + z_erofs_crypto_disable_all_engines(); + while (*buf) { + t = strcspn(buf, "\n"); + ret = z_erofs_crypto_enable_engine(buf, t); + if (ret < 0) + return ret; + buf += buf[t] != '\0' ? t + 1 : t; + } + return len; +#endif } return 0; } @@ -179,13 +230,14 @@ static const struct sysfs_ops erofs_attr_ops = { .store = erofs_attr_store, }; -static struct kobj_type erofs_sb_ktype = { - .default_groups = erofs_groups, +static const struct kobj_type erofs_sb_ktype = { + .default_groups = erofs_sb_groups, .sysfs_ops = &erofs_attr_ops, .release = erofs_sb_release, }; -static struct kobj_type erofs_ktype = { +static const struct kobj_type erofs_ktype = { + .default_groups = erofs_groups, .sysfs_ops = &erofs_attr_ops, }; @@ -193,7 +245,7 @@ static struct kset erofs_root = { .kobj = {.ktype = &erofs_ktype}, }; -static struct kobj_type erofs_feat_ktype = { +static const struct kobj_type erofs_feat_ktype = { .default_groups = erofs_feat_groups, .sysfs_ops = &erofs_attr_ops, }; @@ -205,34 +257,16 @@ static struct kobject erofs_feat = { int erofs_register_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - char *name; - char *str = NULL; int err; - if (erofs_is_fscache_mode(sb)) { - if (sbi->domain_id) { - str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id, - sbi->fsid); - if (!str) - return -ENOMEM; - name = str; - } else { - name = sbi->fsid; - } - } else { - name = sb->s_id; - } sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name); - kfree(str); - if (err) - goto put_sb_kobj; - return 0; - -put_sb_kobj: - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", + sb->s_sysfs_name); + if (err) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + } return err; } @@ -247,6 +281,12 @@ void erofs_unregister_sysfs(struct super_block *sb) } } +void erofs_exit_sysfs(void) +{ + kobject_put(&erofs_feat); + kset_unregister(&erofs_root); +} + int __init erofs_init_sysfs(void) { int ret; @@ -254,24 +294,12 @@ int __init erofs_init_sysfs(void) kobject_set_name(&erofs_root.kobj, "erofs"); erofs_root.kobj.parent = fs_kobj; ret = kset_register(&erofs_root); - if (ret) - goto root_err; - - ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype, - NULL, "features"); - if (ret) - goto feat_err; - return ret; - -feat_err: - kobject_put(&erofs_feat); - kset_unregister(&erofs_root); -root_err: + if (!ret) { + ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype, + NULL, "features"); + if (!ret) + return 0; + erofs_exit_sysfs(); + } return ret; } - -void erofs_exit_sysfs(void) -{ - kobject_put(&erofs_feat); - kset_unregister(&erofs_root); -} diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h deleted file mode 100644 index 64ceb7270b5c..000000000000 --- a/fs/erofs/tagptr.h +++ /dev/null @@ -1,107 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * A tagged pointer implementation - */ -#ifndef __EROFS_FS_TAGPTR_H -#define __EROFS_FS_TAGPTR_H - -#include <linux/types.h> -#include <linux/build_bug.h> - -/* - * the name of tagged pointer types are tagptr{1, 2, 3...}_t - * avoid directly using the internal structs __tagptr{1, 2, 3...} - */ -#define __MAKE_TAGPTR(n) \ -typedef struct __tagptr##n { \ - uintptr_t v; \ -} tagptr##n##_t; - -__MAKE_TAGPTR(1) -__MAKE_TAGPTR(2) -__MAKE_TAGPTR(3) -__MAKE_TAGPTR(4) - -#undef __MAKE_TAGPTR - -extern void __compiletime_error("bad tagptr tags") - __bad_tagptr_tags(void); - -extern void __compiletime_error("bad tagptr type") - __bad_tagptr_type(void); - -/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */ -#define __tagptr_mask_1(ptr, n) \ - __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \ - (1UL << (n)) - 1 : - -#define __tagptr_mask(ptr) (\ - __tagptr_mask_1(ptr, 1) ( \ - __tagptr_mask_1(ptr, 2) ( \ - __tagptr_mask_1(ptr, 3) ( \ - __tagptr_mask_1(ptr, 4) ( \ - __bad_tagptr_type(), 0))))) - -/* generate a tagged pointer from a raw value */ -#define tagptr_init(type, val) \ - ((typeof(type)){ .v = (uintptr_t)(val) }) - -/* - * directly cast a tagged pointer to the native pointer type, which - * could be used for backward compatibility of existing code. - */ -#define tagptr_cast_ptr(tptr) ((void *)(tptr).v) - -/* encode tagged pointers */ -#define tagptr_fold(type, ptr, _tags) ({ \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \ - __bad_tagptr_tags(); \ -tagptr_init(type, (uintptr_t)(ptr) | tags); }) - -/* decode tagged pointers */ -#define tagptr_unfold_ptr(tptr) \ - ((void *)((tptr).v & ~__tagptr_mask(tptr))) - -#define tagptr_unfold_tags(tptr) \ - ((tptr).v & __tagptr_mask(tptr)) - -/* operations for the tagger pointer */ -#define tagptr_eq(_tptr1, _tptr2) ({ \ - typeof(_tptr1) tptr1 = (_tptr1); \ - typeof(_tptr2) tptr2 = (_tptr2); \ - (void)(&tptr1 == &tptr2); \ -(tptr1).v == (tptr2).v; }) - -/* lock-free CAS operation */ -#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - typeof(_o) o = (_o); \ - typeof(_n) n = (_n); \ - (void)(&o == &n); \ - (void)(&o == ptptr); \ -tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) - -/* wrap WRITE_ONCE if atomic update is needed */ -#define tagptr_replace_tags(_ptptr, tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \ -*ptptr; }) - -#define tagptr_set_tags(_ptptr, _tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ - __bad_tagptr_tags(); \ - ptptr->v |= tags; \ -*ptptr; }) - -#define tagptr_clear_tags(_ptptr, _tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ - __bad_tagptr_tags(); \ - ptptr->v &= ~tags; \ -*ptptr; }) - -#endif /* __EROFS_FS_TAGPTR_H */ diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c deleted file mode 100644 index 46627cb69abe..000000000000 --- a/fs/erofs/utils.c +++ /dev/null @@ -1,292 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2018 HUAWEI, Inc. - * https://www.huawei.com/ - */ -#include "internal.h" -#include <linux/pagevec.h> - -struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) -{ - struct page *page = *pagepool; - - if (page) { - DBG_BUGON(page_ref_count(page) != 1); - *pagepool = (struct page *)page_private(page); - } else { - page = alloc_page(gfp); - } - return page; -} - -void erofs_release_pages(struct page **pagepool) -{ - while (*pagepool) { - struct page *page = *pagepool; - - *pagepool = (struct page *)page_private(page); - put_page(page); - } -} - -#ifdef CONFIG_EROFS_FS_ZIP -/* global shrink count (for all mounted EROFS instances) */ -static atomic_long_t erofs_global_shrink_cnt; - -static int erofs_workgroup_get(struct erofs_workgroup *grp) -{ - int o; - -repeat: - o = erofs_wait_on_workgroup_freezed(grp); - if (o <= 0) - return -1; - - if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o) - goto repeat; - - /* decrease refcount paired by erofs_workgroup_put */ - if (o == 1) - atomic_long_dec(&erofs_global_shrink_cnt); - return 0; -} - -struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, - pgoff_t index) -{ - struct erofs_sb_info *sbi = EROFS_SB(sb); - struct erofs_workgroup *grp; - -repeat: - rcu_read_lock(); - grp = xa_load(&sbi->managed_pslots, index); - if (grp) { - if (erofs_workgroup_get(grp)) { - /* prefer to relax rcu read side */ - rcu_read_unlock(); - goto repeat; - } - - DBG_BUGON(index != grp->index); - } - rcu_read_unlock(); - return grp; -} - -struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, - struct erofs_workgroup *grp) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct erofs_workgroup *pre; - - /* - * Bump up a reference count before making this visible - * to others for the XArray in order to avoid potential - * UAF without serialized by xa_lock. - */ - atomic_inc(&grp->refcount); - -repeat: - xa_lock(&sbi->managed_pslots); - pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, - NULL, grp, GFP_NOFS); - if (pre) { - if (xa_is_err(pre)) { - pre = ERR_PTR(xa_err(pre)); - } else if (erofs_workgroup_get(pre)) { - /* try to legitimize the current in-tree one */ - xa_unlock(&sbi->managed_pslots); - cond_resched(); - goto repeat; - } - atomic_dec(&grp->refcount); - grp = pre; - } - xa_unlock(&sbi->managed_pslots); - return grp; -} - -static void __erofs_workgroup_free(struct erofs_workgroup *grp) -{ - atomic_long_dec(&erofs_global_shrink_cnt); - erofs_workgroup_free_rcu(grp); -} - -int erofs_workgroup_put(struct erofs_workgroup *grp) -{ - int count = atomic_dec_return(&grp->refcount); - - if (count == 1) - atomic_long_inc(&erofs_global_shrink_cnt); - else if (!count) - __erofs_workgroup_free(grp); - return count; -} - -static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp) -{ - /* - * If managed cache is on, refcount of workgroups - * themselves could be < 0 (freezed). In other words, - * there is no guarantee that all refcounts > 0. - */ - if (!erofs_workgroup_try_to_freeze(grp, 1)) - return false; - - /* - * Note that all cached pages should be unattached - * before deleted from the XArray. Otherwise some - * cached pages could be still attached to the orphan - * old workgroup when the new one is available in the tree. - */ - if (erofs_try_to_free_all_cached_pages(sbi, grp)) { - erofs_workgroup_unfreeze(grp, 1); - return false; - } - - /* - * It's impossible to fail after the workgroup is freezed, - * however in order to avoid some race conditions, add a - * DBG_BUGON to observe this in advance. - */ - DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); - - /* last refcount should be connected with its managed pslot. */ - erofs_workgroup_unfreeze(grp, 0); - __erofs_workgroup_free(grp); - return true; -} - -static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, - unsigned long nr_shrink) -{ - struct erofs_workgroup *grp; - unsigned int freed = 0; - unsigned long index; - - xa_lock(&sbi->managed_pslots); - xa_for_each(&sbi->managed_pslots, index, grp) { - /* try to shrink each valid workgroup */ - if (!erofs_try_to_release_workgroup(sbi, grp)) - continue; - xa_unlock(&sbi->managed_pslots); - - ++freed; - if (!--nr_shrink) - return freed; - xa_lock(&sbi->managed_pslots); - } - xa_unlock(&sbi->managed_pslots); - return freed; -} - -/* protected by 'erofs_sb_list_lock' */ -static unsigned int shrinker_run_no; - -/* protects the mounted 'erofs_sb_list' */ -static DEFINE_SPINLOCK(erofs_sb_list_lock); -static LIST_HEAD(erofs_sb_list); - -void erofs_shrinker_register(struct super_block *sb) -{ - struct erofs_sb_info *sbi = EROFS_SB(sb); - - mutex_init(&sbi->umount_mutex); - - spin_lock(&erofs_sb_list_lock); - list_add(&sbi->list, &erofs_sb_list); - spin_unlock(&erofs_sb_list_lock); -} - -void erofs_shrinker_unregister(struct super_block *sb) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - - mutex_lock(&sbi->umount_mutex); - /* clean up all remaining workgroups in memory */ - erofs_shrink_workstation(sbi, ~0UL); - - spin_lock(&erofs_sb_list_lock); - list_del(&sbi->list); - spin_unlock(&erofs_sb_list_lock); - mutex_unlock(&sbi->umount_mutex); -} - -static unsigned long erofs_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - return atomic_long_read(&erofs_global_shrink_cnt); -} - -static unsigned long erofs_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct erofs_sb_info *sbi; - struct list_head *p; - - unsigned long nr = sc->nr_to_scan; - unsigned int run_no; - unsigned long freed = 0; - - spin_lock(&erofs_sb_list_lock); - do { - run_no = ++shrinker_run_no; - } while (run_no == 0); - - /* Iterate over all mounted superblocks and try to shrink them */ - p = erofs_sb_list.next; - while (p != &erofs_sb_list) { - sbi = list_entry(p, struct erofs_sb_info, list); - - /* - * We move the ones we do to the end of the list, so we stop - * when we see one we have already done. - */ - if (sbi->shrinker_run_no == run_no) - break; - - if (!mutex_trylock(&sbi->umount_mutex)) { - p = p->next; - continue; - } - - spin_unlock(&erofs_sb_list_lock); - sbi->shrinker_run_no = run_no; - - freed += erofs_shrink_workstation(sbi, nr - freed); - - spin_lock(&erofs_sb_list_lock); - /* Get the next list element before we move this one */ - p = p->next; - - /* - * Move this one to the end of the list to provide some - * fairness. - */ - list_move_tail(&sbi->list, &erofs_sb_list); - mutex_unlock(&sbi->umount_mutex); - - if (freed >= nr) - break; - } - spin_unlock(&erofs_sb_list_lock); - return freed; -} - -static struct shrinker erofs_shrinker_info = { - .scan_objects = erofs_shrink_scan, - .count_objects = erofs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; - -int __init erofs_init_shrinker(void) -{ - return register_shrinker(&erofs_shrinker_info, "erofs-shrinker"); -} - -void erofs_exit_shrinker(void) -{ - unregister_shrinker(&erofs_shrinker_info); -} -#endif /* !CONFIG_EROFS_FS_ZIP */ diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index a62fb8a3318a..396536d9a862 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -5,25 +5,33 @@ * Copyright (C) 2021-2022, Alibaba Cloud */ #include <linux/security.h> +#include <linux/xxhash.h> #include "xattr.h" -struct xattr_iter { +struct erofs_xattr_iter { struct super_block *sb; struct erofs_buf buf; + erofs_off_t pos; void *kaddr; - erofs_blk_t blkaddr; - unsigned int ofs; + char *buffer; + int buffer_size, buffer_ofs; + + /* getxattr */ + int index, infix_len; + struct qstr name; + + /* listxattr */ + struct dentry *dentry; }; -static int init_inode_xattrs(struct inode *inode) +static int erofs_init_inode_xattrs(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); - struct xattr_iter it; + struct erofs_xattr_iter it; unsigned int i; struct erofs_xattr_ibody_header *ih; - struct super_block *sb; - struct erofs_sb_info *sbi; + struct super_block *sb = inode->i_sb; int ret = 0; /* the most case is that xattrs of this inode are initialized. */ @@ -52,37 +60,37 @@ static int init_inode_xattrs(struct inode *inode) * undefined right now (maybe use later with some new sb feature). */ if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { - erofs_err(inode->i_sb, + erofs_err(sb, "xattr_isize %d of nid %llu is not supported yet", vi->xattr_isize, vi->nid); ret = -EOPNOTSUPP; goto out_unlock; } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { if (vi->xattr_isize) { - erofs_err(inode->i_sb, - "bogus xattr ibody @ nid %llu", vi->nid); + erofs_err(sb, "bogus xattr ibody @ nid %llu", vi->nid); DBG_BUGON(1); ret = -EFSCORRUPTED; goto out_unlock; /* xattr ondisk layout error */ } - ret = -ENOATTR; + ret = -ENODATA; goto out_unlock; } - sb = inode->i_sb; - sbi = EROFS_SB(sb); it.buf = __EROFS_BUF_INITIALIZER; - it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); - it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); + ret = erofs_init_metabuf(&it.buf, sb, erofs_inode_in_metabox(inode)); + if (ret) + goto out_unlock; + it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, true); if (IS_ERR(it.kaddr)) { ret = PTR_ERR(it.kaddr); goto out_unlock; } - ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); + ih = it.kaddr; + vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); @@ -93,26 +101,18 @@ static int init_inode_xattrs(struct inode *inode) } /* let's skip ibody header */ - it.ofs += sizeof(struct erofs_xattr_ibody_header); + it.pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - if (it.ofs >= EROFS_BLKSIZ) { - /* cannot be unaligned */ - DBG_BUGON(it.ofs != EROFS_BLKSIZ); - - it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr, - EROFS_KMAP); - if (IS_ERR(it.kaddr)) { - kfree(vi->xattr_shared_xattrs); - vi->xattr_shared_xattrs = NULL; - ret = PTR_ERR(it.kaddr); - goto out_unlock; - } - it.ofs = 0; + it.kaddr = erofs_bread(&it.buf, it.pos, true); + if (IS_ERR(it.kaddr)) { + kfree(vi->xattr_shared_xattrs); + vi->xattr_shared_xattrs = NULL; + ret = PTR_ERR(it.kaddr); + goto out_unlock; } - vi->xattr_shared_xattrs[i] = - le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); - it.ofs += sizeof(__le32); + vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)it.kaddr); + it.pos += sizeof(__le32); } erofs_put_metabuf(&it.buf); @@ -125,261 +125,6 @@ out_unlock: return ret; } -/* - * the general idea for these return values is - * if 0 is returned, go on processing the current xattr; - * 1 (> 0) is returned, skip this round to process the next xattr; - * -err (< 0) is returned, an error (maybe ENOXATTR) occurred - * and need to be handled - */ -struct xattr_iter_handlers { - int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry); - int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf, - unsigned int len); - int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz); - void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf, - unsigned int len); -}; - -static inline int xattr_iter_fixup(struct xattr_iter *it) -{ - if (it->ofs < EROFS_BLKSIZ) - return 0; - - it->blkaddr += erofs_blknr(it->ofs); - it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, - EROFS_KMAP); - if (IS_ERR(it->kaddr)) - return PTR_ERR(it->kaddr); - it->ofs = erofs_blkoff(it->ofs); - return 0; -} - -static int inline_xattr_iter_begin(struct xattr_iter *it, - struct inode *inode) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); - unsigned int xattr_header_sz, inline_xattr_ofs; - - xattr_header_sz = inlinexattr_header_size(inode); - if (xattr_header_sz >= vi->xattr_isize) { - DBG_BUGON(xattr_header_sz > vi->xattr_isize); - return -ENOATTR; - } - - inline_xattr_ofs = vi->inode_isize + xattr_header_sz; - - it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); - it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); - - it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, - EROFS_KMAP); - if (IS_ERR(it->kaddr)) - return PTR_ERR(it->kaddr); - return vi->xattr_isize - xattr_header_sz; -} - -/* - * Regardless of success or failure, `xattr_foreach' will end up with - * `ofs' pointing to the next xattr item rather than an arbitrary position. - */ -static int xattr_foreach(struct xattr_iter *it, - const struct xattr_iter_handlers *op, - unsigned int *tlimit) -{ - struct erofs_xattr_entry entry; - unsigned int value_sz, processed, slice; - int err; - - /* 0. fixup blkaddr, ofs, ipage */ - err = xattr_iter_fixup(it); - if (err) - return err; - - /* - * 1. read xattr entry to the memory, - * since we do EROFS_XATTR_ALIGN - * therefore entry should be in the page - */ - entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); - if (tlimit) { - unsigned int entry_sz = erofs_xattr_entry_size(&entry); - - /* xattr on-disk corruption: xattr entry beyond xattr_isize */ - if (*tlimit < entry_sz) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - *tlimit -= entry_sz; - } - - it->ofs += sizeof(struct erofs_xattr_entry); - value_sz = le16_to_cpu(entry.e_value_size); - - /* handle entry */ - err = op->entry(it, &entry); - if (err) { - it->ofs += entry.e_name_len + value_sz; - goto out; - } - - /* 2. handle xattr name (ofs will finally be at the end of name) */ - processed = 0; - - while (processed < entry.e_name_len) { - if (it->ofs >= EROFS_BLKSIZ) { - DBG_BUGON(it->ofs > EROFS_BLKSIZ); - - err = xattr_iter_fixup(it); - if (err) - goto out; - it->ofs = 0; - } - - slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, - entry.e_name_len - processed); - - /* handle name */ - err = op->name(it, processed, it->kaddr + it->ofs, slice); - if (err) { - it->ofs += entry.e_name_len - processed + value_sz; - goto out; - } - - it->ofs += slice; - processed += slice; - } - - /* 3. handle xattr value */ - processed = 0; - - if (op->alloc_buffer) { - err = op->alloc_buffer(it, value_sz); - if (err) { - it->ofs += value_sz; - goto out; - } - } - - while (processed < value_sz) { - if (it->ofs >= EROFS_BLKSIZ) { - DBG_BUGON(it->ofs > EROFS_BLKSIZ); - - err = xattr_iter_fixup(it); - if (err) - goto out; - it->ofs = 0; - } - - slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, - value_sz - processed); - op->value(it, processed, it->kaddr + it->ofs, slice); - it->ofs += slice; - processed += slice; - } - -out: - /* xattrs should be 4-byte aligned (on-disk constraint) */ - it->ofs = EROFS_XATTR_ALIGN(it->ofs); - return err < 0 ? err : 0; -} - -struct getxattr_iter { - struct xattr_iter it; - - char *buffer; - int buffer_size, index; - struct qstr name; -}; - -static int xattr_entrymatch(struct xattr_iter *_it, - struct erofs_xattr_entry *entry) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - return (it->index != entry->e_name_index || - it->name.len != entry->e_name_len) ? -ENOATTR : 0; -} - -static int xattr_namematch(struct xattr_iter *_it, - unsigned int processed, char *buf, unsigned int len) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0; -} - -static int xattr_checkbuffer(struct xattr_iter *_it, - unsigned int value_sz) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - int err = it->buffer_size < value_sz ? -ERANGE : 0; - - it->buffer_size = value_sz; - return !it->buffer ? 1 : err; -} - -static void xattr_copyvalue(struct xattr_iter *_it, - unsigned int processed, - char *buf, unsigned int len) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - memcpy(it->buffer + processed, buf, len); -} - -static const struct xattr_iter_handlers find_xattr_handlers = { - .entry = xattr_entrymatch, - .name = xattr_namematch, - .alloc_buffer = xattr_checkbuffer, - .value = xattr_copyvalue -}; - -static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) -{ - int ret; - unsigned int remaining; - - ret = inline_xattr_iter_begin(&it->it, inode); - if (ret < 0) - return ret; - - remaining = ret; - while (remaining) { - ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); - if (ret != -ENOATTR) - break; - } - return ret ? ret : it->buffer_size; -} - -static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = inode->i_sb; - struct erofs_sb_info *const sbi = EROFS_SB(sb); - unsigned int i; - int ret = -ENOATTR; - - for (i = 0; i < vi->xattr_shared_count; ++i) { - erofs_blk_t blkaddr = - xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); - - it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP); - if (IS_ERR(it->it.kaddr)) - return PTR_ERR(it->it.kaddr); - it->it.blkaddr = blkaddr; - - ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); - if (ret != -ENOATTR) - break; - } - return ret ? ret : it->buffer_size; -} - static bool erofs_xattr_user_list(struct dentry *dentry) { return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); @@ -390,57 +135,13 @@ static bool erofs_xattr_trusted_list(struct dentry *dentry) return capable(CAP_SYS_ADMIN); } -int erofs_getxattr(struct inode *inode, int index, - const char *name, - void *buffer, size_t buffer_size) -{ - int ret; - struct getxattr_iter it; - - if (!name) - return -EINVAL; - - ret = init_inode_xattrs(inode); - if (ret) - return ret; - - it.index = index; - it.name.len = strlen(name); - if (it.name.len > EROFS_NAME_LEN) - return -ERANGE; - - it.it.buf = __EROFS_BUF_INITIALIZER; - it.name.name = name; - - it.buffer = buffer; - it.buffer_size = buffer_size; - - it.it.sb = inode->i_sb; - ret = inline_getxattr(inode, &it); - if (ret == -ENOATTR) - ret = shared_getxattr(inode, &it); - erofs_put_metabuf(&it.it.buf); - return ret; -} - static int erofs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - - switch (handler->flags) { - case EROFS_XATTR_INDEX_USER: - if (!test_opt(&sbi->opt, XATTR_USER)) - return -EOPNOTSUPP; - break; - case EROFS_XATTR_INDEX_TRUSTED: - break; - case EROFS_XATTR_INDEX_SECURITY: - break; - default: - return -EINVAL; - } + if (handler->flags == EROFS_XATTR_INDEX_USER && + !test_opt(&EROFS_I_SB(inode)->opt, XATTR_USER)) + return -EOPNOTSUPP; return erofs_getxattr(inode, handler->flags, name, buffer, size); } @@ -467,12 +168,8 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { }; #endif -const struct xattr_handler *erofs_xattr_handlers[] = { +const struct xattr_handler * const erofs_xattr_handlers[] = { &erofs_xattr_user_handler, -#ifdef CONFIG_EROFS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY &erofs_xattr_security_handler, @@ -480,142 +177,351 @@ const struct xattr_handler *erofs_xattr_handlers[] = { NULL, }; -struct listxattr_iter { - struct xattr_iter it; - - struct dentry *dentry; - char *buffer; - int buffer_size, buffer_ofs; -}; - -static int xattr_entrylist(struct xattr_iter *_it, - struct erofs_xattr_entry *entry) +static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, + unsigned int len) { - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); - unsigned int prefix_len; - const char *prefix; + unsigned int slice, processed; + struct super_block *sb = it->sb; + void *src; + + for (processed = 0; processed < len; processed += slice) { + it->kaddr = erofs_bread(&it->buf, it->pos, true); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + src = it->kaddr; + slice = min_t(unsigned int, sb->s_blocksize - + erofs_blkoff(sb, it->pos), len - processed); + memcpy(it->buffer + it->buffer_ofs, src, slice); + it->buffer_ofs += slice; + it->pos += slice; + } + return 0; +} - const struct xattr_handler *h = - erofs_xattr_handler(entry->e_name_index); +static int erofs_listxattr_foreach(struct erofs_xattr_iter *it) +{ + struct erofs_xattr_entry entry; + unsigned int base_index, name_total, prefix_len, infix_len = 0; + const char *prefix, *infix = NULL; + int err; - if (!h || (h->list && !h->list(it->dentry))) - return 1; + /* 1. handle xattr entry */ + entry = *(struct erofs_xattr_entry *)it->kaddr; + it->pos += sizeof(struct erofs_xattr_entry); + + base_index = entry.e_name_index; + if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(it->sb); + struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + + if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) + return 0; + infix = pf->prefix->infix; + infix_len = pf->infix_len; + base_index = pf->prefix->base_index; + } - prefix = xattr_prefix(h); + prefix = erofs_xattr_prefix(base_index, it->dentry); + if (!prefix) + return 0; prefix_len = strlen(prefix); + name_total = prefix_len + infix_len + entry.e_name_len + 1; if (!it->buffer) { - it->buffer_ofs += prefix_len + entry->e_name_len + 1; - return 1; + it->buffer_ofs += name_total; + return 0; } - if (it->buffer_ofs + prefix_len - + entry->e_name_len + 1 > it->buffer_size) + if (it->buffer_ofs + name_total > it->buffer_size) return -ERANGE; memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); - it->buffer_ofs += prefix_len; - return 0; -} + memcpy(it->buffer + it->buffer_ofs + prefix_len, infix, infix_len); + it->buffer_ofs += prefix_len + infix_len; -static int xattr_namelist(struct xattr_iter *_it, - unsigned int processed, char *buf, unsigned int len) -{ - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); + /* 2. handle xattr name */ + err = erofs_xattr_copy_to_buffer(it, entry.e_name_len); + if (err) + return err; - memcpy(it->buffer + it->buffer_ofs, buf, len); - it->buffer_ofs += len; + it->buffer[it->buffer_ofs++] = '\0'; return 0; } -static int xattr_skipvalue(struct xattr_iter *_it, - unsigned int value_sz) +static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) { - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); + struct super_block *sb = it->sb; + struct erofs_xattr_entry entry; + unsigned int slice, processed, value_sz; - it->buffer[it->buffer_ofs++] = '\0'; - return 1; -} + /* 1. handle xattr entry */ + entry = *(struct erofs_xattr_entry *)it->kaddr; + it->pos += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); -static const struct xattr_iter_handlers list_xattr_handlers = { - .entry = xattr_entrylist, - .name = xattr_namelist, - .alloc_buffer = xattr_skipvalue, - .value = NULL -}; + /* should also match the infix for long name prefixes */ + if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + + if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) + return -ENODATA; + + if (it->index != pf->prefix->base_index || + it->name.len != entry.e_name_len + pf->infix_len) + return -ENODATA; + + if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) + return -ENODATA; -static int inline_listxattr(struct listxattr_iter *it) + it->infix_len = pf->infix_len; + } else { + if (it->index != entry.e_name_index || + it->name.len != entry.e_name_len) + return -ENODATA; + + it->infix_len = 0; + } + + /* 2. handle xattr name */ + for (processed = 0; processed < entry.e_name_len; processed += slice) { + it->kaddr = erofs_bread(&it->buf, it->pos, true); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + slice = min_t(unsigned int, + sb->s_blocksize - erofs_blkoff(sb, it->pos), + entry.e_name_len - processed); + if (memcmp(it->name.name + it->infix_len + processed, + it->kaddr, slice)) + return -ENODATA; + it->pos += slice; + } + + /* 3. handle xattr value */ + if (!it->buffer) { + it->buffer_ofs = value_sz; + return 0; + } + + if (it->buffer_size < value_sz) + return -ERANGE; + + return erofs_xattr_copy_to_buffer(it, value_sz); +} + +static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, + struct inode *inode, bool getxattr) { + struct erofs_inode *const vi = EROFS_I(inode); + unsigned int xattr_header_sz, remaining, entry_sz; + erofs_off_t next_pos; int ret; - unsigned int remaining; - ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry)); - if (ret < 0) + xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * vi->xattr_shared_count; + if (xattr_header_sz >= vi->xattr_isize) { + DBG_BUGON(xattr_header_sz > vi->xattr_isize); + return -ENODATA; + } + + ret = erofs_init_metabuf(&it->buf, it->sb, erofs_inode_in_metabox(inode)); + if (ret) return ret; + remaining = vi->xattr_isize - xattr_header_sz; + it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; - remaining = ret; while (remaining) { - ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); - if (ret) + it->kaddr = erofs_bread(&it->buf, it->pos, true); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + entry_sz = erofs_xattr_entry_size(it->kaddr); + /* xattr on-disk corruption: xattr entry beyond xattr_isize */ + if (remaining < entry_sz) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + remaining -= entry_sz; + next_pos = it->pos + entry_sz; + + if (getxattr) + ret = erofs_getxattr_foreach(it); + else + ret = erofs_listxattr_foreach(it); + if ((getxattr && ret != -ENODATA) || (!getxattr && ret)) break; + + it->pos = next_pos; } - return ret ? ret : it->buffer_ofs; + return ret; } -static int shared_listxattr(struct listxattr_iter *it) +static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, + struct inode *inode, bool getxattr) { - struct inode *const inode = d_inode(it->dentry); struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = inode->i_sb; - struct erofs_sb_info *const sbi = EROFS_SB(sb); - unsigned int i; - int ret = 0; + struct super_block *const sb = it->sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int i = 0; + int ret; - for (i = 0; i < vi->xattr_shared_count; ++i) { - erofs_blk_t blkaddr = - xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); - - it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP); - if (IS_ERR(it->it.kaddr)) - return PTR_ERR(it->it.kaddr); - it->it.blkaddr = blkaddr; - - ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); - if (ret) + ret = erofs_init_metabuf(&it->buf, sb, + erofs_sb_has_shared_ea_in_metabox(sbi)); + if (ret) + return ret; + + while (i < vi->xattr_shared_count) { + it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + + vi->xattr_shared_xattrs[i++] * sizeof(__le32); + it->kaddr = erofs_bread(&it->buf, it->pos, true); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + if (getxattr) + ret = erofs_getxattr_foreach(it); + else + ret = erofs_listxattr_foreach(it); + if ((getxattr && ret != -ENODATA) || (!getxattr && ret)) break; } - return ret ? ret : it->buffer_ofs; + return i ? ret : -ENODATA; } -ssize_t erofs_listxattr(struct dentry *dentry, - char *buffer, size_t buffer_size) +int erofs_getxattr(struct inode *inode, int index, const char *name, + void *buffer, size_t buffer_size) { int ret; - struct listxattr_iter it; + unsigned int hashbit; + struct erofs_xattr_iter it; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + + if (!name) + return -EINVAL; + + ret = erofs_init_inode_xattrs(inode); + if (ret) + return ret; + + /* reserved flag is non-zero if there's any change of on-disk format */ + if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) { + hashbit = xxh32(name, strlen(name), + EROFS_XATTR_FILTER_SEED + index); + hashbit &= EROFS_XATTR_FILTER_BITS - 1; + if (vi->xattr_name_filter & (1U << hashbit)) + return -ENODATA; + } + + it.index = index; + it.name = QSTR(name); + if (it.name.len > EROFS_NAME_LEN) + return -ERANGE; - ret = init_inode_xattrs(d_inode(dentry)); - if (ret == -ENOATTR) + it.sb = inode->i_sb; + it.buf = __EROFS_BUF_INITIALIZER; + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + ret = erofs_xattr_iter_inline(&it, inode, true); + if (ret == -ENODATA) + ret = erofs_xattr_iter_shared(&it, inode, true); + erofs_put_metabuf(&it.buf); + return ret ? ret : it.buffer_ofs; +} + +ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + int ret; + struct erofs_xattr_iter it; + struct inode *inode = d_inode(dentry); + + ret = erofs_init_inode_xattrs(inode); + if (ret == -ENODATA) return 0; if (ret) return ret; - it.it.buf = __EROFS_BUF_INITIALIZER; + it.sb = dentry->d_sb; + it.buf = __EROFS_BUF_INITIALIZER; it.dentry = dentry; it.buffer = buffer; it.buffer_size = buffer_size; it.buffer_ofs = 0; - it.it.sb = dentry->d_sb; + ret = erofs_xattr_iter_inline(&it, inode, false); + if (!ret || ret == -ENODATA) + ret = erofs_xattr_iter_shared(&it, inode, false); + if (ret == -ENODATA) + ret = 0; + erofs_put_metabuf(&it.buf); + return ret ? ret : it.buffer_ofs; +} - ret = inline_listxattr(&it); - if (ret >= 0 || ret == -ENOATTR) - ret = shared_listxattr(&it); - erofs_put_metabuf(&it.it.buf); +void erofs_xattr_prefixes_cleanup(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + int i; + + if (sbi->xattr_prefixes) { + for (i = 0; i < sbi->xattr_prefix_count; i++) + kfree(sbi->xattr_prefixes[i].prefix); + kfree(sbi->xattr_prefixes); + sbi->xattr_prefixes = NULL; + } +} + +int erofs_xattr_prefixes_init(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2; + struct erofs_xattr_prefix_item *pfs; + int ret = 0, i, len; + bool plain = erofs_sb_has_plain_xattr_pfx(sbi); + + if (!sbi->xattr_prefix_count) + return 0; + + pfs = kcalloc(sbi->xattr_prefix_count, sizeof(*pfs), GFP_KERNEL); + if (!pfs) + return -ENOMEM; + + if (!plain) { + if (erofs_sb_has_metabox(sbi)) + (void)erofs_init_metabuf(&buf, sb, true); + else if (sbi->packed_inode) + buf.mapping = sbi->packed_inode->i_mapping; + else + plain = true; + } + if (plain) + (void)erofs_init_metabuf(&buf, sb, false); + + for (i = 0; i < sbi->xattr_prefix_count; i++) { + void *ptr = erofs_read_metadata(sb, &buf, &pos, &len); + + if (IS_ERR(ptr)) { + ret = PTR_ERR(ptr); + break; + } else if (len < sizeof(*pfs->prefix) || + len > EROFS_NAME_LEN + sizeof(*pfs->prefix)) { + kfree(ptr); + ret = -EFSCORRUPTED; + break; + } + pfs[i].prefix = ptr; + pfs[i].infix_len = len - sizeof(struct erofs_xattr_long_prefix); + } + + erofs_put_metabuf(&buf); + sbi->xattr_prefixes = pfs; + if (ret) + erofs_xattr_prefixes_cleanup(sb); return ret; } @@ -648,7 +554,7 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu) rc = erofs_getxattr(inode, prefix, "", value, rc); } - if (rc == -ENOATTR) + if (rc == -ENODATA) acl = NULL; else if (rc < 0) acl = ERR_PTR(rc); diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 0a43c9ee9f8f..6317caa8413e 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -10,46 +10,21 @@ #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> -/* Attribute not found */ -#define ENOATTR ENODATA - -static inline unsigned int inlinexattr_header_size(struct inode *inode) -{ - return sizeof(struct erofs_xattr_ibody_header) + - sizeof(u32) * EROFS_I(inode)->xattr_shared_count; -} - -static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi, - unsigned int xattr_id) -{ -#ifdef CONFIG_EROFS_FS_XATTR - return sbi->xattr_blkaddr + - xattr_id * sizeof(__u32) / EROFS_BLKSIZ; -#else - return 0; -#endif -} - -static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, - unsigned int xattr_id) -{ - return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; -} - #ifdef CONFIG_EROFS_FS_XATTR extern const struct xattr_handler erofs_xattr_user_handler; extern const struct xattr_handler erofs_xattr_trusted_handler; extern const struct xattr_handler erofs_xattr_security_handler; -static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) +static inline const char *erofs_xattr_prefix(unsigned int idx, + struct dentry *dentry) { - static const struct xattr_handler *xattr_handler_map[] = { + const struct xattr_handler *handler = NULL; + + static const struct xattr_handler * const xattr_handler_map[] = { [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, #ifdef CONFIG_EROFS_FS_POSIX_ACL - [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = - &posix_acl_access_xattr_handler, - [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = - &posix_acl_default_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY @@ -57,15 +32,24 @@ static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) #endif }; - return idx && idx < ARRAY_SIZE(xattr_handler_map) ? - xattr_handler_map[idx] : NULL; + if (idx && idx < ARRAY_SIZE(xattr_handler_map)) + handler = xattr_handler_map[idx]; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); } -extern const struct xattr_handler *erofs_xattr_handlers[]; +extern const struct xattr_handler * const erofs_xattr_handlers[]; +int erofs_xattr_prefixes_init(struct super_block *sb); +void erofs_xattr_prefixes_cleanup(struct super_block *sb); int erofs_getxattr(struct inode *, int, const char *, void *, size_t); ssize_t erofs_listxattr(struct dentry *, char *, size_t); #else +static inline int erofs_xattr_prefixes_init(struct super_block *sb) { return 0; } +static inline void erofs_xattr_prefixes_cleanup(struct super_block *sb) {} static inline int erofs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5200bb86e264..65da21504632 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -4,13 +4,116 @@ * https://www.huawei.com/ * Copyright (C) 2022 Alibaba Cloud */ -#include "zdata.h" #include "compress.h" -#include <linux/prefetch.h> #include <linux/psi.h> - +#include <linux/cpuhotplug.h> #include <trace/events/erofs.h> +#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) +#define Z_EROFS_INLINE_BVECS 2 + +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else; + * + * L: Field should be protected by the pcluster lock; + * + * A: Field should be accessed / updated in atomic for parallelized code. + */ +struct z_erofs_pcluster { + struct mutex lock; + struct lockref lockref; + + /* A: point to next chained pcluster or TAILs */ + struct z_erofs_pcluster *next; + + /* I: start physical position of this pcluster */ + erofs_off_t pos; + + /* L: the maximum decompression size of this round */ + unsigned int length; + + /* L: total number of bvecs */ + unsigned int vcnt; + + /* I: pcluster size (compressed size) in bytes */ + unsigned int pclustersize; + + /* I: page offset of start position of decompression */ + unsigned short pageofs_out; + + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; + + union { + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; + + /* I: can be used to free the pcluster by RCU. */ + struct rcu_head rcu; + }; + + /* I: compression algorithm format */ + unsigned char algorithmformat; + + /* I: whether compressed data is in-lined or not */ + bool from_meta; + + /* L: whether partial decompression or not */ + bool partial; + + /* L: whether extra buffer allocations are best-effort */ + bool besteffort; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; +}; + +/* the end of a chain of pclusters */ +#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) + +struct z_erofs_decompressqueue { + struct super_block *sb; + struct z_erofs_pcluster *head; + atomic_t pending_bios; + + union { + struct completion done; + struct work_struct work; + struct kthread_work kthread_work; + } u; + bool eio, sync; +}; + +static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) +{ + return PAGE_ALIGN(pcl->pageofs_in + pcl->pclustersize) >> PAGE_SHIFT; +} + +static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) +{ + return fo->mapping == MNGD_MAPPING(sbi); +} + +#define Z_EROFS_ONSTACK_PAGES 32 + /* * since pclustersize is variable for big pcluster feature, introduce slab * pools implementation for different pcluster sizes. @@ -25,7 +128,7 @@ struct z_erofs_pcluster_slab { static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), - _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) + _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES + 1) }; struct z_erofs_bvec_iter { @@ -76,14 +179,21 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, - struct page **candidate_bvpage) + struct page **candidate_bvpage, + struct page **pagepool) { - if (iter->cur == iter->nr) { - if (!*candidate_bvpage) - return -EAGAIN; - + if (iter->cur >= iter->nr) { + struct page *nextpage = *candidate_bvpage; + + if (!nextpage) { + nextpage = __erofs_allocpage(pagepool, GFP_KERNEL, + true); + if (!nextpage) + return -ENOMEM; + set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); + } DBG_BUGON(iter->bvset->nextpage); - iter->bvset->nextpage = *candidate_bvpage; + iter->bvset->nextpage = nextpage; z_erofs_bvset_flip(iter); iter->bvset->nextpage = NULL; @@ -138,21 +248,20 @@ static int z_erofs_create_pcluster_pool(void) return 0; } -static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) { - int i; + unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct z_erofs_pcluster_slab *pcs = pcluster_pool; - for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { - struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { struct z_erofs_pcluster *pcl; if (nrpages > pcs->maxpages) continue; - pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclusterpages = nrpages; return pcl; } return ERR_PTR(-EINVAL); @@ -175,121 +284,242 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* - * tagged pointer with 1-bit tag for all compressed pages - * tag 0 - the page is just found with an extra page reference - */ -typedef tagptr1_t compressed_page_t; +static struct workqueue_struct *z_erofs_workqueue __read_mostly; -#define tag_compressed_page_justfound(page) \ - tagptr_fold(compressed_page_t, page, 1) +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static struct kthread_worker __rcu **z_erofs_pcpu_workers; +static atomic_t erofs_percpu_workers_initialized = ATOMIC_INIT(0); -static struct workqueue_struct *z_erofs_workqueue __read_mostly; +static void erofs_destroy_percpu_workers(void) +{ + struct kthread_worker *worker; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + worker = rcu_dereference_protected( + z_erofs_pcpu_workers[cpu], 1); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + if (worker) + kthread_destroy_worker(worker); + } + kfree(z_erofs_pcpu_workers); +} -void z_erofs_exit_zip_subsystem(void) +static struct kthread_worker *erofs_init_percpu_worker(int cpu) { - destroy_workqueue(z_erofs_workqueue); - z_erofs_destroy_pcluster_pool(); + struct kthread_worker *worker = + kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u"); + + if (IS_ERR(worker)) + return worker; + if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI)) + sched_set_fifo_low(worker->task); + return worker; } -static inline int z_erofs_init_workqueue(void) +static int erofs_init_percpu_workers(void) { - const unsigned int onlinecpus = num_possible_cpus(); + struct kthread_worker *worker; + unsigned int cpu; - /* - * no need to spawn too many threads, limiting threads could minimum - * scheduling overhead, perhaps per-CPU threads should be better? - */ - z_erofs_workqueue = alloc_workqueue("erofs_unzipd", - WQ_UNBOUND | WQ_HIGHPRI, - onlinecpus + onlinecpus / 4); - return z_erofs_workqueue ? 0 : -ENOMEM; + z_erofs_pcpu_workers = kcalloc(num_possible_cpus(), + sizeof(struct kthread_worker *), GFP_ATOMIC); + if (!z_erofs_pcpu_workers) + return -ENOMEM; + + for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */ + worker = erofs_init_percpu_worker(cpu); + if (!IS_ERR(worker)) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + } + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); +static enum cpuhp_state erofs_cpuhp_state; + +static int erofs_cpu_online(unsigned int cpu) +{ + struct kthread_worker *worker, *old; + + worker = erofs_init_percpu_worker(cpu); + if (IS_ERR(worker)) + return PTR_ERR(worker); + + spin_lock(&z_erofs_pcpu_worker_lock); + old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + if (!old) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + spin_unlock(&z_erofs_pcpu_worker_lock); + if (old) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_offline(unsigned int cpu) +{ + struct kthread_worker *worker; + + spin_lock(&z_erofs_pcpu_worker_lock); + worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + spin_unlock(&z_erofs_pcpu_worker_lock); + + synchronize_rcu(); + if (worker) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_hotplug_init(void) +{ + int state; + + state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline); + if (state < 0) + return state; + + erofs_cpuhp_state = state; + return 0; +} + +static void erofs_cpu_hotplug_destroy(void) +{ + if (erofs_cpuhp_state) + cpuhp_remove_state_nocalls(erofs_cpuhp_state); } +#else /* !CONFIG_HOTPLUG_CPU */ +static inline int erofs_cpu_hotplug_init(void) { return 0; } +static inline void erofs_cpu_hotplug_destroy(void) {} +#endif/* CONFIG_HOTPLUG_CPU */ +static int z_erofs_init_pcpu_workers(struct super_block *sb) +{ + int err; -int __init z_erofs_init_zip_subsystem(void) + if (atomic_xchg(&erofs_percpu_workers_initialized, 1)) + return 0; + + err = erofs_init_percpu_workers(); + if (err) { + erofs_err(sb, "per-cpu workers: failed to allocate."); + goto err_init_percpu_workers; + } + + err = erofs_cpu_hotplug_init(); + if (err < 0) { + erofs_err(sb, "per-cpu workers: failed CPU hotplug init."); + goto err_cpuhp_init; + } + erofs_info(sb, "initialized per-cpu workers successfully."); + return err; + +err_cpuhp_init: + erofs_destroy_percpu_workers(); +err_init_percpu_workers: + atomic_set(&erofs_percpu_workers_initialized, 0); + return err; +} + +static void z_erofs_destroy_pcpu_workers(void) +{ + if (!atomic_xchg(&erofs_percpu_workers_initialized, 0)) + return; + erofs_cpu_hotplug_destroy(); + erofs_destroy_percpu_workers(); +} +#else /* !CONFIG_EROFS_FS_PCPU_KTHREAD */ +static inline int z_erofs_init_pcpu_workers(struct super_block *sb) { return 0; } +static inline void z_erofs_destroy_pcpu_workers(void) {} +#endif/* CONFIG_EROFS_FS_PCPU_KTHREAD */ + +void z_erofs_exit_subsystem(void) { - int err = z_erofs_create_pcluster_pool(); + z_erofs_destroy_pcpu_workers(); + destroy_workqueue(z_erofs_workqueue); + z_erofs_destroy_pcluster_pool(); + z_erofs_crypto_disable_all_engines(); + z_erofs_exit_decompressor(); +} + +int __init z_erofs_init_subsystem(void) +{ + int err = z_erofs_init_decompressor(); if (err) - return err; - err = z_erofs_init_workqueue(); + goto err_decompressor; + + err = z_erofs_create_pcluster_pool(); if (err) - z_erofs_destroy_pcluster_pool(); + goto err_pcluster_pool; + + z_erofs_workqueue = alloc_workqueue("erofs_worker", + WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); + if (!z_erofs_workqueue) { + err = -ENOMEM; + goto err_workqueue_init; + } + + return err; + +err_workqueue_init: + z_erofs_destroy_pcluster_pool(); +err_pcluster_pool: + z_erofs_exit_decompressor(); +err_decompressor: return err; } enum z_erofs_pclustermode { + /* It has previously been linked into another processing chain */ Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current pclusters was the tail of an exist chain, in addition - * that the previous processed chained pclusters are all decided to - * be hooked up to it. - * A new chain will be created for the remaining pclusters which are - * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, - * the next pcluster cannot reuse the whole page safely for inplace I/O - * in the following scenario: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (belongs to the next pcl) | (belongs to the current pcl) | - * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| - */ - Z_EROFS_PCLUSTER_HOOKED, - /* - * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it - * could be dispatched into bypass queue later due to uptodated managed - * pages. All related online pages cannot be reused for inplace I/O (or - * bvpage) since it can be directly decoded without I/O submission. + * A weaker form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it + * may be dispatched to the bypass queue later due to uptodated managed + * folios. All file-backed folios related to this pcluster cannot be + * reused for in-place I/O (or bvpage) since the pcluster may be decoded + * in a separate queue (and thus out of order). */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The current collection has been linked with the owned chain, and - * could also be linked with the remaining collections, which means - * if the processing page is the tail page of the collection, thus - * the current collection can safely use the whole page (since - * the previous collection is under control) for in-place I/O, as - * illustrated below: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current cl) | (of the previous collection) | - * | PCLUSTER_FOLLOWED or | | - * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| - * - * [ (*) the above page can be used as inplace I/O. ] + * The pcluster has just been linked to our processing chain. + * File-backed folios (except for the head page) related to it can be + * used for in-place I/O (or bvpage). */ Z_EROFS_PCLUSTER_FOLLOWED, }; -struct z_erofs_decompress_frontend { +struct z_erofs_frontend { struct inode *const inode; struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; + struct page *pagepool; struct page *candidate_bvpage; - struct z_erofs_pcluster *pcl, *tailpcl; - z_erofs_next_pcluster_t owned_head; + struct z_erofs_pcluster *pcl, *head; enum z_erofs_pclustermode mode; - bool readahead; - /* used for applying cache strategy on the fly */ - bool backmost; erofs_off_t headoffset; /* a pointer used to pick up inplace I/O pages */ unsigned int icur; }; -#define DECOMPRESS_FRONTEND_INIT(__i) { \ - .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } +#define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \ + .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \ + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho } -static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) +static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe) { unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) return false; - if (fe->backmost) + if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED)) return true; if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && @@ -299,223 +529,237 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) return false; } -static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - struct page **pagepool) +static void z_erofs_bind_cache(struct z_erofs_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); - bool standalone = true; - /* - * optimistic allocation without direct reclaim since inplace I/O - * can be used if low memory otherwise. - */ + pgoff_t poff = pcl->pos >> PAGE_SHIFT; + bool may_bypass = true; + /* Optimistic allocation, as in-place I/O can be used as a fallback */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + struct folio *folio, *newfolio; unsigned int i; - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) + if (i_blocksize(fe->inode) != PAGE_SIZE || + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page; - compressed_page_t t; - struct page *newpage = NULL; - - /* the compressed page was loaded before */ + for (i = 0; i < pclusterpages; ++i) { + /* Inaccurate check w/o locking to avoid unneeded lookups */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, pcl->obj.index + i); - - if (page) { - t = tag_compressed_page_justfound(page); - } else { - /* I/O is needed, no possible to decompress directly */ - standalone = false; + folio = filemap_get_folio(mc, poff + i); + if (IS_ERR(folio)) { + may_bypass = false; if (!shouldalloc) continue; /* - * try to use cached I/O if page allocation - * succeeds or fallback to in-place I/O instead - * to avoid any direct reclaim. + * Allocate a managed folio for cached I/O, or it may be + * then filled with a file-backed folio for in-place I/O */ - newpage = erofs_allocpage(pagepool, gfp); - if (!newpage) + newfolio = filemap_alloc_folio(gfp, 0, NULL); + if (!newfolio) continue; - set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); + newfolio->private = Z_EROFS_PREALLOCATED_FOLIO; + folio = NULL; } - - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, - tagptr_cast_ptr(t))) + spin_lock(&pcl->lockref.lock); + if (!pcl->compressed_bvecs[i].page) { + pcl->compressed_bvecs[i].page = + folio_page(folio ?: newfolio, 0); + spin_unlock(&pcl->lockref.lock); continue; - - if (page) - put_page(page); - else if (newpage) - erofs_pagepool_add(pagepool, newpage); + } + spin_unlock(&pcl->lockref.lock); + folio_put(folio ?: newfolio); } /* - * don't do inplace I/O if all compressed pages are available in - * managed cache since it can be moved to the bypass queue instead. + * Don't perform in-place I/O if all compressed pages are available in + * the managed cache, as the pcluster can be moved to the bypass queue. */ - if (standalone) + if (may_bypass) fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } -/* called by erofs_shrinker to get rid of all compressed_pages */ -int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp) +/* (erofs_shrinker) disconnect cached encoded data with pclusters */ +static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl) { - struct z_erofs_pcluster *const pcl = - container_of(grp, struct z_erofs_pcluster, obj); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + struct folio *folio; int i; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - /* - * refcount of workgroup is now freezed as 1, - * therefore no need to worry about available decompression users. - */ - for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page = pcl->compressed_bvecs[i].page; - - if (!page) - continue; - - /* block other users from reclaiming or migrating the page */ - if (!trylock_page(page)) - return -EBUSY; - - if (!erofs_page_is_managed(sbi, page)) - continue; + DBG_BUGON(pcl->from_meta); + /* Each cached folio contains one page unless bs > ps is supported */ + for (i = 0; i < pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page) { + folio = page_folio(pcl->compressed_bvecs[i].page); + /* Avoid reclaiming or migrating this folio */ + if (!folio_trylock(folio)) + return -EBUSY; - /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - detach_page_private(page); - unlock_page(page); + if (!erofs_folio_is_managed(sbi, folio)) + continue; + pcl->compressed_bvecs[i].page = NULL; + folio_detach_private(folio); + folio_unlock(folio); + } } return 0; } -int erofs_try_to_free_cached_page(struct page *page) +static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { - struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret, i; + struct z_erofs_pcluster *pcl = folio_get_private(folio); + struct z_erofs_bvec *bvec = pcl->compressed_bvecs; + struct z_erofs_bvec *end = bvec + z_erofs_pclusterpages(pcl); + bool ret; - if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) - return 0; + if (!folio_test_private(folio)) + return true; - ret = 0; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_bvecs[i].page == page) { - WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - ret = 1; - break; + ret = false; + spin_lock(&pcl->lockref.lock); + if (pcl->lockref.count <= 0) { + DBG_BUGON(pcl->from_meta); + for (; bvec < end; ++bvec) { + if (bvec->page && page_folio(bvec->page) == folio) { + bvec->page = NULL; + folio_detach_private(folio); + ret = true; + break; + } } } - erofs_workgroup_unfreeze(&pcl->obj, 1); - if (ret) - detach_page_private(page); + spin_unlock(&pcl->lockref.lock); return ret; } -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct z_erofs_bvec *bvec) +/* + * It will be called only on inode eviction. In case that there are still some + * decompression requests in progress, wait with rescheduling for a bit here. + * An extra lock could be introduced instead but it seems unnecessary. + */ +static void z_erofs_cache_invalidate_folio(struct folio *folio, + size_t offset, size_t length) { - struct z_erofs_pcluster *const pcl = fe->pcl; + const size_t stop = length + offset; - while (fe->icur > 0) { - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, - NULL, bvec->page)) { - pcl->compressed_bvecs[fe->icur] = *bvec; - return true; - } - } - return false; + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > folio_size(folio) || stop < length); + + if (offset == 0 && stop == folio_size(folio)) + while (!z_erofs_cache_release_folio(folio, 0)) + cond_resched(); +} + +static const struct address_space_operations z_erofs_cache_aops = { + .release_folio = z_erofs_cache_release_folio, + .invalidate_folio = z_erofs_cache_invalidate_folio, +}; + +int z_erofs_init_super(struct super_block *sb) +{ + struct inode *inode; + int err; + + err = z_erofs_init_pcpu_workers(sb); + if (err) + return err; + + inode = new_inode(sb); + if (!inode) + return -ENOMEM; + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &z_erofs_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); + EROFS_SB(sb)->managed_cache = inode; + xa_init(&EROFS_SB(sb)->managed_pslots); + return 0; } /* callers must be with pcluster lock held */ -static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, +static int z_erofs_attach_page(struct z_erofs_frontend *fe, struct z_erofs_bvec *bvec, bool exclusive) { + struct z_erofs_pcluster *pcl = fe->pcl; int ret; if (exclusive) { - /* give priority for inplaceio to use file pages first */ - if (z_erofs_try_inplace_io(fe, bvec)) - return 0; + /* Inplace I/O is limited to one page for uncompressed data */ + if (pcl->algorithmformat < Z_EROFS_COMPRESSION_MAX || + fe->icur <= 1) { + /* Try to prioritize inplace I/O here */ + spin_lock(&pcl->lockref.lock); + while (fe->icur > 0) { + if (pcl->compressed_bvecs[--fe->icur].page) + continue; + pcl->compressed_bvecs[fe->icur] = *bvec; + spin_unlock(&pcl->lockref.lock); + return 0; + } + spin_unlock(&pcl->lockref.lock); + } + /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } - ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, + &fe->pagepool); fe->pcl->vcnt += (ret >= 0); return ret; } -static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) +static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl) { - struct z_erofs_pcluster *pcl = f->pcl; - z_erofs_next_pcluster_t *owned_head = &f->owned_head; - - /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, - *owned_head) == Z_EROFS_PCLUSTER_NIL) { - *owned_head = &pcl->next; - /* so we can attach this pcluster to our submission chain. */ - f->mode = Z_EROFS_PCLUSTER_FOLLOWED; - return; - } + if (lockref_get_not_zero(&pcl->lockref)) + return true; - /* - * type 2, link to the end of an existing open chain, be careful - * that its submission is controlled by the original attached chain. - */ - if (*owned_head != &pcl->next && pcl != f->tailpcl && - cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - *owned_head) == Z_EROFS_PCLUSTER_TAIL) { - *owned_head = Z_EROFS_PCLUSTER_TAIL; - f->mode = Z_EROFS_PCLUSTER_HOOKED; - f->tailpcl = NULL; - return; + spin_lock(&pcl->lockref.lock); + if (__lockref_is_dead(&pcl->lockref)) { + spin_unlock(&pcl->lockref.lock); + return false; } - /* type 3, it belongs to a chain, but it isn't the end of the chain */ - f->mode = Z_EROFS_PCLUSTER_INFLIGHT; + + if (!pcl->lockref.count++) + atomic_long_dec(&erofs_global_shrink_cnt); + spin_unlock(&pcl->lockref.lock); + return true; } -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) +static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; - bool ztailpacking = map->m_flags & EROFS_MAP_META; - struct z_erofs_pcluster *pcl; - struct erofs_workgroup *grp; + struct super_block *sb = fe->inode->i_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct z_erofs_pcluster *pcl, *pre; + unsigned int pageofs_in; int err; - if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 : - map->m_plen >> PAGE_SHIFT); + pageofs_in = erofs_blkoff(sb, map->m_pa); + pcl = z_erofs_alloc_pcluster(pageofs_in + map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); - atomic_set(&pcl->obj.refcount, 1); + lockref_init(&pcl->lockref); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; + pcl->pclustersize = map->m_plen; pcl->length = 0; pcl->partial = true; - - /* new pclusters should be claimed as type 1, primary and followed */ - pcl->next = fe->owned_head; + pcl->next = fe->head; + pcl->pos = map->m_pa; + pcl->pageofs_in = pageofs_in; pcl->pageofs_out = map->m_la & ~PAGE_MASK; + pcl->from_meta = map->m_flags & EROFS_MAP_META; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* @@ -525,31 +769,29 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) mutex_init(&pcl->lock); DBG_BUGON(!mutex_trylock(&pcl->lock)); - if (ztailpacking) { - pcl->obj.index = 0; /* which indicates ztailpacking */ - pcl->pageofs_in = erofs_blkoff(map->m_pa); - pcl->tailpacking_size = map->m_plen; - } else { - pcl->obj.index = map->m_pa >> PAGE_SHIFT; - - grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); - if (IS_ERR(grp)) { - err = PTR_ERR(grp); - goto err_out; + if (!pcl->from_meta) { + while (1) { + xa_lock(&sbi->managed_pslots); + pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->pos, + NULL, pcl, GFP_KERNEL); + if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) { + xa_unlock(&sbi->managed_pslots); + break; + } + /* try to legitimize the current in-tree one */ + xa_unlock(&sbi->managed_pslots); + cond_resched(); } - - if (grp != &pcl->obj) { - fe->pcl = container_of(grp, - struct z_erofs_pcluster, obj); + if (xa_is_err(pre)) { + err = xa_err(pre); + goto err_out; + } else if (pre) { + fe->pcl = pre; err = -EEXIST; goto err_out; } } - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = pcl; - fe->owned_head = &pcl->next; - fe->pcl = pcl; + fe->head = fe->pcl = pcl; return 0; err_out: @@ -558,28 +800,33 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; - struct erofs_workgroup *grp = NULL; + struct super_block *sb = fe->inode->i_sb; + struct z_erofs_pcluster *pcl = NULL; + void *ptr; int ret; DBG_BUGON(fe->pcl); - /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ - DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); - DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + DBG_BUGON(!fe->head); if (!(map->m_flags & EROFS_MAP_META)) { - grp = erofs_find_workgroup(fe->inode->i_sb, - map->m_pa >> PAGE_SHIFT); - } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { - DBG_BUGON(1); - return -EFSCORRUPTED; + while (1) { + rcu_read_lock(); + pcl = xa_load(&EROFS_SB(sb)->managed_pslots, map->m_pa); + if (!pcl || z_erofs_get_pcluster(pcl)) { + DBG_BUGON(pcl && map->m_pa != pcl->pos); + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + } } - if (grp) { - fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + if (pcl) { + fe->pcl = pcl; ret = -EEXIST; } else { ret = z_erofs_register_pcluster(fe); @@ -587,249 +834,268 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = fe->pcl; - - z_erofs_try_to_claim_pcluster(fe); + /* check if this pcluster hasn't been linked into any chain. */ + if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) { + /* .. so it can be attached to our submission chain */ + fe->head = fe->pcl; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; + } else { /* otherwise, it belongs to an inflight chain */ + fe->mode = Z_EROFS_PCLUSTER_INFLIGHT; + } } else if (ret) { return ret; } + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - /* since file-backed online pages are traversed in reverse order */ + if (!fe->pcl->from_meta) { + /* bind cache first when cached decompression is preferred */ + z_erofs_bind_cache(fe); + } else { + ret = erofs_init_metabuf(&map->buf, sb, + erofs_inode_in_metabox(fe->inode)); + if (ret) + return ret; + ptr = erofs_bread(&map->buf, map->m_pa, false); + if (IS_ERR(ptr)) { + ret = PTR_ERR(ptr); + erofs_err(sb, "failed to get inline folio %d", ret); + return ret; + } + folio_get(page_folio(map->buf.page)); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; + } + /* file-backed inplace I/O pages are traversed in reverse order */ fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } -/* - * keep in mind that no referenced pclusters will be freed - * only after a RCU grace period. - */ static void z_erofs_rcu_callback(struct rcu_head *head) { - z_erofs_free_pcluster(container_of(head, - struct z_erofs_pcluster, rcu)); + z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu)); } -void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) +static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl) { - struct z_erofs_pcluster *const pcl = - container_of(grp, struct z_erofs_pcluster, obj); + if (pcl->lockref.count) + return false; + + /* + * Note that all cached folios should be detached before deleted from + * the XArray. Otherwise some folios could be still attached to the + * orphan old pcluster when the new one is available in the tree. + */ + if (erofs_try_to_free_all_cached_folios(sbi, pcl)) + return false; - call_rcu(&pcl->rcu, z_erofs_rcu_callback); + /* + * It's impossible to fail after the pcluster is freezed, but in order + * to avoid some race conditions, add a DBG_BUGON to observe this. + */ + DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->pos) != pcl); + + lockref_mark_dead(&pcl->lockref); + return true; } -static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) +static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl) +{ + bool free; + + spin_lock(&pcl->lockref.lock); + free = __erofs_try_to_release_pcluster(sbi, pcl); + spin_unlock(&pcl->lockref.lock); + if (free) { + atomic_long_dec(&erofs_global_shrink_cnt); + call_rcu(&pcl->rcu, z_erofs_rcu_callback); + } + return free; +} + +unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr) +{ + struct z_erofs_pcluster *pcl; + unsigned long index, freed = 0; + + xa_lock(&sbi->managed_pslots); + xa_for_each(&sbi->managed_pslots, index, pcl) { + /* try to shrink each valid pcluster */ + if (!erofs_try_to_release_pcluster(sbi, pcl)) + continue; + xa_unlock(&sbi->managed_pslots); + + ++freed; + if (!--nr) + return freed; + xa_lock(&sbi->managed_pslots); + } + xa_unlock(&sbi->managed_pslots); + return freed; +} + +static void z_erofs_put_pcluster(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl, bool try_free) +{ + bool free = false; + + if (lockref_put_or_lock(&pcl->lockref)) + return; + + DBG_BUGON(__lockref_is_dead(&pcl->lockref)); + if (!--pcl->lockref.count) { + if (try_free && xa_trylock(&sbi->managed_pslots)) { + free = __erofs_try_to_release_pcluster(sbi, pcl); + xa_unlock(&sbi->managed_pslots); + } + atomic_long_add(!free, &erofs_global_shrink_cnt); + } + spin_unlock(&pcl->lockref.lock); + if (free) + call_rcu(&pcl->rcu, z_erofs_rcu_callback); +} + +static void z_erofs_pcluster_end(struct z_erofs_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; if (!pcl) - return false; + return; z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); - if (fe->candidate_bvpage) { - DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; - } - /* - * if all pending pages are added, don't hold its reference - * any longer if the pcluster isn't hosted by ourselves. - */ + /* Drop refcount if it doesn't belong to our processing chain */ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) - erofs_workgroup_put(&pcl->obj); - + z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false); fe->pcl = NULL; - return true; } -static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, - struct page *page, unsigned int pageofs, - unsigned int len) +static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio, + unsigned int cur, unsigned int end, erofs_off_t pos) { - struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct inode *packed_inode = EROFS_SB(sb)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u8 *src, *dst; - unsigned int i, cnt; + unsigned int cnt; + u8 *src; if (!packed_inode) return -EFSCORRUPTED; - pos += EROFS_I(inode)->z_fragmentoff; - for (i = 0; i < len; i += cnt) { - cnt = min_t(unsigned int, len - i, - EROFS_BLKSIZ - erofs_blkoff(pos)); - src = erofs_bread(&buf, packed_inode, - erofs_blknr(pos), EROFS_KMAP); + buf.mapping = packed_inode->i_mapping; + for (; cur < end; cur += cnt, pos += cnt) { + cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); + src = erofs_bread(&buf, pos, true); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } - - dst = kmap_local_page(page); - memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt); - kunmap_local(dst); - pos += cnt; + memcpy_to_folio(folio, cur, src, cnt); } erofs_put_metabuf(&buf); return 0; } -static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct page **pagepool) +static int z_erofs_scan_folio(struct z_erofs_frontend *f, + struct folio *folio, bool ra) { - struct inode *const inode = fe->inode; - struct erofs_map_blocks *const map = &fe->map; - const loff_t offset = page_offset(page); - bool tight = true, exclusive; - unsigned int cur, end, spiltted; + struct inode *const inode = f->inode; + struct erofs_map_blocks *const map = &f->map; + const loff_t offset = folio_pos(folio); + const unsigned int bs = i_blocksize(inode); + unsigned int end = folio_size(folio), split = 0, cur, pgs; + bool tight, excl; int err = 0; - /* register locked file pages as online pages in pack */ - z_erofs_onlinepage_init(page); - - spiltted = 0; - end = PAGE_SIZE; -repeat: - cur = end - 1; - - if (offset + cur < map->m_la || - offset + cur >= map->m_la + map->m_llen) { - erofs_dbg("out-of-range map @ pos %llu", offset + cur); - - if (z_erofs_collector_end(fe)) - fe->backmost = false; - map->m_la = offset + cur; - map->m_llen = 0; - err = z_erofs_map_blocks_iter(inode, map, 0); - if (err) - goto out; - } else { - if (fe->pcl) - goto hitted; - /* didn't get a valid pcluster previously (very rare) */ - } - - if (!(map->m_flags & EROFS_MAP_MAPPED) || - map->m_flags & EROFS_MAP_FRAGMENT) - goto hitted; - - err = z_erofs_collector_begin(fe); - if (err) - goto out; - - if (z_erofs_is_inline_pcluster(fe->pcl)) { - void *mp; - - mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, - erofs_blknr(map->m_pa), EROFS_NO_KMAP); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); - erofs_err(inode->i_sb, - "failed to get inline page, err %d", err); - goto out; + tight = (bs == PAGE_SIZE); + erofs_onlinefolio_init(folio); + do { + if (offset + end - 1 < map->m_la || + offset + end - 1 >= map->m_la + map->m_llen) { + z_erofs_pcluster_end(f); + map->m_la = offset + end - 1; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + break; } - get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, - fe->map.buf.page); - fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; - } else { - /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe, pagepool); - } -hitted: - /* - * Ensure the current partial page belongs to this submit chain rather - * than other concurrent submit chains or the noio(bypass) chain since - * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or bvpage (should be processed in a strict order.) - */ - tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && - fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - cur = end - min_t(unsigned int, offset + end - map->m_la, end); - if (!(map->m_flags & EROFS_MAP_MAPPED)) { - zero_user_segment(page, cur, end); - goto next_part; - } - if (map->m_flags & EROFS_MAP_FRAGMENT) { - unsigned int pageofs, skip, len; - - if (offset > map->m_la) { - pageofs = 0; - skip = offset - map->m_la; + cur = offset > map->m_la ? 0 : map->m_la - offset; + pgs = round_down(cur, PAGE_SIZE); + /* bump split parts first to avoid several separate cases */ + ++split; + + if (!(map->m_flags & EROFS_MAP_MAPPED)) { + folio_zero_segment(folio, cur, end); + tight = false; + } else if (map->m_flags & __EROFS_MAP_FRAGMENT) { + erofs_off_t fpos = offset + cur - map->m_la; + + err = z_erofs_read_fragment(inode->i_sb, folio, cur, + cur + min(map->m_llen - fpos, end - cur), + EROFS_I(inode)->z_fragmentoff + fpos); + if (err) + break; + tight = false; } else { - pageofs = map->m_la & ~PAGE_MASK; - skip = 0; - } - len = min_t(unsigned int, map->m_llen - skip, end - cur); - err = z_erofs_read_fragment(inode, skip, page, pageofs, len); - if (err) - goto out; - ++spiltted; - tight = false; - goto next_part; - } - - exclusive = (!cur && (!spiltted || tight)); - if (cur) - tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); - -retry: - err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { - .page = page, - .offset = offset - map->m_la, - .end = end, - }), exclusive); - /* should allocate an additional short-lived page for bvset */ - if (err == -EAGAIN && !fe->candidate_bvpage) { - fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); - set_page_private(fe->candidate_bvpage, - Z_EROFS_SHORTLIVED_PAGE); - goto retry; - } - - if (err) { - DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); - goto out; - } - - z_erofs_onlinepage_split(page); - /* bump up the number of spiltted parts of a page */ - ++spiltted; - if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) - fe->pcl->multibases = true; - if (fe->pcl->length < offset + end - map->m_la) { - fe->pcl->length = offset + end - map->m_la; - fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; - } - if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && - !(map->m_flags & EROFS_MAP_PARTIAL_REF) && - fe->pcl->length == map->m_llen) - fe->pcl->partial = false; -next_part: - /* shorten the remaining extent to update progress */ - map->m_llen = offset + cur - map->m_la; - map->m_flags &= ~EROFS_MAP_FULL_MAPPED; - - end = cur; - if (end > 0) - goto repeat; + if (!f->pcl) { + err = z_erofs_pcluster_begin(f); + if (err) + break; + f->pcl->besteffort |= !ra; + } -out: - if (err) - z_erofs_page_mark_eio(page); - z_erofs_onlinepage_endio(page); + pgs = round_down(end - 1, PAGE_SIZE); + /* + * Ensure this partial page belongs to this submit chain + * rather than other concurrent submit chains or + * noio(bypass) chains since those chains are handled + * asynchronously thus it cannot be used for inplace I/O + * or bvpage (should be processed in the strict order.) + */ + tight &= (f->mode >= Z_EROFS_PCLUSTER_FOLLOWED); + excl = false; + if (cur <= pgs) { + excl = (split <= 1) || tight; + cur = pgs; + } - erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", - __func__, page, spiltted, map->m_llen); + err = z_erofs_attach_page(f, &((struct z_erofs_bvec) { + .page = folio_page(folio, pgs >> PAGE_SHIFT), + .offset = offset + pgs - map->m_la, + .end = end - pgs, }), excl); + if (err) + break; + + erofs_onlinefolio_split(folio); + if (f->pcl->length < offset + end - map->m_la) { + f->pcl->length = offset + end - map->m_la; + f->pcl->pageofs_out = map->m_la & ~PAGE_MASK; + } + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + !(map->m_flags & EROFS_MAP_PARTIAL_REF) && + f->pcl->length == map->m_llen) + f->pcl->partial = false; + } + /* shorten the remaining extent to update progress */ + map->m_llen = offset + cur - map->m_la; + map->m_flags &= ~EROFS_MAP_FULL_MAPPED; + if (cur <= pgs) { + split = cur < pgs; + tight = (bs == PAGE_SIZE); + } + } while ((end = cur) > 0); + erofs_onlinefolio_end(folio, err, false); return err; } -static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, +static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, unsigned int readahead_pages) { /* auto: enable for read_folio, disable for readahead */ @@ -846,14 +1112,13 @@ static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, static bool z_erofs_page_is_invalidated(struct page *page) { - return !page->mapping && !z_erofs_is_shortlived_page(page); + return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page); } -struct z_erofs_decompress_backend { +struct z_erofs_backend { struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; struct super_block *sb; struct z_erofs_pcluster *pcl; - /* pages with the longest decompressed length for deduplication */ struct page **decompressed_pages; /* pages to keep the compressed data */ @@ -862,6 +1127,8 @@ struct z_erofs_decompress_backend { struct list_head decompressed_secondary_bvecs; struct page **pagepool; unsigned int onstack_used, nr_pages; + /* indicate if temporary copies should be preserved for later use */ + bool keepxcpy; }; struct z_erofs_bvec_item { @@ -869,20 +1136,23 @@ struct z_erofs_bvec_item { struct list_head list; }; -static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, +static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be, struct z_erofs_bvec *bvec) { + int poff = bvec->offset + be->pcl->pageofs_out; struct z_erofs_bvec_item *item; - - if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { - unsigned int pgnr; - - pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; - DBG_BUGON(pgnr >= be->nr_pages); - if (!be->decompressed_pages[pgnr]) { - be->decompressed_pages[pgnr] = bvec->page; + struct page **page; + + if (!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE || + bvec->offset + bvec->end == be->pcl->length)) { + DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages); + page = be->decompressed_pages + (poff >> PAGE_SHIFT); + if (!*page) { + *page = bvec->page; return; } + } else { + be->keepxcpy = true; } /* (cold path) one pcluster is requested multiple times */ @@ -891,8 +1161,7 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, list_add(&item->list, &be->decompressed_secondary_bvecs); } -static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, - int err) +static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err) { unsigned int off0 = be->pcl->pageofs_out; struct list_head *p, *n; @@ -927,15 +1196,13 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_local(dst); - if (err) - z_erofs_page_mark_eio(bvi->bvec.page); - z_erofs_onlinepage_endio(bvi->bvec.page); + erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true); list_del(p); kfree(bvi); } } -static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +static void z_erofs_parse_out_bvecs(struct z_erofs_backend *be) { struct z_erofs_pcluster *pcl = be->pcl; struct z_erofs_bvec_iter biter; @@ -960,8 +1227,7 @@ static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); } -static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, - bool *overlapped) +static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped) { struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); @@ -972,46 +1238,42 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; - /* compressed pages ought to be present before decompressing */ - if (!page) { - DBG_BUGON(1); + /* compressed data ought to be valid when decompressing */ + if (IS_ERR(page) || !page) { + bvec->page = NULL; /* clear the failure reason */ + err = page ? PTR_ERR(page) : -EIO; continue; } be->compressed_pages[i] = page; - if (z_erofs_is_inline_pcluster(pcl)) { + if (pcl->from_meta || + erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) { if (!PageUptodate(page)) err = -EIO; continue; } DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { - if (!PageUptodate(page)) - err = -EIO; - continue; - } - z_erofs_do_decompressed_bvec(be, bvec); - *overlapped = true; - } + if (z_erofs_is_shortlived_page(page)) + continue; + z_erofs_do_decompressed_bvec(be, bvec); + *overlapped = true; } - - if (err) - return err; - return 0; + return err; } -static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, - int err) +static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) { struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - unsigned int i, inputsize; - int err2; + const struct z_erofs_decompressor *alg = + z_erofs_decomp[pcl->algorithmformat]; + bool try_free = true; + int i, j, jtop, err2; struct page *page; bool overlapped; + const char *reason; mutex_lock(&pcl->lock); be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; @@ -1032,115 +1294,133 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (!be->decompressed_pages) be->decompressed_pages = - kcalloc(be->nr_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_NOFAIL); + kvcalloc(be->nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); if (!be->compressed_pages) be->compressed_pages = - kcalloc(pclusterpages, sizeof(struct page *), - GFP_KERNEL | __GFP_NOFAIL); + kvcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); z_erofs_parse_out_bvecs(be); err2 = z_erofs_parse_in_bvecs(be, &overlapped); if (err2) err = err2; - if (err) - goto out; - - if (z_erofs_is_inline_pcluster(pcl)) - inputsize = pcl->tailpacking_size; - else - inputsize = pclusterpages * PAGE_SIZE; - - err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + if (!err) { + reason = alg->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, + .inpages = pclusterpages, + .outpages = be->nr_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, - .inputsize = inputsize, + .inputsize = pcl->pclustersize, .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, .partial_decoding = pcl->partial, - .fillgaps = pcl->multibases, + .fillgaps = be->keepxcpy, + .gfp = pcl->besteffort ? GFP_KERNEL : + GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); + if (IS_ERR(reason)) { + erofs_err(be->sb, "failed to decompress (%s) %ld @ pa %llu size %u => %u", + alg->name, PTR_ERR(reason), pcl->pos, + pcl->pclustersize, pcl->length); + err = PTR_ERR(reason); + } else if (unlikely(reason)) { + erofs_err(be->sb, "failed to decompress (%s) %s @ pa %llu size %u => %u", + alg->name, reason, pcl->pos, + pcl->pclustersize, pcl->length); + err = -EFSCORRUPTED; + } + } -out: /* must handle all compressed pages before actual file pages */ - if (z_erofs_is_inline_pcluster(pcl)) { - page = pcl->compressed_bvecs[0].page; + if (pcl->from_meta) { + folio_put(page_folio(pcl->compressed_bvecs[0].page)); WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); - put_page(page); } else { + /* managed folios are still left in compressed_bvecs[] */ for (i = 0; i < pclusterpages; ++i) { - page = pcl->compressed_bvecs[i].page; - - if (erofs_page_is_managed(sbi, page)) + page = be->compressed_pages[i]; + if (!page) continue; - - /* recycle all individual short-lived pages */ + if (erofs_folio_is_managed(sbi, page_folio(page))) { + try_free = false; + continue; + } (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } if (be->compressed_pages < be->onstack_pages || be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) - kfree(be->compressed_pages); - z_erofs_fill_other_copies(be, err); + kvfree(be->compressed_pages); + jtop = 0; + z_erofs_fill_other_copies(be, err); for (i = 0; i < be->nr_pages; ++i) { page = be->decompressed_pages[i]; if (!page) continue; DBG_BUGON(z_erofs_page_is_invalidated(page)); - - /* recycle all individual short-lived pages */ - if (z_erofs_put_shortlivedpage(be->pagepool, page)) + if (!z_erofs_is_shortlived_page(page)) { + erofs_onlinefolio_end(page_folio(page), err, true); + continue; + } + if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) { + erofs_pagepool_add(be->pagepool, page); continue; - if (err) - z_erofs_page_mark_eio(page); - z_erofs_onlinepage_endio(page); + } + for (j = 0; j < jtop && be->decompressed_pages[j] != page; ++j) + ; + if (j >= jtop) /* this bounce page is newly detected */ + be->decompressed_pages[jtop++] = page; } - + while (jtop) + erofs_pagepool_add(be->pagepool, + be->decompressed_pages[--jtop]); if (be->decompressed_pages != be->onstack_pages) - kfree(be->decompressed_pages); + kvfree(be->decompressed_pages); pcl->length = 0; pcl->partial = true; - pcl->multibases = false; + pcl->besteffort = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ - WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); + WRITE_ONCE(pcl->next, NULL); mutex_unlock(&pcl->lock); + + if (pcl->from_meta) + z_erofs_free_pcluster(pcl); + else + z_erofs_put_pcluster(sbi, pcl, try_free); return err; } -static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, - struct page **pagepool) +static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, + struct page **pagepool) { - struct z_erofs_decompress_backend be = { + struct z_erofs_backend be = { .sb = io->sb, .pagepool = pagepool, .decompressed_secondary_bvecs = LIST_HEAD_INIT(be.decompressed_secondary_bvecs), + .pcl = io->head, }; - z_erofs_next_pcluster_t owned = io->head; + struct z_erofs_pcluster *next; + int err = io->eio ? -EIO : 0; - while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ - DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ - DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - - be.pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(be.pcl->next); - - z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); - erofs_workgroup_put(&be.pcl->obj); + for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) { + DBG_BUGON(!be.pcl); + next = READ_ONCE(be.pcl->next); + err = z_erofs_decompress_pcluster(&be, err) ?: err; } + return err; } static void z_erofs_decompressqueue_work(struct work_struct *work) @@ -1149,20 +1429,36 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) container_of(work, struct z_erofs_decompressqueue, u.work); struct page *pagepool = NULL; - DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL); z_erofs_decompress_queue(bgq, &pagepool); - erofs_release_pages(&pagepool); kvfree(bgq); } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) +{ + z_erofs_decompressqueue_work((struct work_struct *)work); +} +#endif + +/* Use (kthread_)work in atomic contexts to minimize scheduling overhead */ +static inline bool z_erofs_in_atomic(void) +{ + if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth()) + return true; + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return true; + return !preemptible(); +} + static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, - bool sync, int bios) + int bios) { struct erofs_sb_info *const sbi = EROFS_SB(io->sb); /* wake up the caller thread for sync decompression */ - if (sync) { + if (io->sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); return; @@ -1170,9 +1466,23 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; - /* Use workqueue and sync decompression for atomic contexts only */ - if (in_atomic() || irqs_disabled()) { + if (z_erofs_in_atomic()) { +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + struct kthread_worker *worker; + + rcu_read_lock(); + worker = rcu_dereference( + z_erofs_pcpu_workers[raw_smp_processor_id()]); + if (!worker) { + INIT_WORK(&io->u.work, z_erofs_decompressqueue_work); + queue_work(z_erofs_workqueue, &io->u.work); + } else { + kthread_queue_work(worker, &io->u.kthread_work); + } + rcu_read_unlock(); +#else queue_work(z_erofs_workqueue, &io->u.work); +#endif /* enable sync decompression for readahead */ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; @@ -1181,122 +1491,112 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, z_erofs_decompressqueue_work(&io->u.work); } -static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, - unsigned int nr, - struct page **pagepool, - struct address_space *mc) +static void z_erofs_fill_bio_vec(struct bio_vec *bvec, + struct z_erofs_frontend *f, + struct z_erofs_pcluster *pcl, + unsigned int nr, + struct address_space *mc) { - const pgoff_t index = pcl->obj.index; gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - + struct z_erofs_bvec zbv; struct address_space *mapping; - struct page *oldpage, *page; - - compressed_page_t t; - int justfound; + struct folio *folio; + struct page *page; + int bs = i_blocksize(f->inode); + /* Except for inplace folios, the entire folio can be used for I/Os */ + bvec->bv_offset = 0; + bvec->bv_len = PAGE_SIZE; repeat: - page = READ_ONCE(pcl->compressed_bvecs[nr].page); - oldpage = page; - - if (!page) - goto out_allocpage; - - /* process the target tagged pointer */ - t = tagptr_init(compressed_page_t, page); - justfound = tagptr_unfold_tags(t); - page = tagptr_unfold_ptr(t); - - /* - * preallocated cached pages, which is used to avoid direct reclaim - * otherwise, it will go inplace I/O path instead. - */ - if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - set_page_private(page, 0); + spin_lock(&pcl->lockref.lock); + zbv = pcl->compressed_bvecs[nr]; + spin_unlock(&pcl->lockref.lock); + if (!zbv.page) + goto out_allocfolio; + + bvec->bv_page = zbv.page; + DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page)); + + folio = page_folio(zbv.page); + /* For preallocated managed folios, add them to page cache here */ + if (folio->private == Z_EROFS_PREALLOCATED_FOLIO) { tocache = true; goto out_tocache; } - mapping = READ_ONCE(page->mapping); + mapping = READ_ONCE(folio->mapping); /* - * file-backed online pages in plcuster are all locked steady, - * therefore it is impossible for `mapping' to be NULL. + * File-backed folios for inplace I/Os are all locked steady, + * therefore it is impossible for `mapping` to be NULL. */ - if (mapping && mapping != mc) - /* ought to be unmanaged pages */ - goto out; - - /* directly return for shortlived page as well */ - if (z_erofs_is_shortlived_page(page)) - goto out; - - lock_page(page); - - /* only true if page reclaim goes wrong, should never happen */ - DBG_BUGON(justfound && PagePrivate(page)); - - /* the page is still in manage cache */ - if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - - if (!PagePrivate(page)) { - /* - * impossible to be !PagePrivate(page) for - * the current restriction as well if - * the page is already in compressed_bvecs[]. - */ - DBG_BUGON(!justfound); + if (mapping && mapping != mc) { + if (zbv.offset < 0) + bvec->bv_offset = round_up(-zbv.offset, bs); + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset; + return; + } - justfound = 0; - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + folio_lock(folio); + if (likely(folio->mapping == mc)) { + /* + * The cached folio is still in managed cache but without + * a valid `->private` pcluster hint. Let's reconnect them. + */ + if (!folio_test_private(folio)) { + folio_attach_private(folio, pcl); + /* compressed_bvecs[] already takes a ref before */ + folio_put(folio); } - - /* no need to submit io if it is already up-to-date */ - if (PageUptodate(page)) { - unlock_page(page); - page = NULL; + if (likely(folio->private == pcl)) { + /* don't submit cache I/Os again if already uptodate */ + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + bvec->bv_page = NULL; + } + return; } - goto out; + /* + * Already linked with another pcluster, which only appears in + * crafted images by fuzzers for now. But handle this anyway. + */ + tocache = false; /* use temporary short-lived pages */ + } else { + DBG_BUGON(1); /* referenced managed folios can't be truncated */ + tocache = true; } - - /* - * the managed page has been truncated, it's unsafe to - * reuse this one, let's allocate a new cache-managed page. - */ - DBG_BUGON(page->mapping); - DBG_BUGON(!justfound); - - tocache = true; - unlock_page(page); - put_page(page); -out_allocpage: - page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, - oldpage, page)) { - erofs_pagepool_add(pagepool, page); + folio_unlock(folio); + folio_put(folio); +out_allocfolio: + page = __erofs_allocpage(&f->pagepool, gfp, true); + spin_lock(&pcl->lockref.lock); + if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) { + if (page) + erofs_pagepool_add(&f->pagepool, page); + spin_unlock(&pcl->lockref.lock); cond_resched(); goto repeat; } + pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM); + spin_unlock(&pcl->lockref.lock); + bvec->bv_page = page; + if (!page) + return; + folio = page_folio(page); out_tocache: - if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* turn into temporary page if fails (1 ref) */ - set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); - goto out; + if (!tocache || bs != PAGE_SIZE || + filemap_add_folio(mc, folio, (pcl->pos >> PAGE_SHIFT) + nr, gfp)) { + /* turn into a temporary shortlived folio (1 ref) */ + folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; + return; } - attach_page_private(page, pcl); - /* drop a refcount added by allocpage (then we have 2 refs here) */ - put_page(page); - -out: /* the only exit (for tracing and debugging) */ - return page; + folio_attach_private(folio, pcl); + /* drop a refcount added by allocpage (then 2 refs in total here) */ + folio_put(folio); } -static struct z_erofs_decompressqueue * -jobqueue_init(struct super_block *sb, - struct z_erofs_decompressqueue *fgq, bool *fg) +static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, + struct z_erofs_decompressqueue *fgq, bool *fg) { struct z_erofs_decompressqueue *q; @@ -1306,16 +1606,22 @@ jobqueue_init(struct super_block *sb, *fg = true; goto fg_out; } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + kthread_init_work(&q->u.kthread_work, + z_erofs_decompressqueue_kthread_work); +#else INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); +#endif } else { fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); q->eio = false; + q->sync = true; } q->sb = sb; - q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + q->head = Z_EROFS_PCLUSTER_TAIL; return q; } @@ -1326,133 +1632,102 @@ enum { NR_JOBQUEUES, }; -static void *jobqueueset_init(struct super_block *sb, - struct z_erofs_decompressqueue *q[], - struct z_erofs_decompressqueue *fgq, bool *fg) +static void z_erofs_move_to_bypass_queue(struct z_erofs_pcluster *pcl, + struct z_erofs_pcluster *next, + struct z_erofs_pcluster **qtail[]) { - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. - */ - q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); - q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg); - - return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg)); -} - -static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, - z_erofs_next_pcluster_t qtail[], - z_erofs_next_pcluster_t owned_head) -{ - z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; - z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; - - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (owned_head == Z_EROFS_PCLUSTER_TAIL) - owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED; - - WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED); - - WRITE_ONCE(*submit_qtail, owned_head); - WRITE_ONCE(*bypass_qtail, &pcl->next); - + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); + WRITE_ONCE(*qtail[JQ_SUBMIT], next); + WRITE_ONCE(*qtail[JQ_BYPASS], pcl); qtail[JQ_BYPASS] = &pcl->next; } -static void z_erofs_decompressqueue_endio(struct bio *bio) +static void z_erofs_endio(struct bio *bio) { - tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); - struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); + struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; - DBG_BUGON(PageUptodate(page)); - DBG_BUGON(z_erofs_page_is_invalidated(page)); + DBG_BUGON(folio_test_uptodate(folio)); + DBG_BUGON(z_erofs_page_is_invalidated(&folio->page)); + if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio)) + continue; - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { - if (!err) - SetPageUptodate(page); - unlock_page(page); - } + if (!err) + folio_mark_uptodate(folio); + folio_unlock(folio); } if (err) q->eio = true; - z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); - bio_put(bio); + z_erofs_decompress_kickoff(q, -1); + if (bio->bi_bdev) + bio_put(bio); } -static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, +static void z_erofs_submit_queue(struct z_erofs_frontend *f, struct z_erofs_decompressqueue *fgq, - bool *force_fg) + bool *force_fg, bool readahead) { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); - z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; + struct z_erofs_pcluster **qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; - void *bi_private; - z_erofs_next_pcluster_t owned_head = f->owned_head; + struct z_erofs_pcluster *pcl, *next; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ - pgoff_t last_index; - struct block_device *last_bdev; + erofs_off_t last_pa; unsigned int nr_bios = 0; struct bio *bio = NULL; unsigned long pflags; int memstall = 0; - bi_private = jobqueueset_init(sb, q, fgq, force_fg); + /* No need to read from device for pclusters in the bypass queue. */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; /* by default, all need io submission */ - q[JQ_SUBMIT]->head = owned_head; + q[JQ_SUBMIT]->head = next = f->head; do { struct erofs_map_dev mdev; - struct z_erofs_pcluster *pcl; - pgoff_t cur, end; + erofs_off_t cur, end; + struct bio_vec bvec; unsigned int i = 0; bool bypass = true; - /* no possible 'owned_head' equals the following */ - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); - - pcl = container_of(owned_head, struct z_erofs_pcluster, next); - - /* close the main owned chain at first */ - owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (z_erofs_is_inline_pcluster(pcl)) { - move_to_bypass_jobqueue(pcl, qtail, owned_head); + pcl = next; + next = READ_ONCE(pcl->next); + if (pcl->from_meta) { + z_erofs_move_to_bypass_queue(pcl, next, qtail); continue; } /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { - .m_pa = blknr_to_addr(pcl->obj.index), + .m_pa = round_down(pcl->pos, sb->s_blocksize), }; (void)erofs_map_dev(sb, &mdev); - cur = erofs_blknr(mdev.m_pa); - end = cur + pcl->pclusterpages; - + cur = mdev.m_pa; + end = round_up(cur + pcl->pageofs_in + pcl->pclustersize, + sb->s_blocksize); do { - struct page *page; - - page = pickup_page_for_submission(pcl, i++, pagepool, - mc); - if (!page) - continue; + bvec.bv_page = NULL; + if (bio && (cur != last_pa || + bio->bi_bdev != mdev.m_bdev)) { +drain_io: + if (erofs_is_fileio_mode(EROFS_SB(sb))) + erofs_fileio_submit_bio(bio); + else if (erofs_is_fscache_mode(sb)) + erofs_fscache_submit_bio(bio); + else + submit_bio(bio); - if (bio && (cur != last_index + 1 || - last_bdev != mdev.m_bdev)) { -submit_bio_retry: - submit_bio(bio); if (memstall) { psi_memstall_leave(&pflags); memstall = 0; @@ -1460,132 +1735,144 @@ submit_bio_retry: bio = NULL; } - if (unlikely(PageWorkingset(page)) && !memstall) { + if (!bvec.bv_page) { + z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); + if (!bvec.bv_page) + continue; + if (cur + bvec.bv_len > end) + bvec.bv_len = end - cur; + DBG_BUGON(bvec.bv_len < sb->s_blocksize); + } + + if (unlikely(PageWorkingset(bvec.bv_page)) && + !memstall) { psi_memstall_enter(&pflags); memstall = 1; } if (!bio) { - bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, - REQ_OP_READ, GFP_NOIO); - bio->bi_end_io = z_erofs_decompressqueue_endio; - - last_bdev = mdev.m_bdev; - bio->bi_iter.bi_sector = (sector_t)cur << - LOG_SECTORS_PER_BLOCK; - bio->bi_private = bi_private; - if (f->readahead) + if (erofs_is_fileio_mode(EROFS_SB(sb))) + bio = erofs_fileio_bio_alloc(&mdev); + else if (erofs_is_fscache_mode(sb)) + bio = erofs_fscache_bio_alloc(&mdev); + else + bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, + REQ_OP_READ, GFP_NOIO); + bio->bi_end_io = z_erofs_endio; + bio->bi_iter.bi_sector = + (mdev.m_dif->fsoff + cur) >> 9; + bio->bi_private = q[JQ_SUBMIT]; + if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) - goto submit_bio_retry; - - last_index = cur; + if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, + bvec.bv_offset)) + goto drain_io; + last_pa = cur + bvec.bv_len; bypass = false; - } while (++cur < end); + } while ((cur += bvec.bv_len) < end); if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; else - move_to_bypass_jobqueue(pcl, qtail, owned_head); - } while (owned_head != Z_EROFS_PCLUSTER_TAIL); + z_erofs_move_to_bypass_queue(pcl, next, qtail); + } while (next != Z_EROFS_PCLUSTER_TAIL); if (bio) { - submit_bio(bio); - if (memstall) - psi_memstall_leave(&pflags); + if (erofs_is_fileio_mode(EROFS_SB(sb))) + erofs_fileio_submit_bio(bio); + else if (erofs_is_fscache_mode(sb)) + erofs_fscache_submit_bio(bio); + else + submit_bio(bio); } + if (memstall) + psi_memstall_leave(&pflags); /* * although background is preferred, no one is pending for submission. - * don't issue workqueue for decompression but drop it directly instead. + * don't issue decompression but drop it directly instead. */ if (!*force_fg && !nr_bios) { kvfree(q[JQ_SUBMIT]); return; } - z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); + z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } -static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, bool force_fg) +static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; + struct erofs_sb_info *sbi = EROFS_I_SB(f->inode); + bool force_fg = z_erofs_is_sync_decompress(sbi, rapages); + int err; - if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) - return; - z_erofs_submit_queue(f, pagepool, io, &force_fg); + if (f->head == Z_EROFS_PCLUSTER_TAIL) + return 0; + z_erofs_submit_queue(f, io, &force_fg, !!rapages); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); - + err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) - return; + return err; /* wait until all bios are completed */ wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); + return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err; } /* * Since partial uptodate is still unimplemented for now, we have to use * approximate readmore strategies as a start. */ -static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, - struct readahead_control *rac, - erofs_off_t end, - struct page **pagepool, - bool backmost) +static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, + struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; - erofs_off_t cur; + erofs_off_t cur, end, headoffset = f->headoffset; int err; if (backmost) { + if (rac) + end = headoffset + readahead_length(rac) - 1; + else + end = headoffset + PAGE_SIZE - 1; map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); - if (err) + if (err || !(map->m_flags & EROFS_MAP_ENCODED)) return; - /* expend ra for the trailing edge if readahead */ + /* expand ra for the trailing edge if readahead */ if (rac) { - loff_t newstart = readahead_pos(rac); - cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); - readahead_expand(rac, newstart, cur - newstart); + readahead_expand(rac, headoffset, cur - headoffset); return; } end = round_up(end, PAGE_SIZE); } else { end = round_up(map->m_la, PAGE_SIZE); - - if (!map->m_llen) + if (!(map->m_flags & EROFS_MAP_ENCODED) || !map->m_llen) return; } cur = map->m_la + map->m_llen - 1; - while (cur >= end) { + while ((cur >= end) && (cur < i_size_read(inode))) { pgoff_t index = cur >> PAGE_SHIFT; - struct page *page; - - page = erofs_grab_cache_page_nowait(inode->i_mapping, index); - if (page) { - if (PageUptodate(page)) { - unlock_page(page); - } else { - err = z_erofs_do_read_page(f, page, pagepool); - if (err) - erofs_err(inode->i_sb, - "readmore error at page %lu @ nid %llu", - index, EROFS_I(inode)->nid); - } - put_page(page); + struct folio *folio; + + folio = erofs_grab_folio_nowait(inode->i_mapping, index); + if (!IS_ERR_OR_NULL(folio)) { + if (folio_test_uptodate(folio)) + folio_unlock(folio); + else + z_erofs_scan_folio(f, folio, !!rac); + folio_put(folio); } if (cur < PAGE_SIZE) @@ -1596,77 +1883,58 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *const inode = page->mapping->host; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL; + struct inode *const inode = folio->mapping->host; + Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio)); int err; - trace_erofs_readpage(page, false); - f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - - z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, - &pagepool, true); - err = z_erofs_do_read_page(&f, page, &pagepool); - z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); - - (void)z_erofs_collector_end(&f); - - /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, 0)); + trace_erofs_read_folio(folio, false); + z_erofs_pcluster_readmore(&f, NULL, true); + err = z_erofs_scan_folio(&f, folio, false); + z_erofs_pcluster_readmore(&f, NULL, false); + z_erofs_pcluster_end(&f); - if (err) - erofs_err(inode->i_sb, "failed to read, err [%d]", err); + /* if some pclusters are ready, need submit them anyway */ + err = z_erofs_runqueue(&f, 0) ?: err; + if (err && err != -EINTR) + erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", + err, folio->index, EROFS_I(inode)->nid); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); return err; } static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL, *head = NULL, *page; - unsigned int nr_pages; - - f.readahead = true; - f.headoffset = readahead_pos(rac); - - z_erofs_pcluster_readmore(&f, rac, f.headoffset + - readahead_length(rac) - 1, &pagepool, true); - nr_pages = readahead_count(rac); - trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); - - while ((page = readahead_page(rac))) { - set_page_private(page, (unsigned long)head); - head = page; + Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac)); + unsigned int nrpages = readahead_count(rac); + struct folio *head = NULL, *folio; + int err; + + trace_erofs_readahead(inode, readahead_index(rac), nrpages, false); + z_erofs_pcluster_readmore(&f, rac, true); + while ((folio = readahead_folio(rac))) { + folio->private = head; + head = folio; } + /* traverse in reverse order for best metadata I/O performance */ while (head) { - struct page *page = head; - int err; - - /* traversal in reverse order */ - head = (void *)page_private(page); - - err = z_erofs_do_read_page(&f, page, &pagepool); - if (err) - erofs_err(inode->i_sb, - "readahead error at page %lu @ nid %llu", - page->index, EROFS_I(inode)->nid); - put_page(page); + folio = head; + head = folio_get_private(folio); + + err = z_erofs_scan_folio(&f, folio, true); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(inode)->nid); } - z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_readmore(&f, rac, false); + z_erofs_pcluster_end(&f); - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, nr_pages)); + (void)z_erofs_runqueue(&f, nrpages); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); } const struct address_space_operations z_erofs_aops = { diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h deleted file mode 100644 index d98c95212985..000000000000 --- a/fs/erofs/zdata.h +++ /dev/null @@ -1,178 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. - * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZDATA_H -#define __EROFS_FS_ZDATA_H - -#include "internal.h" -#include "tagptr.h" - -#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_INLINE_BVECS 2 - -/* - * let's leave a type here in case of introducing - * another tagged pointer later. - */ -typedef void *z_erofs_next_pcluster_t; - -struct z_erofs_bvec { - struct page *page; - int offset; - unsigned int end; -}; - -#define __Z_EROFS_BVSET(name, total) \ -struct name { \ - /* point to the next page which contains the following bvecs */ \ - struct page *nextpage; \ - struct z_erofs_bvec bvec[total]; \ -} -__Z_EROFS_BVSET(z_erofs_bvset,); -__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); - -/* - * Structure fields follow one of the following exclusion rules. - * - * I: Modifiable by initialization/destruction paths and read-only - * for everyone else; - * - * L: Field should be protected by the pcluster lock; - * - * A: Field should be accessed / updated in atomic for parallelized code. - */ -struct z_erofs_pcluster { - struct erofs_workgroup obj; - struct mutex lock; - - /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; - - /* L: the maximum decompression size of this round */ - unsigned int length; - - /* L: total number of bvecs */ - unsigned int vcnt; - - /* I: page offset of start position of decompression */ - unsigned short pageofs_out; - - /* I: page offset of inline compressed data */ - unsigned short pageofs_in; - - union { - /* L: inline a certain number of bvec for bootstrap */ - struct z_erofs_bvset_inline bvset; - - /* I: can be used to free the pcluster by RCU. */ - struct rcu_head rcu; - }; - - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - - /* I: compression algorithm format */ - unsigned char algorithmformat; - - /* L: whether partial decompression or not */ - bool partial; - - /* L: indicate several pageofs_outs or not */ - bool multibases; - - /* A: compressed bvecs (can be cached or inplaced pages) */ - struct z_erofs_bvec compressed_bvecs[]; -}; - -/* let's avoid the valid 32-bit kernel addresses */ - -/* the chained workgroup has't submitted io (still open) */ -#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) -/* the chained workgroup has already submitted io */ -#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) - -#define Z_EROFS_PCLUSTER_NIL (NULL) - -struct z_erofs_decompressqueue { - struct super_block *sb; - atomic_t pending_bios; - z_erofs_next_pcluster_t head; - - union { - struct completion done; - struct work_struct work; - } u; - - bool eio; -}; - -static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) -{ - return !pcl->obj.index; -} - -static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) -{ - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; -} - -/* - * bit 30: I/O error occurred on this page - * bit 0 - 29: remaining parts to complete this page - */ -#define Z_EROFS_PAGE_EIO (1 << 30) - -static inline void z_erofs_onlinepage_init(struct page *page) -{ - union { - atomic_t o; - unsigned long v; - } u = { .o = ATOMIC_INIT(1) }; - - set_page_private(page, u.v); - smp_wmb(); - SetPagePrivate(page); -} - -static inline void z_erofs_onlinepage_split(struct page *page) -{ - atomic_inc((atomic_t *)&page->private); -} - -static inline void z_erofs_page_mark_eio(struct page *page) -{ - int orig; - - do { - orig = atomic_read((atomic_t *)&page->private); - } while (atomic_cmpxchg((atomic_t *)&page->private, orig, - orig | Z_EROFS_PAGE_EIO) != orig); -} - -static inline void z_erofs_onlinepage_endio(struct page *page) -{ - unsigned int v; - - DBG_BUGON(!PagePrivate(page)); - v = atomic_dec_return((atomic_t *)&page->private); - if (!(v & ~Z_EROFS_PAGE_EIO)) { - set_page_private(page, 0); - ClearPagePrivate(page); - if (!(v & Z_EROFS_PAGE_EIO)) - SetPageUptodate(page); - unlock_page(page); - } -} - -#define Z_EROFS_ONSTACK_PAGES 32 - -#endif diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 98fb90b9af71..c8d8e129eb4b 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -4,156 +4,12 @@ * https://www.huawei.com/ */ #include "internal.h" -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <trace/events/erofs.h> -static int z_erofs_do_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, - int flags); - -int z_erofs_fill_inode(struct inode *inode) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - - if (!erofs_sb_has_big_pcluster(sbi) && - !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) && - vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { - vi->z_advise = 0; - vi->z_algorithmtype[0] = 0; - vi->z_algorithmtype[1] = 0; - vi->z_logical_clusterbits = LOG_BLOCK_SIZE; - set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); - } - inode->i_mapping->a_ops = &z_erofs_aops; - return 0; -} - -static int z_erofs_fill_inode_lazy(struct inode *inode) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = inode->i_sb; - int err, headnr; - erofs_off_t pos; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - void *kaddr; - struct z_erofs_map_header *h; - - if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { - /* - * paired with smp_mb() at the end of the function to ensure - * fields will only be observed after the bit is set. - */ - smp_mb(); - return 0; - } - - if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) - return -ERESTARTSYS; - - err = 0; - if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) - goto out_unlock; - - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + - vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); - if (IS_ERR(kaddr)) { - err = PTR_ERR(kaddr); - goto out_unlock; - } - - h = kaddr + erofs_blkoff(pos); - /* - * if the highest bit of the 8-byte map header is set, the whole file - * is stored in the packed inode. The rest bits keeps z_fragmentoff. - */ - if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { - vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; - vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); - vi->z_tailextent_headlcn = 0; - goto done; - } - vi->z_advise = le16_to_cpu(h->h_advise); - vi->z_algorithmtype[0] = h->h_algorithmtype & 15; - vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; - - headnr = 0; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || - vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", - headnr + 1, vi->z_algorithmtype[headnr], vi->nid); - err = -EOPNOTSUPP; - goto out_put_metabuf; - } - - vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); - if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && - vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | - Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { - erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", - vi->nid); - err = -EFSCORRUPTED; - goto out_put_metabuf; - } - if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { - erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", - vi->nid); - err = -EFSCORRUPTED; - goto out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_idata_size = le16_to_cpu(h->h_idata_size); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - - if (!map.m_plen || - erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) { - erofs_err(sb, "invalid tail-packing pclustersize %llu", - map.m_plen); - err = -EFSCORRUPTED; - } - if (err < 0) - goto out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && - !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - if (err < 0) - goto out_put_metabuf; - } -done: - /* paired with smp_mb() at the beginning of the function */ - smp_mb(); - set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); -out_put_metabuf: - erofs_put_metabuf(&buf); -out_unlock: - clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); - return err; -} - struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; - void *kaddr; - unsigned long lcn; /* compression extent information gathered */ u8 type, headtype; @@ -161,148 +17,159 @@ struct z_erofs_maprecorder { u16 delta[2]; erofs_blk_t pblk, compressedblks; erofs_off_t nextpackoff; - bool partialref; + bool partialref, in_mbox; }; -static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn) +static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid); - const erofs_off_t pos = - Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize + - vi->xattr_isize) + - lcn * sizeof(struct z_erofs_vle_decompressed_index); - struct z_erofs_vle_decompressed_index *di; - unsigned int advise, type; - - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); - - m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index); + const erofs_off_t pos = Z_EROFS_FULL_INDEX_START(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize) + + lcn * sizeof(struct z_erofs_lcluster_index); + struct z_erofs_lcluster_index *di; + unsigned int advise; + + di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox); + if (IS_ERR(di)) + return PTR_ERR(di); m->lcn = lcn; - di = m->kaddr + erofs_blkoff(pos); + m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); advise = le16_to_cpu(di->di_advise); - type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) & - ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1); - switch (type) { - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: - m->clusterofs = 1 << vi->z_logical_clusterbits; + m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + m->clusterofs = 1 << vi->z_lclusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); - if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { DBG_BUGON(1); return -EFSCORRUPTED; } - m->compressedblks = m->delta[0] & - ~Z_EROFS_VLE_DI_D0_CBLKCNT; + m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); - break; - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: - if (advise & Z_EROFS_VLE_DI_PARTIAL_REF) - m->partialref = true; + } else { + m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF); m->clusterofs = le16_to_cpu(di->di_clusterofs); m->pblk = le32_to_cpu(di->di_u.blkaddr); - break; - default: - DBG_BUGON(1); - return -EOPNOTSUPP; } - m->type = type; return 0; } static unsigned int decode_compactedbits(unsigned int lobits, - unsigned int lomask, u8 *in, unsigned int pos, u8 *type) { const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); - const unsigned int lo = v & lomask; + const unsigned int lo = v & ((1 << lobits) - 1); *type = (v >> lobits) & 3; return lo; } -static int get_compacted_la_distance(unsigned int lclusterbits, +static int get_compacted_la_distance(unsigned int lobits, unsigned int encodebits, unsigned int vcnt, u8 *in, int i) { - const unsigned int lomask = (1 << lclusterbits) - 1; unsigned int lo, d1 = 0; u8 type; DBG_BUGON(i >= vcnt); do { - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); - if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) return d1; ++d1; } while (++i < vcnt); - /* vcnt - 1 (Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) item */ - if (!(lo & Z_EROFS_VLE_DI_D0_CBLKCNT)) + /* vcnt - 1 (Z_EROFS_LCLUSTER_TYPE_NONHEAD) item */ + if (!(lo & Z_EROFS_LI_D0_CBLKCNT)) d1 += lo - 1; return d1; } -static int unpack_compacted_index(struct z_erofs_maprecorder *m, - unsigned int amortizedshift, - erofs_off_t pos, bool lookahead) +static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn, bool lookahead) { - struct erofs_inode *const vi = EROFS_I(m->inode); - const unsigned int lclusterbits = vi->z_logical_clusterbits; - const unsigned int lomask = (1 << lclusterbits) - 1; - unsigned int vcnt, base, lo, encodebits, nblk, eofs; - int i; + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const erofs_off_t ebase = Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize); + const unsigned int lclusterbits = vi->z_lclusterbits; + const unsigned int totalidx = erofs_iblks(inode); + unsigned int compacted_4b_initial, compacted_2b, amortizedshift; + unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; + bool big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + erofs_off_t pos; u8 *in, type; - bool big_pcluster; + int i; - if (1 << amortizedshift == 4) + if (lcn >= totalidx || lclusterbits > 14) + return -EINVAL; + + m->lcn = lcn; + /* used to align to 32-byte (compacted_2b) alignment */ + compacted_4b_initial = ((32 - ebase % 32) / 4) & 7; + compacted_2b = 0; + if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && + compacted_4b_initial < totalidx) + compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); + + pos = ebase; + amortizedshift = 2; /* compact_4b */ + if (lcn >= compacted_4b_initial) { + pos += compacted_4b_initial * 4; + lcn -= compacted_4b_initial; + if (lcn < compacted_2b) { + amortizedshift = 1; + } else { + pos += compacted_2b * 2; + lcn -= compacted_2b; + } + } + pos += lcn * (1 << amortizedshift); + + /* figure out the lcluster count in this pack */ + if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; - else if (1 << amortizedshift == 2 && lclusterbits == 12) + else if (1 << amortizedshift == 2 && lclusterbits <= 12) vcnt = 16; else return -EOPNOTSUPP; + in = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox); + if (IS_ERR(in)) + return PTR_ERR(in); + /* it doesn't equal to round_up(..) */ m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); - big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; - eofs = erofs_blkoff(pos); - base = round_down(eofs, vcnt << amortizedshift); - in = m->kaddr + base; + bytes = pos & ((vcnt << amortizedshift) - 1); + in -= bytes; + i = bytes >> amortizedshift; - i = (eofs - base) >> amortizedshift; - - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); m->type = type; - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; /* figure out lookahead_distance: delta[1] if needed */ if (lookahead) - m->delta[1] = get_compacted_la_distance(lclusterbits, + m->delta[1] = get_compacted_la_distance(lobits, encodebits, vcnt, in, i); - if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (lo & Z_EROFS_LI_D0_CBLKCNT) { if (!big_pcluster) { DBG_BUGON(1); return -EFSCORRUPTED; } - m->compressedblks = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + m->compressedblks = lo & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; return 0; } else if (i + 1 != (int)vcnt) { @@ -314,11 +181,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, * of which lo saves delta[1] rather than delta[0]. * Hence, get delta[0] by the previous lcluster indirectly. */ - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * (i - 1), &type); - if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + lo = decode_compactedbits(lobits, in, + encodebits * (i - 1), &type); + if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) lo = 0; - else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) + else if (lo & Z_EROFS_LI_D0_CBLKCNT) lo = 1; m->delta[0] = lo + 1; return 0; @@ -330,9 +197,9 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 1; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) i -= lo; if (i >= 0) @@ -342,12 +209,12 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 0; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { - if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + if (lo & Z_EROFS_LI_D0_CBLKCNT) { --i; - nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + nblk += lo & ~Z_EROFS_LI_D0_CBLKCNT; continue; } /* bigpcluster shouldn't have plain d0 == 1 */ @@ -366,119 +233,61 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, return 0; } -static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn, bool lookahead) +static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, + unsigned int lcn, bool lookahead) { - struct inode *const inode = m->inode; - struct erofs_inode *const vi = EROFS_I(inode); - const unsigned int lclusterbits = vi->z_logical_clusterbits; - const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) + - vi->inode_isize + vi->xattr_isize, 8) + - sizeof(struct z_erofs_map_header); - const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); - unsigned int compacted_4b_initial, compacted_2b; - unsigned int amortizedshift; - erofs_off_t pos; - - if (lclusterbits != 12) - return -EOPNOTSUPP; - - if (lcn >= totalidx) - return -EINVAL; - - m->lcn = lcn; - /* used to align to 32-byte (compacted_2b) alignment */ - compacted_4b_initial = (32 - ebase % 32) / 4; - if (compacted_4b_initial == 32 / 4) - compacted_4b_initial = 0; - - if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && - compacted_4b_initial < totalidx) - compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); - else - compacted_2b = 0; + struct erofs_inode *vi = EROFS_I(m->inode); + int err; - pos = ebase; - if (lcn < compacted_4b_initial) { - amortizedshift = 2; - goto out; + if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT) { + err = z_erofs_load_compact_lcluster(m, lcn, lookahead); + } else { + DBG_BUGON(vi->datalayout != EROFS_INODE_COMPRESSED_FULL); + err = z_erofs_load_full_lcluster(m, lcn); } - pos += compacted_4b_initial * 4; - lcn -= compacted_4b_initial; + if (err) + return err; - if (lcn < compacted_2b) { - amortizedshift = 1; - goto out; + if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) { + erofs_err(m->inode->i_sb, "unknown type %u @ lcn %u of nid %llu", + m->type, lcn, EROFS_I(m->inode)->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } else if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD && + m->clusterofs >= (1 << vi->z_lclusterbits)) { + DBG_BUGON(1); + return -EFSCORRUPTED; } - pos += compacted_2b * 2; - lcn -= compacted_2b; - amortizedshift = 2; -out: - pos += lcn * (1 << amortizedshift); - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); - return unpack_compacted_index(m, amortizedshift, pos, lookahead); -} - -static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned int lcn, bool lookahead) -{ - const unsigned int datamode = EROFS_I(m->inode)->datalayout; - - if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) - return legacy_load_cluster_from_disk(m, lcn); - - if (datamode == EROFS_INODE_FLAT_COMPRESSION) - return compacted_load_cluster_from_disk(m, lcn, lookahead); - - return -EINVAL; + return 0; } static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, unsigned int lookback_distance) { + struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); - const unsigned int lclusterbits = vi->z_logical_clusterbits; + const unsigned int lclusterbits = vi->z_lclusterbits; while (m->lcn >= lookback_distance) { unsigned long lcn = m->lcn - lookback_distance; int err; - /* load extent head logical cluster if needed */ - err = z_erofs_load_cluster_from_disk(m, lcn, false); + if (!lookback_distance) + break; + + err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; - - switch (m->type) { - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: - if (!m->delta[0]) { - erofs_err(m->inode->i_sb, - "invalid lookback distance 0 @ nid %llu", - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { lookback_distance = m->delta[0]; continue; - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: - m->headtype = m->type; - m->map->m_la = (lcn << lclusterbits) | m->clusterofs; - return 0; - default: - erofs_err(m->inode->i_sb, - "unknown type %u @ lcn %lu of nid %llu", - m->type, lcn, vi->nid); - DBG_BUGON(1); - return -EOPNOTSUPP; } + m->headtype = m->type; + m->map->m_la = (lcn << lclusterbits) | m->clusterofs; + return 0; } - - erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu", - vi->nid); + erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", + lookback_distance, m->lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } @@ -486,30 +295,27 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, unsigned int initial_lcn) { - struct erofs_inode *const vi = EROFS_I(m->inode); - struct erofs_map_blocks *const map = m->map; - const unsigned int lclusterbits = vi->z_logical_clusterbits; - unsigned long lcn; + struct inode *inode = m->inode; + struct super_block *sb = inode->i_sb; + struct erofs_inode *vi = EROFS_I(inode); + bool bigpcl1 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + bool bigpcl2 = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2; + unsigned long lcn = m->lcn + 1; int err; - DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && - m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 && - m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2); + DBG_BUGON(m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); DBG_BUGON(m->type != m->headtype); - if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) && - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || - ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) && - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { - map->m_plen = 1ULL << lclusterbits; - return 0; - } - lcn = m->lcn + 1; + if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) || + ((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || + m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) || + (lcn << vi->z_lclusterbits) >= inode->i_size) + m->compressedblks = 1; + if (m->compressedblks) goto out; - err = z_erofs_load_cluster_from_disk(m, lcn, false); + err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; @@ -522,40 +328,23 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, * BUG_ON in the debugging mode only for developers to notice that. */ DBG_BUGON(lcn == initial_lcn && - m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD); + m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); - switch (m->type) { - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: - /* - * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type - * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. - */ - m->compressedblks = 1 << (lclusterbits - LOG_BLOCK_SIZE); - break; - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: - if (m->delta[0] != 1) - goto err_bonus_cblkcnt; - if (m->compressedblks) - break; - fallthrough; - default: - erofs_err(m->inode->i_sb, - "cannot found CBLKCNT @ lcn %lu of nid %llu", - lcn, vi->nid); + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD && m->delta[0] != 1) { + erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } + + /* + * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type rather + * than CBLKCNT, it's a 1 block-sized pcluster. + */ + if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD || !m->compressedblks) + m->compressedblks = 1; out: - map->m_plen = (u64)m->compressedblks << LOG_BLOCK_SIZE; + m->map->m_plen = erofs_pos(sb, m->compressedblks); return 0; -err_bonus_cblkcnt: - erofs_err(m->inode->i_sb, - "bogus CBLKCNT @ lcn %lu of nid %llu", - lcn, vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; } static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) @@ -563,115 +352,96 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) struct inode *inode = m->inode; struct erofs_inode *vi = EROFS_I(inode); struct erofs_map_blocks *map = m->map; - unsigned int lclusterbits = vi->z_logical_clusterbits; + unsigned int lclusterbits = vi->z_lclusterbits; u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits; int err; - do { + while (1) { /* handle the last EOF pcluster (no next HEAD lcluster) */ if ((lcn << lclusterbits) >= inode->i_size) { map->m_llen = inode->i_size - map->m_la; return 0; } - err = z_erofs_load_cluster_from_disk(m, lcn, true); + err = z_erofs_load_lcluster_from_disk(m, lcn, true); if (err) return err; - if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { - DBG_BUGON(!m->delta[1] && - m->clusterofs != 1 << lclusterbits); - } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 || - m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { - /* go on until the next HEAD lcluster */ + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + /* work around invalid d1 generated by pre-1.0 mkfs */ + if (unlikely(!m->delta[1])) { + m->delta[1] = 1; + DBG_BUGON(1); + } + } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) { if (lcn != headlcn) - break; + break; /* ends at the next HEAD lcluster */ m->delta[1] = 1; - } else { - erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu", - m->type, lcn, vi->nid); - DBG_BUGON(1); - return -EOPNOTSUPP; } lcn += m->delta[1]; - } while (m->delta[1]); - + } map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la; return 0; } -static int z_erofs_do_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, - int flags) +static int z_erofs_map_blocks_fo(struct inode *inode, + struct erofs_map_blocks *map, int flags) { - struct erofs_inode *const vi = EROFS_I(inode); - bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; + struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + bool ztailpacking = vi->z_idata_size; + unsigned int lclusterbits = vi->z_lclusterbits; struct z_erofs_maprecorder m = { .inode = inode, .map = map, + .in_mbox = erofs_inode_in_metabox(inode), }; - int err = 0; - unsigned int lclusterbits, endoff; + unsigned int endoff; unsigned long initial_lcn; unsigned long long ofs, end; + int err; - lclusterbits = vi->z_logical_clusterbits; ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la; + if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_FRAGMENT; + return 0; + } initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); - err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false); + err = z_erofs_load_lcluster_from_disk(&m, initial_lcn, false); if (err) goto unmap_out; - if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL)) - vi->z_idataoff = m.nextpackoff; - + if ((flags & EROFS_GET_BLOCKS_FINDTAIL) && ztailpacking) + vi->z_fragmentoff = m.nextpackoff; map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; - switch (m.type) { - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: - if (endoff >= m.clusterofs) { - m.headtype = m.type; - map->m_la = (m.lcn << lclusterbits) | m.clusterofs; - /* - * For ztailpacking files, in order to inline data more - * effectively, special EOF lclusters are now supported - * which can have three parts at most. - */ - if (ztailpacking && end > inode->i_size) - end = inode->i_size; - break; - } - /* m.lcn should be >= 1 if endoff < m.clusterofs */ - if (!m.lcn) { - erofs_err(inode->i_sb, - "invalid logical cluster 0 at nid %llu", - vi->nid); - err = -EFSCORRUPTED; - goto unmap_out; + if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD && endoff >= m.clusterofs) { + m.headtype = m.type; + map->m_la = (m.lcn << lclusterbits) | m.clusterofs; + /* + * For ztailpacking files, in order to inline data more + * effectively, special EOF lclusters are now supported + * which can have three parts at most. + */ + if (ztailpacking && end > inode->i_size) + end = inode->i_size; + } else { + if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + end = (m.lcn << lclusterbits) | m.clusterofs; + map->m_flags |= EROFS_MAP_FULL_MAPPED; + m.delta[0] = 1; } - end = (m.lcn << lclusterbits) | m.clusterofs; - map->m_flags |= EROFS_MAP_FULL_MAPPED; - m.delta[0] = 1; - fallthrough; - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: /* get the corresponding first chunk */ err = z_erofs_extent_lookback(&m, m.delta[0]); if (err) goto unmap_out; - break; - default: - erofs_err(inode->i_sb, - "unknown type %u @ offset %llu of nid %llu", - m.type, ofs, vi->nid); - err = -EOPNOTSUPP; - goto unmap_out; } if (m.partialref) map->m_flags |= EROFS_MAP_PARTIAL_REF; @@ -680,36 +450,39 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (flags & EROFS_GET_BLOCKS_FINDTAIL) { vi->z_tailextent_headlcn = m.lcn; /* for non-compact indexes, fragmentoff is 64 bits */ - if (fragment && - vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + if (fragment && vi->datalayout == EROFS_INODE_COMPRESSED_FULL) vi->z_fragmentoff |= (u64)m.pblk << 32; } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_META; - map->m_pa = vi->z_idataoff; + map->m_pa = vi->z_fragmentoff; map->m_plen = vi->z_idata_size; + if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { + erofs_err(sb, "ztailpacking inline data across blocks @ nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto unmap_out; + } } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { - map->m_flags |= EROFS_MAP_FRAGMENT; + map->m_flags = EROFS_MAP_FRAGMENT; } else { - map->m_pa = blknr_to_addr(m.pblk); + map->m_pa = erofs_pos(sb, m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto unmap_out; } - if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { + if (m.headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN) { if (map->m_llen > map->m_plen) { DBG_BUGON(1); err = -EFSCORRUPTED; goto unmap_out; } if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) - map->m_algorithmformat = - Z_EROFS_COMPRESSION_INTERLACED; + map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED; else - map->m_algorithmformat = - Z_EROFS_COMPRESSION_SHIFTED; - } else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { + map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { map->m_algorithmformat = vi->z_algorithmtype[1]; } else { map->m_algorithmformat = vi->z_algorithmtype[0]; @@ -717,8 +490,10 @@ static int z_erofs_do_map_blocks(struct inode *inode, if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && - map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && - map->m_llen >= EROFS_BLKSIZ)) { + (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA || + map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE || + map->m_algorithmformat == Z_EROFS_COMPRESSION_ZSTD) && + map->m_llen >= i_blocksize(inode))) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; @@ -726,47 +501,267 @@ static int z_erofs_do_map_blocks(struct inode *inode, unmap_out: erofs_unmap_metabuf(&m.map->buf); - erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", - __func__, map->m_la, map->m_pa, - map->m_llen, map->m_plen, map->m_flags); return err; } -int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, - int flags) +static int z_erofs_map_blocks_ext(struct inode *inode, + struct erofs_map_blocks *map, int flags) +{ + struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; + bool interlaced = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER; + unsigned int recsz = z_erofs_extent_recsize(vi->z_advise); + erofs_off_t pos = round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize), recsz); + bool in_mbox = erofs_inode_in_metabox(inode); + erofs_off_t lend = inode->i_size; + erofs_off_t l, r, mid, pa, la, lstart; + struct z_erofs_extent *ext; + unsigned int fmt; + bool last; + + map->m_flags = 0; + if (recsz <= offsetof(struct z_erofs_extent, pstart_hi)) { + if (recsz <= offsetof(struct z_erofs_extent, pstart_lo)) { + ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox); + if (IS_ERR(ext)) + return PTR_ERR(ext); + pa = le64_to_cpu(*(__le64 *)ext); + pos += sizeof(__le64); + lstart = 0; + } else { + lstart = round_down(map->m_la, 1 << vi->z_lclusterbits); + pos += (lstart >> vi->z_lclusterbits) * recsz; + pa = EROFS_NULL_ADDR; + } + + for (; lstart <= map->m_la; lstart += 1 << vi->z_lclusterbits) { + ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox); + if (IS_ERR(ext)) + return PTR_ERR(ext); + map->m_plen = le32_to_cpu(ext->plen); + if (pa != EROFS_NULL_ADDR) { + map->m_pa = pa; + pa += map->m_plen & Z_EROFS_EXTENT_PLEN_MASK; + } else { + map->m_pa = le32_to_cpu(ext->pstart_lo); + } + pos += recsz; + } + last = (lstart >= round_up(lend, 1 << vi->z_lclusterbits)); + lend = min(lstart, lend); + lstart -= 1 << vi->z_lclusterbits; + } else { + lstart = lend; + for (l = 0, r = vi->z_extents; l < r; ) { + mid = l + (r - l) / 2; + ext = erofs_read_metabuf(&map->buf, sb, + pos + mid * recsz, in_mbox); + if (IS_ERR(ext)) + return PTR_ERR(ext); + + la = le32_to_cpu(ext->lstart_lo); + pa = le32_to_cpu(ext->pstart_lo) | + (u64)le32_to_cpu(ext->pstart_hi) << 32; + if (recsz > offsetof(struct z_erofs_extent, lstart_hi)) + la |= (u64)le32_to_cpu(ext->lstart_hi) << 32; + + if (la > map->m_la) { + r = mid; + if (la > lend) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + lend = la; + } else { + l = mid + 1; + if (map->m_la == la) + r = min(l + 1, r); + lstart = la; + map->m_plen = le32_to_cpu(ext->plen); + map->m_pa = pa; + } + } + last = (l >= vi->z_extents); + } + + if (lstart < lend) { + map->m_la = lstart; + if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { + map->m_flags = EROFS_MAP_FRAGMENT; + vi->z_fragmentoff = map->m_plen; + if (recsz > offsetof(struct z_erofs_extent, pstart_lo)) + vi->z_fragmentoff |= map->m_pa << 32; + } else if (map->m_plen & Z_EROFS_EXTENT_PLEN_MASK) { + map->m_flags |= EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED; + fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT; + if (fmt) + map->m_algorithmformat = fmt - 1; + else if (interlaced && !erofs_blkoff(sb, map->m_pa)) + map->m_algorithmformat = + Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = + Z_EROFS_COMPRESSION_SHIFTED; + if (map->m_plen & Z_EROFS_EXTENT_PLEN_PARTIAL) + map->m_flags |= EROFS_MAP_PARTIAL_REF; + map->m_plen &= Z_EROFS_EXTENT_PLEN_MASK; + } + } + map->m_llen = lend - map->m_la; + return 0; +} + +static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) { struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + struct z_erofs_map_header *h; + erofs_off_t pos; int err = 0; - trace_z_erofs_map_blocks_iter_enter(inode, map, flags); + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); + return 0; + } + + if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) + return -ERESTARTSYS; - /* when trying to read beyond EOF, leave it unmapped */ - if (map->m_la >= inode->i_size) { - map->m_llen = map->m_la + 1 - inode->i_size; - map->m_la = inode->i_size; - map->m_flags = 0; - goto out; + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + goto out_unlock; + + pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); + h = erofs_read_metabuf(&map->buf, sb, pos, erofs_inode_in_metabox(inode)); + if (IS_ERR(h)) { + err = PTR_ERR(h); + goto out_unlock; } - err = z_erofs_fill_inode_lazy(inode); - if (err) - goto out; + /* + * if the highest bit of the 8-byte map header is set, the whole file + * is stored in the packed inode. The rest bits keeps z_fragmentoff. + */ + if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { + vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); + vi->z_tailextent_headlcn = 0; + goto done; + } + vi->z_advise = le16_to_cpu(h->h_advise); + vi->z_lclusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 15); + if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL && + (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) { + vi->z_extents = le32_to_cpu(h->h_extents_lo) | + ((u64)le16_to_cpu(h->h_extents_hi) << 32); + goto done; + } - if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && - !vi->z_tailextent_headlcn) { - map->m_la = 0; - map->m_llen = inode->i_size; - map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | - EROFS_MAP_FRAGMENT; - goto out; + vi->z_algorithmtype[0] = h->h_algorithmtype & 15; + vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) + vi->z_idata_size = le16_to_cpu(h->h_idata_size); + + if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto out_unlock; + } + if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto out_unlock; } - err = z_erofs_do_map_blocks(inode, map, flags); -out: - trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); + if (vi->z_idata_size || + (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { + struct erofs_map_blocks tm = { + .buf = __EROFS_BUF_INITIALIZER + }; - /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */ - DBG_BUGON(err < 0 && err != -ENOMEM); + err = z_erofs_map_blocks_fo(inode, &tm, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&tm.buf); + if (err < 0) + goto out_unlock; + } +done: + /* paired with smp_mb() at the beginning of the function */ + smp_mb(); + set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); +out_unlock: + clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); + return err; +} + +static int z_erofs_map_sanity_check(struct inode *inode, + struct erofs_map_blocks *map) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + u64 pend; + + if (!(map->m_flags & EROFS_MAP_ENCODED)) + return 0; + if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) { + erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel", + map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid); + return -EOPNOTSUPP; + } + if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX && + !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) { + erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + map->m_algorithmformat, EROFS_I(inode)->nid); + return -EFSCORRUPTED; + } + if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || + map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) + return -EOPNOTSUPP; + /* Filesystems beyond 48-bit physical block addresses are invalid */ + if (unlikely(check_add_overflow(map->m_pa, map->m_plen, &pend) || + (pend >> sbi->blkszbits) >= BIT_ULL(48))) + return -EFSCORRUPTED; + return 0; +} + +int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, + int flags) +{ + struct erofs_inode *const vi = EROFS_I(inode); + int err = 0; + + trace_erofs_map_blocks_enter(inode, map, flags); + if (map->m_la >= inode->i_size) { /* post-EOF unmapped extent */ + map->m_llen = map->m_la + 1 - inode->i_size; + map->m_la = inode->i_size; + map->m_flags = 0; + } else { + err = z_erofs_fill_inode(inode, map); + if (!err) { + if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL && + (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) + err = z_erofs_map_blocks_ext(inode, map, flags); + else + err = z_erofs_map_blocks_fo(inode, map, flags); + } + if (!err) + err = z_erofs_map_sanity_check(inode, map); + if (err) + map->m_llen = 0; + } + trace_erofs_map_blocks_exit(inode, map, flags, err); return err; } @@ -787,7 +782,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, iomap->length = map.m_llen; if (map.m_flags & EROFS_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ? + iomap->addr = map.m_flags & __EROFS_MAP_FRAGMENT ? IOMAP_NULL_ADDR : map.m_pa; } else { iomap->type = IOMAP_HOLE; diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c new file mode 100644 index 000000000000..55ff2ab5128e --- /dev/null +++ b/fs/erofs/zutil.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. + * https://www.huawei.com/ + * Copyright (C) 2024 Alibaba Cloud + */ +#include "internal.h" + +struct z_erofs_gbuf { + spinlock_t lock; + void *ptr; + struct page **pages; + unsigned int nrpages; +}; + +static struct z_erofs_gbuf *z_erofs_gbufpool, *z_erofs_rsvbuf; +static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages, + z_erofs_rsv_nrpages; + +module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444); +module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444); + +atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */ + +/* protects `erofs_sb_list_lock` and the mounted `erofs_sb_list` */ +static DEFINE_SPINLOCK(erofs_sb_list_lock); +static LIST_HEAD(erofs_sb_list); +static unsigned int shrinker_run_no; +static struct shrinker *erofs_shrinker_info; + +static unsigned int z_erofs_gbuf_id(void) +{ + return raw_smp_processor_id() % z_erofs_gbuf_count; +} + +void *z_erofs_get_gbuf(unsigned int requiredpages) + __acquires(gbuf->lock) +{ + struct z_erofs_gbuf *gbuf; + + migrate_disable(); + gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()]; + spin_lock(&gbuf->lock); + /* check if the buffer is too small */ + if (requiredpages > gbuf->nrpages) { + spin_unlock(&gbuf->lock); + migrate_enable(); + /* (for sparse checker) pretend gbuf->lock is still taken */ + __acquire(gbuf->lock); + return NULL; + } + return gbuf->ptr; +} + +void z_erofs_put_gbuf(void *ptr) __releases(gbuf->lock) +{ + struct z_erofs_gbuf *gbuf; + + gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()]; + DBG_BUGON(gbuf->ptr != ptr); + spin_unlock(&gbuf->lock); + migrate_enable(); +} + +int z_erofs_gbuf_growsize(unsigned int nrpages) +{ + static DEFINE_MUTEX(gbuf_resize_mutex); + struct page **tmp_pages = NULL; + struct z_erofs_gbuf *gbuf; + void *ptr, *old_ptr; + int last, i, j; + + mutex_lock(&gbuf_resize_mutex); + /* avoid shrinking gbufs, since no idea how many fses rely on */ + if (nrpages <= z_erofs_gbuf_nrpages) { + mutex_unlock(&gbuf_resize_mutex); + return 0; + } + + for (i = 0; i < z_erofs_gbuf_count; ++i) { + gbuf = &z_erofs_gbufpool[i]; + tmp_pages = kcalloc(nrpages, sizeof(*tmp_pages), GFP_KERNEL); + if (!tmp_pages) + goto out; + + for (j = 0; j < gbuf->nrpages; ++j) + tmp_pages[j] = gbuf->pages[j]; + do { + last = j; + j = alloc_pages_bulk(GFP_KERNEL, nrpages, + tmp_pages); + if (last == j) + goto out; + } while (j != nrpages); + + ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL); + if (!ptr) + goto out; + + spin_lock(&gbuf->lock); + kfree(gbuf->pages); + gbuf->pages = tmp_pages; + old_ptr = gbuf->ptr; + gbuf->ptr = ptr; + gbuf->nrpages = nrpages; + spin_unlock(&gbuf->lock); + if (old_ptr) + vunmap(old_ptr); + } + z_erofs_gbuf_nrpages = nrpages; +out: + if (i < z_erofs_gbuf_count && tmp_pages) { + for (j = 0; j < nrpages; ++j) + if (tmp_pages[j] && (j >= gbuf->nrpages || + tmp_pages[j] != gbuf->pages[j])) + __free_page(tmp_pages[j]); + kfree(tmp_pages); + } + mutex_unlock(&gbuf_resize_mutex); + return i < z_erofs_gbuf_count ? -ENOMEM : 0; +} + +int __init z_erofs_gbuf_init(void) +{ + unsigned int i, total = num_possible_cpus(); + + if (z_erofs_gbuf_count) + total = min(z_erofs_gbuf_count, total); + z_erofs_gbuf_count = total; + + /* The last (special) global buffer is the reserved buffer */ + total += !!z_erofs_rsv_nrpages; + + z_erofs_gbufpool = kcalloc(total, sizeof(*z_erofs_gbufpool), + GFP_KERNEL); + if (!z_erofs_gbufpool) + return -ENOMEM; + + if (z_erofs_rsv_nrpages) { + z_erofs_rsvbuf = &z_erofs_gbufpool[total - 1]; + z_erofs_rsvbuf->pages = kcalloc(z_erofs_rsv_nrpages, + sizeof(*z_erofs_rsvbuf->pages), GFP_KERNEL); + if (!z_erofs_rsvbuf->pages) { + z_erofs_rsvbuf = NULL; + z_erofs_rsv_nrpages = 0; + } + } + for (i = 0; i < total; ++i) + spin_lock_init(&z_erofs_gbufpool[i].lock); + return 0; +} + +void z_erofs_gbuf_exit(void) +{ + int i, j; + + for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) { + struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i]; + + if (gbuf->ptr) { + vunmap(gbuf->ptr); + gbuf->ptr = NULL; + } + + if (!gbuf->pages) + continue; + + for (j = 0; j < gbuf->nrpages; ++j) + if (gbuf->pages[j]) + put_page(gbuf->pages[j]); + kfree(gbuf->pages); + gbuf->pages = NULL; + } + kfree(z_erofs_gbufpool); +} + +struct page *__erofs_allocpage(struct page **pagepool, gfp_t gfp, bool tryrsv) +{ + struct page *page = *pagepool; + + if (page) { + *pagepool = (struct page *)page_private(page); + } else if (tryrsv && z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages) { + spin_lock(&z_erofs_rsvbuf->lock); + if (z_erofs_rsvbuf->nrpages) + page = z_erofs_rsvbuf->pages[--z_erofs_rsvbuf->nrpages]; + spin_unlock(&z_erofs_rsvbuf->lock); + } + if (!page) + page = alloc_page(gfp); + DBG_BUGON(page && page_ref_count(page) != 1); + return page; +} + +void erofs_release_pages(struct page **pagepool) +{ + while (*pagepool) { + struct page *page = *pagepool; + + *pagepool = (struct page *)page_private(page); + /* try to fill reserved global pool first */ + if (z_erofs_rsvbuf && z_erofs_rsvbuf->nrpages < + z_erofs_rsv_nrpages) { + spin_lock(&z_erofs_rsvbuf->lock); + if (z_erofs_rsvbuf->nrpages < z_erofs_rsv_nrpages) { + z_erofs_rsvbuf->pages[z_erofs_rsvbuf->nrpages++] + = page; + spin_unlock(&z_erofs_rsvbuf->lock); + continue; + } + spin_unlock(&z_erofs_rsvbuf->lock); + } + put_page(page); + } +} + +void erofs_shrinker_register(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_init(&sbi->umount_mutex); + + spin_lock(&erofs_sb_list_lock); + list_add(&sbi->list, &erofs_sb_list); + spin_unlock(&erofs_sb_list_lock); +} + +void erofs_shrinker_unregister(struct super_block *sb) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + + mutex_lock(&sbi->umount_mutex); + while (!xa_empty(&sbi->managed_pslots)) { + z_erofs_shrink_scan(sbi, ~0UL); + cond_resched(); + } + spin_lock(&erofs_sb_list_lock); + list_del(&sbi->list); + spin_unlock(&erofs_sb_list_lock); + mutex_unlock(&sbi->umount_mutex); +} + +static unsigned long erofs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return atomic_long_read(&erofs_global_shrink_cnt) ?: SHRINK_EMPTY; +} + +static unsigned long erofs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct erofs_sb_info *sbi; + struct list_head *p; + + unsigned long nr = sc->nr_to_scan; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&erofs_sb_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + + /* Iterate over all mounted superblocks and try to shrink them */ + p = erofs_sb_list.next; + while (p != &erofs_sb_list) { + sbi = list_entry(p, struct erofs_sb_info, list); + + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (sbi->shrinker_run_no == run_no) + break; + + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + + spin_unlock(&erofs_sb_list_lock); + sbi->shrinker_run_no = run_no; + freed += z_erofs_shrink_scan(sbi, nr - freed); + spin_lock(&erofs_sb_list_lock); + /* Get the next list element before we move this one */ + p = p->next; + + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_move_tail(&sbi->list, &erofs_sb_list); + mutex_unlock(&sbi->umount_mutex); + + if (freed >= nr) + break; + } + spin_unlock(&erofs_sb_list_lock); + return freed; +} + +int __init erofs_init_shrinker(void) +{ + erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker"); + if (!erofs_shrinker_info) + return -ENOMEM; + + erofs_shrinker_info->count_objects = erofs_shrink_count; + erofs_shrinker_info->scan_objects = erofs_shrink_scan; + shrinker_register(erofs_shrinker_info); + return 0; +} + +void erofs_exit_shrinker(void) +{ + shrinker_free(erofs_shrinker_info); +} |
