-rw-r--r--   Documentation/filesystems/erofs.rst    12
-rw-r--r--   fs/erofs/Kconfig                        40
-rw-r--r--   fs/erofs/Makefile                        1
-rw-r--r--   fs/erofs/compress.h                     28
-rw-r--r--   fs/erofs/data.c                         73
-rw-r--r--   fs/erofs/decompressor.c                139
-rw-r--r--   fs/erofs/decompressor_lzma.c           290
-rw-r--r--   fs/erofs/erofs_fs.h                     73
-rw-r--r--   fs/erofs/inode.c                         2
-rw-r--r--   fs/erofs/internal.h                    105
-rw-r--r--   fs/erofs/pcpubuf.c                       6
-rw-r--r--   fs/erofs/super.c                       231
-rw-r--r--   fs/erofs/utils.c                        19
-rw-r--r--   fs/erofs/xattr.c                         4
-rw-r--r--   fs/erofs/zdata.c                       175
-rw-r--r--   fs/erofs/zdata.h                         7
-rw-r--r--   fs/erofs/zmap.c                         65
-rw-r--r--   include/linux/xz.h                     106
-rw-r--r--   include/trace/events/erofs.h             2
-rw-r--r--   lib/decompress_unxz.c                   10
-rw-r--r--   lib/xz/Kconfig                          13
-rw-r--r--   lib/xz/xz_dec_lzma2.c                  182
-rw-r--r--   lib/xz/xz_dec_stream.c                   6
-rw-r--r--   lib/xz/xz_dec_syms.c                     9
-rw-r--r--   lib/xz/xz_private.h                      3
25 files changed, 1281 insertions, 320 deletions
diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst
index b97579b7d8fb..01df283c7d04 100644
--- a/Documentation/filesystems/erofs.rst
+++ b/Documentation/filesystems/erofs.rst
@@ -19,9 +19,10 @@ It is designed as a better filesystem solution for the following scenarios:
immutable and bit-for-bit identical to the official golden image for
their releases due to security and other considerations and
- - hope to save some extra storage space with guaranteed end-to-end performance
- by using reduced metadata and transparent file compression, especially
- for those embedded devices with limited memory (ex, smartphone);
+ - hope to minimize extra storage space with guaranteed end-to-end performance
+ by using compact layout, transparent file compression and direct access,
+ especially for those embedded devices with limited memory and high-density
+ hosts with numerous containers;
Here are the main features of EROFS:
@@ -51,7 +52,9 @@ Here is the main features of EROFS:
- Support POSIX.1e ACLs by using xattrs;
- Support transparent data compression as an option:
- LZ4 algorithm with the fixed-sized output compression for high performance.
+ LZ4 algorithm with the fixed-sized output compression for high performance;
+
+ - Multiple device support for multi-layer container images.
The following git tree provides the file system user-space tools under
development (ex, formatting tool mkfs.erofs):
@@ -87,6 +90,7 @@ cache_strategy=%s Select a strategy for cached decompression from now on:
dax={always,never} Use direct access (no page cache). See
Documentation/filesystems/dax.rst.
dax A legacy option which is an alias for ``dax=always``.
+device=%s Specify a path to an extra device to be used together.
=================== =========================================================
On-disk details
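
With the new device= mount option, every extra blob of a multi-layer image is
passed in at mount time, one option per extra device, and the number of
options has to match the extra_devices count recorded in the on-disk super
block (otherwise the mount fails with -EINVAL). As a purely illustrative
example (the device paths are hypothetical), a two-blob image could be
mounted with ``mount -t erofs -o device=/dev/vdc /dev/vdb /mnt/erofs``.
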
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 14b747026742..f57255ab88ed 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -6,16 +6,22 @@ config EROFS_FS
select FS_IOMAP
select LIBCRC32C
help
- EROFS (Enhanced Read-Only File System) is a lightweight
- read-only file system with modern designs (eg. page-sized
- blocks, inline xattrs/data, etc.) for scenarios which need
- high-performance read-only requirements, e.g. Android OS
- for mobile phones and LIVECDs.
+ EROFS (Enhanced Read-Only File System) is a lightweight read-only
+ file system with modern designs (e.g. no buffer heads, inline
+ xattrs/data, chunk-based deduplication, multiple devices, etc.) for
+ scenarios which need high-performance read-only solutions, e.g.
+ smartphones with Android OS, LiveCDs and high-density hosts with
+ numerous containers.
- It also provides fixed-sized output compression support,
- which improves storage density, keeps relatively higher
- compression ratios, which is more useful to achieve high
- performance for embedded devices with limited memory.
+ It also provides fixed-sized output compression support in order to
+ improve storage density as well as keep relatively higher compression
+ ratios, and it implements in-place decompression to temporarily reuse
+ the file page for compressed data with proper strategies, which is
+ quite useful to ensure guaranteed end-to-end runtime decompression
+ performance under extreme memory pressure without extra cost.
+
+ See the documentation at <file:Documentation/filesystems/erofs.rst>
+ for more details.
If unsure, say N.
@@ -76,3 +82,19 @@ config EROFS_FS_ZIP
Enable fixed-sized output compression for EROFS.
If you don't want to enable compression feature, say N.
+
+config EROFS_FS_ZIP_LZMA
+ bool "EROFS LZMA compressed data support"
+ depends on EROFS_FS_ZIP
+ select XZ_DEC
+ select XZ_DEC_MICROLZMA
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing LZMA compressed data, specifically called microLZMA. It
+ gives better compression ratios than the LZ4 algorithm, at the
+ expense of more CPU overhead.
+
+ LZMA support is an experimental feature for now and so most file
+ systems will be readable without selecting this option.
+
+ If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 1f9aced49070..756fe2d65272 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_EROFS_FS) += erofs.o
erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 3701c72bacb2..579406504919 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -8,11 +8,6 @@
#include "internal.h"
-enum {
- Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
- Z_EROFS_COMPRESSION_RUNTIME_MAX
-};
-
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
@@ -25,6 +20,12 @@ struct z_erofs_decompress_req {
bool inplace_io, partial_decoding;
};
+struct z_erofs_decompressor {
+ int (*decompress)(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
+ char *name;
+};
+
/* some special page->private (unsigned long, see below) */
#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2)
#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2)
@@ -63,7 +64,7 @@ static inline bool z_erofs_is_shortlived_page(struct page *page)
return true;
}
-static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
+static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
struct page *page)
{
if (!z_erofs_is_shortlived_page(page))
@@ -74,13 +75,22 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
put_page(page);
} else {
/* follow the pcluster rule above. */
- set_page_private(page, 0);
- list_add(&page->lru, pagepool);
+ erofs_pagepool_add(pagepool, page);
}
return true;
}
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+ struct page *page)
+{
+ return page->mapping == MNGD_MAPPING(sbi);
+}
+
int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool);
+ struct page **pagepool);
+/* prototypes for specific algorithms */
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
#endif
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 9db829715652..808234d9190c 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -89,6 +89,7 @@ static int erofs_map_blocks(struct inode *inode,
erofs_off_t pos;
int err = 0;
+ map->m_deviceid = 0;
if (map->m_la >= inode->i_size) {
/* leave out-of-bound access unmapped */
map->m_flags = 0;
@@ -135,14 +136,8 @@ static int erofs_map_blocks(struct inode *inode,
map->m_flags = 0;
break;
default:
- /* only one device is supported for now */
- if (idx->device_id) {
- erofs_err(sb, "invalid device id %u @ %llu for nid %llu",
- le16_to_cpu(idx->device_id),
- chunknr, vi->nid);
- err = -EFSCORRUPTED;
- goto out_unlock;
- }
+ map->m_deviceid = le16_to_cpu(idx->device_id) &
+ EROFS_SB(sb)->device_id_mask;
map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
map->m_flags = EROFS_MAP_MAPPED;
break;
@@ -155,11 +150,55 @@ out:
return err;
}
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
+{
+ struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
+ struct erofs_device_info *dif;
+ int id;
+
+ /* primary device by default */
+ map->m_bdev = sb->s_bdev;
+ map->m_daxdev = EROFS_SB(sb)->dax_dev;
+
+ if (map->m_deviceid) {
+ down_read(&devs->rwsem);
+ dif = idr_find(&devs->tree, map->m_deviceid - 1);
+ if (!dif) {
+ up_read(&devs->rwsem);
+ return -ENODEV;
+ }
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ up_read(&devs->rwsem);
+ } else if (devs->extra_devices) {
+ down_read(&devs->rwsem);
+ idr_for_each_entry(&devs->tree, dif, id) {
+ erofs_off_t startoff, length;
+
+ if (!dif->mapped_blkaddr)
+ continue;
+ startoff = blknr_to_addr(dif->mapped_blkaddr);
+ length = blknr_to_addr(dif->blocks);
+
+ if (map->m_pa >= startoff &&
+ map->m_pa < startoff + length) {
+ map->m_pa -= startoff;
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ break;
+ }
+ }
+ up_read(&devs->rwsem);
+ }
+ return 0;
+}
+
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
int ret;
struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
map.m_la = offset;
map.m_llen = length;
@@ -168,8 +207,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (ret < 0)
return ret;
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(inode->i_sb, &mdev);
+ if (ret)
+ return ret;
+
+ iomap->bdev = mdev.m_bdev;
+ iomap->dax_dev = mdev.m_daxdev;
iomap->offset = map.m_la;
iomap->length = map.m_llen;
iomap->flags = 0;
@@ -188,15 +235,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->type = IOMAP_INLINE;
ipage = erofs_get_meta_page(inode->i_sb,
- erofs_blknr(map.m_pa));
+ erofs_blknr(mdev.m_pa));
if (IS_ERR(ipage))
return PTR_ERR(ipage);
iomap->inline_data = page_address(ipage) +
- erofs_blkoff(map.m_pa);
+ erofs_blkoff(mdev.m_pa);
iomap->private = ipage;
} else {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_pa;
+ iomap->addr = mdev.m_pa;
}
return 0;
}
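
To make the unified address space above concrete, here is a small userspace
model (not the kernel API; names and the 4 KiB block size are assumptions for
illustration only) of how a physical address with no recorded device id gets
rebased onto the extra device whose [mapped_blkaddr, mapped_blkaddr + blocks)
range contains it:

    /* simplified model, not kernel code; block size assumed to be 4 KiB */
    #include <stdint.h>
    #include <stdio.h>

    #define BLKSIZ 4096ULL

    struct dev_range { uint32_t mapped_blkaddr, blocks; };

    /* returns the extra-device index, or -1 for the primary device */
    static int resolve_pa(const struct dev_range *devs, int ndevs, uint64_t *pa)
    {
            int i;

            for (i = 0; i < ndevs; i++) {
                    uint64_t start = (uint64_t)devs[i].mapped_blkaddr * BLKSIZ;
                    uint64_t len = (uint64_t)devs[i].blocks * BLKSIZ;

                    if (*pa >= start && *pa < start + len) {
                            *pa -= start;   /* rebase onto that device */
                            return i;
                    }
            }
            return -1;
    }

    int main(void)
    {
            struct dev_range devs[] = { { .mapped_blkaddr = 1024, .blocks = 512 } };
            uint64_t pa = 1100 * BLKSIZ;    /* falls inside the extra device */

            printf("device %d, offset %llu\n", resolve_pa(devs, 1, &pa),
                   (unsigned long long)pa);
            return 0;
    }

Addresses below every mapped range keep resolving to the primary device, which
is exactly the fallback erofs_map_dev() applies above; the kernel version
additionally skips entries whose mapped_blkaddr is zero.
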
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index a5bc4b1b7813..bf37fc76b182 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -16,17 +16,6 @@
#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
#endif
-struct z_erofs_decompressor {
- /*
- * if destpages have sparsed pages, fill them with bounce pages.
- * it also check whether destpages indicate continuous physical memory.
- */
- int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool);
- int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
- char *name;
-};
-
int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_lz4_cfgs *lz4, int size)
@@ -63,8 +52,12 @@ int z_erofs_load_lz4_config(struct super_block *sb,
return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
}
-static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+/*
+ * Fill all gaps with bounce pages if it's a sparse page list. Also check if
+ * all physical pages are consecutive, which is often the case for moderate compression ratios (CR).
+ */
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
const unsigned int nr =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -119,7 +112,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
return kaddr ? 1 : 0;
}
-static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq,
void *inpage, unsigned int *inputmargin, int *maptype,
bool support_0padding)
{
@@ -189,7 +182,8 @@ docopy:
return src;
}
-static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq,
+ u8 *out)
{
unsigned int inputmargin;
u8 *headpage, *src;
@@ -216,8 +210,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
}
rq->inputsize -= inputmargin;
- src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
- support_0padding);
+ src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin,
+ &maptype, support_0padding);
if (IS_ERR(src))
return PTR_ERR(src);
@@ -233,7 +227,6 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
ret, rq->inputsize, inputmargin, rq->outputsize);
- WARN_ON(1);
print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
16, 1, src + inputmargin, rq->inputsize, true);
print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
@@ -242,6 +235,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
if (ret >= 0)
memset(out + ret, 0, rq->outputsize - ret);
ret = -EIO;
+ } else {
+ ret = 0;
}
if (maptype == 0) {
@@ -257,86 +252,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
return ret;
}
-static struct z_erofs_decompressor decompressors[] = {
- [Z_EROFS_COMPRESSION_SHIFTED] = {
- .name = "shifted"
- },
- [Z_EROFS_COMPRESSION_LZ4] = {
- .prepare_destpages = z_erofs_lz4_prepare_destpages,
- .decompress = z_erofs_lz4_decompress,
- .name = "lz4"
- },
-};
-
-static void copy_from_pcpubuf(struct page **out, const char *dst,
- unsigned short pageofs_out,
- unsigned int outputsize)
-{
- const char *end = dst + outputsize;
- const unsigned int righthalf = PAGE_SIZE - pageofs_out;
- const char *cur = dst - pageofs_out;
-
- while (cur < end) {
- struct page *const page = *out++;
-
- if (page) {
- char *buf = kmap_atomic(page);
-
- if (cur >= dst) {
- memcpy(buf, cur, min_t(uint, PAGE_SIZE,
- end - cur));
- } else {
- memcpy(buf + pageofs_out, cur + pageofs_out,
- min_t(uint, righthalf, end - cur));
- }
- kunmap_atomic(buf);
- }
- cur += PAGE_SIZE;
- }
-}
-
-static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const struct z_erofs_decompressor *alg = decompressors + rq->alg;
unsigned int dst_maptype;
void *dst;
int ret;
- /* two optimized fast paths only for non bigpcluster cases yet */
- if (rq->inputsize <= PAGE_SIZE) {
- if (nrpages_out == 1 && !rq->inplace_io) {
- DBG_BUGON(!*rq->out);
- dst = kmap_atomic(*rq->out);
- dst_maptype = 0;
- goto dstmap_out;
- }
-
- /*
- * For the case of small output size (especially much less
- * than PAGE_SIZE), memcpy the decompressed data rather than
- * compressed data is preferred.
- */
- if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
- dst = erofs_get_pcpubuf(1);
- if (IS_ERR(dst))
- return PTR_ERR(dst);
-
- rq->inplace_io = false;
- ret = alg->decompress(rq, dst);
- if (!ret)
- copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
- rq->outputsize);
-
- erofs_put_pcpubuf(dst);
- return ret;
- }
+ /* one optimized fast path only for non bigpcluster cases yet */
+ if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) {
+ DBG_BUGON(!*rq->out);
+ dst = kmap_atomic(*rq->out);
+ dst_maptype = 0;
+ goto dstmap_out;
}
/* general decoding path which can be used for all cases */
- ret = alg->prepare_destpages(rq, pagepool);
+ ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
if (ret < 0)
return ret;
if (ret) {
@@ -351,7 +285,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
dst_maptype = 2;
dstmap_out:
- ret = alg->decompress(rq, dst + rq->pageofs_out);
+ ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out);
if (!dst_maptype)
kunmap_atomic(dst);
@@ -360,8 +294,8 @@ dstmap_out:
return ret;
}
-static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -399,10 +333,25 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
return 0;
}
+static struct z_erofs_decompressor decompressors[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = {
+ .decompress = z_erofs_shifted_transform,
+ .name = "shifted"
+ },
+ [Z_EROFS_COMPRESSION_LZ4] = {
+ .decompress = z_erofs_lz4_decompress,
+ .name = "lz4"
+ },
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+ [Z_EROFS_COMPRESSION_LZMA] = {
+ .decompress = z_erofs_lzma_decompress,
+ .name = "lzma"
+ },
+#endif
+};
+
int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+ struct page **pagepool)
{
- if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
- return z_erofs_shifted_transform(rq, pagepool);
- return z_erofs_decompress_generic(rq, pagepool);
+ return decompressors[rq->alg].decompress(rq, pagepool);
}
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
new file mode 100644
index 000000000000..50045510a1f4
--- /dev/null
+++ b/fs/erofs/decompressor_lzma.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/xz.h>
+#include <linux/module.h>
+#include "compress.h"
+
+struct z_erofs_lzma {
+ struct z_erofs_lzma *next;
+ struct xz_dec_microlzma *state;
+ struct xz_buf buf;
+ u8 bounce[PAGE_SIZE];
+};
+
+/* considering the LZMA performance, no need to use a lockless list for now */
+static DEFINE_SPINLOCK(z_erofs_lzma_lock);
+static unsigned int z_erofs_lzma_max_dictsize;
+static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms;
+static struct z_erofs_lzma *z_erofs_lzma_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
+
+module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
+
+void z_erofs_lzma_exit(void)
+{
+ /* there should be no running fs instance */
+ while (z_erofs_lzma_avail_strms) {
+ struct z_erofs_lzma *strm;
+
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ DBG_BUGON(1);
+ return;
+ }
+ z_erofs_lzma_head = NULL;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ while (strm) {
+ struct z_erofs_lzma *n = strm->next;
+
+ if (strm->state)
+ xz_dec_microlzma_end(strm->state);
+ kfree(strm);
+ --z_erofs_lzma_avail_strms;
+ strm = n;
+ }
+ }
+}
+
+int z_erofs_lzma_init(void)
+{
+ unsigned int i;
+
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_lzma_nstrms)
+ z_erofs_lzma_nstrms = num_possible_cpus();
+
+ for (i = 0; i < z_erofs_lzma_nstrms; ++i) {
+ struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+
+ if (!strm) {
+ z_erofs_lzma_exit();
+ return -ENOMEM;
+ }
+ spin_lock(&z_erofs_lzma_lock);
+ strm->next = z_erofs_lzma_head;
+ z_erofs_lzma_head = strm;
+ spin_unlock(&z_erofs_lzma_lock);
+ ++z_erofs_lzma_avail_strms;
+ }
+ return 0;
+}
+
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size)
+{
+ static DEFINE_MUTEX(lzma_resize_mutex);
+ unsigned int dict_size, i;
+ struct z_erofs_lzma *strm, *head = NULL;
+ int err;
+
+ if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) {
+ erofs_err(sb, "invalid lzma cfgs, size=%u", size);
+ return -EINVAL;
+ }
+ if (lzma->format) {
+ erofs_err(sb, "unidentified lzma format %x, please check kernel version",
+ le16_to_cpu(lzma->format));
+ return -EINVAL;
+ }
+ dict_size = le32_to_cpu(lzma->dict_size);
+ if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) {
+ erofs_err(sb, "unsupported lzma dictionary size %u",
+ dict_size);
+ return -EINVAL;
+ }
+
+ erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
+
+ /* serialize in case two z_erofs_load_lzma_config() calls race, to avoid deadlock */
+ mutex_lock(&lzma_resize_mutex);
+
+ if (z_erofs_lzma_max_dictsize >= dict_size) {
+ mutex_unlock(&lzma_resize_mutex);
+ return 0;
+ }
+
+ /* 1. collect/isolate all streams for the following check */
+ for (i = 0; i < z_erofs_lzma_avail_strms; ++i) {
+ struct z_erofs_lzma *last;
+
+again:
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ wait_event(z_erofs_lzma_wq,
+ READ_ONCE(z_erofs_lzma_head));
+ goto again;
+ }
+ z_erofs_lzma_head = NULL;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ for (last = strm; last->next; last = last->next)
+ ++i;
+ last->next = head;
+ head = strm;
+ }
+
+ err = 0;
+ /* 2. walk each isolated stream and grow max dict_size if needed */
+ for (strm = head; strm; strm = strm->next) {
+ if (strm->state)
+ xz_dec_microlzma_end(strm->state);
+ strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);
+ if (!strm->state)
+ err = -ENOMEM;
+ }
+
+ /* 3. push back all to the global list and update max dict_size */
+ spin_lock(&z_erofs_lzma_lock);
+ DBG_BUGON(z_erofs_lzma_head);
+ z_erofs_lzma_head = head;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ z_erofs_lzma_max_dictsize = dict_size;
+ mutex_unlock(&lzma_resize_mutex);
+ return err;
+}
+
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_in =
+ PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ unsigned int inputmargin, inlen, outlen, pageofs;
+ struct z_erofs_lzma *strm;
+ u8 *kin;
+ bool bounced = false;
+ int no, ni, j, err = 0;
+
+ /* 1. get the exact LZMA compressed size */
+ kin = kmap(*rq->in);
+ inputmargin = 0;
+ while (!kin[inputmargin & ~PAGE_MASK])
+ if (!(++inputmargin & ~PAGE_MASK))
+ break;
+
+ if (inputmargin >= PAGE_SIZE) {
+ kunmap(*rq->in);
+ return -EFSCORRUPTED;
+ }
+ rq->inputsize -= inputmargin;
+
+ /* 2. get an available lzma context */
+again:
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head));
+ goto again;
+ }
+ z_erofs_lzma_head = strm->next;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ /* 3. multi-call decompress */
+ inlen = rq->inputsize;
+ outlen = rq->outputsize;
+ xz_dec_microlzma_reset(strm->state, inlen, outlen,
+ !rq->partial_decoding);
+ pageofs = rq->pageofs_out;
+ strm->buf.in = kin + inputmargin;
+ strm->buf.in_pos = 0;
+ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin);
+ inlen -= strm->buf.in_size;
+ strm->buf.out = NULL;
+ strm->buf.out_pos = 0;
+ strm->buf.out_size = 0;
+
+ for (ni = 0, no = -1;;) {
+ enum xz_ret xz_err;
+
+ if (strm->buf.out_pos == strm->buf.out_size) {
+ if (strm->buf.out) {
+ kunmap(rq->out[no]);
+ strm->buf.out = NULL;
+ }
+
+ if (++no >= nrpages_out || !outlen) {
+ erofs_err(rq->sb, "decompressed buf out of bound");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ strm->buf.out_pos = 0;
+ strm->buf.out_size = min_t(u32, outlen,
+ PAGE_SIZE - pageofs);
+ outlen -= strm->buf.out_size;
+ if (rq->out[no])
+ strm->buf.out = kmap(rq->out[no]) + pageofs;
+ pageofs = 0;
+ } else if (strm->buf.in_pos == strm->buf.in_size) {
+ kunmap(rq->in[ni]);
+
+ if (++ni >= nrpages_in || !inlen) {
+ erofs_err(rq->sb, "compressed buf out of bound");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ strm->buf.in_pos = 0;
+ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
+ inlen -= strm->buf.in_size;
+ kin = kmap(rq->in[ni]);
+ strm->buf.in = kin;
+ bounced = false;
+ }
+
+ /*
+ * Handle overlapping: use the bounce buffer if the compressed
+ * data is still being processed; otherwise, use short-lived pages
+ * from the on-stack pagepool, where pages are shared within the
+ * same request.
+ */
+ if (!bounced && rq->out[no] == rq->in[ni]) {
+ memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
+ strm->buf.in = strm->bounce;
+ bounced = true;
+ }
+ for (j = ni + 1; j < nrpages_in; ++j) {
+ struct page *tmppage;
+
+ if (rq->out[no] != rq->in[j])
+ continue;
+
+ DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
+ rq->in[j]));
+ tmppage = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
+ }
+ xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
+ DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
+ DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+
+ if (xz_err != XZ_OK) {
+ if (xz_err == XZ_STREAM_END && !outlen)
+ break;
+ erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
+ xz_err, rq->inputsize, rq->outputsize);
+ err = -EFSCORRUPTED;
+ break;
+ }
+ }
+ if (no < nrpages_out && strm->buf.out)
+ kunmap(rq->out[no]);
+ if (ni < nrpages_in)
+ kunmap(rq->in[ni]);
+ /* 4. push back LZMA stream context to the global list */
+ spin_lock(&z_erofs_lzma_lock);
+ strm->next = z_erofs_lzma_head;
+ z_erofs_lzma_head = strm;
+ spin_unlock(&z_erofs_lzma_lock);
+ wake_up(&z_erofs_lzma_wq);
+ return err;
+}
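
The loop above has to juggle per-page input/output windows and in-place data
reuse; stripped of all that, the underlying MicroLZMA calls boil down to the
following minimal kernel-style sketch (an illustration only, assuming the whole
compressed extent and the decompressed result each fit in one contiguous
buffer, and that dict_size is at least as large as the dictionary used at
compression time):

    #include <linux/errno.h>
    #include <linux/types.h>
    #include <linux/xz.h>

    static int microlzma_decode_once(const u8 *in, u32 inlen,
                                     u8 *out, u32 outlen, u32 dict_size)
    {
            struct xz_dec_microlzma *s;
            struct xz_buf b = {
                    .in = in, .in_pos = 0, .in_size = inlen,
                    .out = out, .out_pos = 0, .out_size = outlen,
            };
            enum xz_ret ret;

            s = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);
            if (!s)
                    return -ENOMEM;
            /* true: decode exactly outlen bytes (no partial decoding) */
            xz_dec_microlzma_reset(s, inlen, outlen, true);
            ret = xz_dec_microlzma_run(s, &b);
            xz_dec_microlzma_end(s);
            return ret == XZ_STREAM_END ? 0 : -EIO;
    }

The page-by-page loop above cannot be collapsed like this because an output
page may be the very page holding compressed input, which is why the bounce
buffer and the short-lived page copies are needed.
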
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index b0b23f41abc3..083997a034e5 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -21,14 +21,29 @@
#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002
#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004
+#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008
+#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008
#define EROFS_ALL_FEATURE_INCOMPAT \
(EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
- EROFS_FEATURE_INCOMPAT_CHUNKED_FILE)
+ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
+ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
+ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2)
#define EROFS_SB_EXTSLOT_SIZE 16
+struct erofs_deviceslot {
+ union {
+ u8 uuid[16]; /* used for device manager later */
+ u8 userdata[64]; /* digest(sha256), etc. */
+ } u;
+ __le32 blocks; /* total fs blocks of this device */
+ __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
+ u8 reserved[56];
+};
+#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
+
/* erofs on-disk super block (currently 128 bytes) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
@@ -54,7 +69,9 @@ struct erofs_super_block {
/* customized sliding window size instead of 64k by default */
__le16 lz4_max_distance;
} __packed u1;
- __u8 reserved2[42];
+ __le16 extra_devices; /* # of devices besides the primary device */
+ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */
+ __u8 reserved2[38];
};
/*
@@ -238,7 +255,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
/* 8-byte inode chunk indexes */
struct erofs_inode_chunk_index {
__le16 advise; /* always 0, don't care for now */
- __le16 device_id; /* back-end storage id, always 0 for now */
+ __le16 device_id; /* back-end storage id (with bits masked) */
__le32 blkaddr; /* start block address of this inode chunk */
};
@@ -247,10 +264,11 @@ struct erofs_inode_chunk_index {
/* available compression algorithm types (for h_algorithmtype) */
enum {
- Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_LZMA = 1,
Z_EROFS_COMPRESSION_MAX
};
-#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1))
+#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
/* 14 bytes (+ length field = 16 bytes) */
struct z_erofs_lz4_cfgs {
@@ -259,6 +277,15 @@ struct z_erofs_lz4_cfgs {
u8 reserved[10];
} __packed;
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lzma_cfgs {
+ __le32 dict_size;
+ __le16 format;
+ u8 reserved[8];
+} __packed;
+
+#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
+
/*
* bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
* e.g. for 4k logical cluster size, 4B if compacted 2B is off;
@@ -288,35 +315,34 @@ struct z_erofs_map_header {
#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8
/*
- * Fixed-sized output compression ondisk Logical Extent cluster type:
- * 0 - literal (uncompressed) cluster
- * 1 - compressed cluster (for the head logical cluster)
- * 2 - compressed cluster (for the other logical clusters)
+ * Fixed-sized output compression on-disk logical cluster type:
+ * 0 - literal (uncompressed) lcluster
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * 2 - compressed lcluster (for NONHEAD lclusters)
*
* In detail,
- * 0 - literal (uncompressed) cluster,
+ * 0 - literal (uncompressed) lcluster,
* di_advise = 0
- * di_clusterofs = the literal data offset of the cluster
- * di_blkaddr = the blkaddr of the literal cluster
+ * di_clusterofs = the literal data offset of the lcluster
+ * di_blkaddr = the blkaddr of the literal pcluster
*
- * 1 - compressed cluster (for the head logical cluster)
- * di_advise = 1
- * di_clusterofs = the decompressed data offset of the cluster
- * di_blkaddr = the blkaddr of the compressed cluster
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * di_advise = 1 or 3
+ * di_clusterofs = the decompressed data offset of the lcluster
+ * di_blkaddr = the blkaddr of the compressed pcluster
*
- * 2 - compressed cluster (for the other logical clusters)
+ * 2 - compressed lcluster (for NONHEAD lclusters)
* di_advise = 2
* di_clusterofs =
- * the decompressed data offset in its own head cluster
- * di_u.delta[0] = distance to its corresponding head cluster
- * di_u.delta[1] = distance to its corresponding tail cluster
- * (di_advise could be 0, 1 or 2)
+ * the decompressed data offset in its own HEAD lcluster
+ * di_u.delta[0] = distance to this HEAD lcluster
+ * di_u.delta[1] = distance to the next HEAD lcluster
*/
enum {
Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0,
- Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1,
+ Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1,
Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2,
- Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3,
+ Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3,
Z_EROFS_VLE_CLUSTER_TYPE_MAX
};
@@ -384,6 +410,7 @@ static inline void erofs_check_ondisk_layout_definitions(void)
/* keep in sync between 2 index structures for better extendibility */
BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
sizeof(struct z_erofs_vle_decompressed_index));
+ BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
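
For reference, the device table simply starts at byte offset
devt_slotoff * EROFS_DEVT_SLOT_SIZE and holds one 128-byte slot per extra
device. A hypothetical userspace sketch (the helper names are made up; only
the offsets follow struct erofs_deviceslot above) that pulls the two
interesting fields out of slot i:

    #include <stdint.h>

    #define EROFS_DEVT_SLOT_SIZE 128

    static uint32_t get_le32(const uint8_t *p)
    {
            return p[0] | p[1] << 8 | p[2] << 16 | (uint32_t)p[3] << 24;
    }

    /* @img: primary device image in memory; @devt_slotoff: from the super block */
    static void read_devslot(const uint8_t *img, uint16_t devt_slotoff,
                             unsigned int i, uint32_t *blocks,
                             uint32_t *mapped_blkaddr)
    {
            const uint8_t *dis = img +
                    ((uint64_t)devt_slotoff + i) * EROFS_DEVT_SLOT_SIZE;

            *blocks = get_le32(dis + 64);          /* right after the 64-byte union */
            *mapped_blkaddr = get_le32(dis + 68);
    }

erofs_init_devices() in super.c below walks the same slots through
erofs_get_meta_page() to fill each struct erofs_device_info.
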
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index a552399e211d..2345f1de438e 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -192,7 +192,7 @@ static struct page *erofs_read_inode(struct inode *inode,
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
inode->i_flags &= ~S_DAX;
- if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
vi->datalayout == EROFS_INODE_FLAT_PLAIN)
inode->i_flags |= S_DAX;
if (!nblks)
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 9524e155b38f..3265688af7f9 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -47,7 +47,16 @@ typedef u64 erofs_off_t;
/* data type for filesystem-wide blocks number */
typedef u32 erofs_blk_t;
-struct erofs_fs_context {
+struct erofs_device_info {
+ char *path;
+ struct block_device *bdev;
+ struct dax_device *dax_dev;
+
+ u32 blocks;
+ u32 mapped_blkaddr;
+};
+
+struct erofs_mount_opts {
#ifdef CONFIG_EROFS_FS_ZIP
/* current strategy of how to use managed cache */
unsigned char cache_strategy;
@@ -60,6 +69,18 @@ struct erofs_fs_context {
unsigned int mount_opt;
};
+struct erofs_dev_context {
+ struct idr tree;
+ struct rw_semaphore rwsem;
+
+ unsigned int extra_devices;
+};
+
+struct erofs_fs_context {
+ struct erofs_mount_opts opt;
+ struct erofs_dev_context *devs;
+};
+
/* all filesystem-wide lz4 configurations */
struct erofs_sb_lz4_info {
/* # of pages needed for EROFS lz4 rolling decompression */
@@ -69,6 +90,7 @@ struct erofs_sb_lz4_info {
};
struct erofs_sb_info {
+ struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
/* list for all registered superblocks, mainly for shrinker */
struct list_head list;
@@ -85,12 +107,16 @@ struct erofs_sb_info {
struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
+ struct erofs_dev_context *devs;
struct dax_device *dax_dev;
- u32 blocks;
+ u64 total_blocks;
+ u32 primarydevice_blocks;
+
u32 meta_blkaddr;
#ifdef CONFIG_EROFS_FS_XATTR
u32 xattr_blkaddr;
#endif
+ u16 device_id_mask; /* valid bits of device id to be used */
/* inode slot unit size in bit shift */
unsigned char islotbits;
@@ -108,8 +134,6 @@ struct erofs_sb_info {
u8 volume_name[16]; /* volume name */
u32 feature_compat;
u32 feature_incompat;
-
- struct erofs_fs_context ctx; /* options */
};
#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -121,9 +145,9 @@ struct erofs_sb_info {
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
#define EROFS_MOUNT_DAX_NEVER 0x00000080
-#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
-#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option)
-#define test_opt(ctx, option) ((ctx)->mount_opt & EROFS_MOUNT_##option)
+#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
+#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
+#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
enum {
EROFS_ZIP_CACHE_DISABLED,
@@ -237,6 +261,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
/* atomic flag definitions */
@@ -307,6 +332,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value)
EROFS_I_DATALAYOUT_BITS);
}
+/*
+ * Different from grab_cache_page_nowait(), reclaiming is never triggered
+ * when allocating new pages.
+ */
+static inline
+struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
+ pgoff_t index)
+{
+ return pagecache_get_page(mapping, index,
+ FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+ readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
+}
+
extern const struct super_operations erofs_sops;
extern const struct address_space_operations erofs_raw_access_aops;
@@ -338,7 +376,7 @@ extern const struct address_space_operations z_erofs_aops;
* of the corresponding uncompressed data in the file.
*/
enum {
- BH_Zipped = BH_PrivateStart,
+ BH_Encoded = BH_PrivateStart,
BH_FullMapped,
};
@@ -346,8 +384,8 @@ enum {
#define EROFS_MAP_MAPPED (1 << BH_Mapped)
/* Located in metadata (could be copied from bd_inode) */
#define EROFS_MAP_META (1 << BH_Meta)
-/* The extent has been compressed */
-#define EROFS_MAP_ZIPPED (1 << BH_Zipped)
+/* The extent is encoded */
+#define EROFS_MAP_ENCODED (1 << BH_Encoded)
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
@@ -355,6 +393,8 @@ struct erofs_map_blocks {
erofs_off_t m_pa, m_la;
u64 m_plen, m_llen;
+ unsigned short m_deviceid;
+ char m_algorithmformat;
unsigned int m_flags;
struct page *mpage;
@@ -367,6 +407,13 @@ struct erofs_map_blocks {
* approach instead if possible since it's more metadata lightweight.)
*/
#define EROFS_GET_BLOCKS_FIEMAP 0x0002
+/* Used to map the whole extent if non-negligible data is requested for LZMA */
+#define EROFS_GET_BLOCKS_READMORE 0x0004
+
+enum {
+ Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+ Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
/* zmap.c */
extern const struct iomap_ops z_erofs_iomap_report_ops;
@@ -386,9 +433,18 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+struct erofs_map_dev {
+ struct block_device *m_bdev;
+ struct dax_device *m_daxdev;
+
+ erofs_off_t m_pa;
+ unsigned int m_deviceid;
+};
+
/* data.c */
extern const struct file_operations erofs_file_fops;
struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
@@ -443,7 +499,14 @@ void erofs_pcpubuf_init(void);
void erofs_pcpubuf_exit(void);
/* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
+static inline void erofs_pagepool_add(struct page **pagepool,
+ struct page *page)
+{
+ set_page_private(page, (unsigned long)*pagepool);
+ *pagepool = page;
+}
+void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
int erofs_workgroup_put(struct erofs_workgroup *grp);
@@ -483,6 +546,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+int z_erofs_lzma_init(void);
+void z_erofs_lzma_exit(void);
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size);
+#else
+static inline int z_erofs_lzma_init(void) { return 0; }
+static inline void z_erofs_lzma_exit(void) {}
+static inline int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size) {
+ if (lzma) {
+ erofs_err(sb, "lzma algorithm isn't enabled");
+ return -EINVAL;
+ }
+ return 0;
+}
+#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
index 6c885575128a..a2efd833d1b6 100644
--- a/fs/erofs/pcpubuf.c
+++ b/fs/erofs/pcpubuf.c
@@ -49,7 +49,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
{
static DEFINE_MUTEX(pcb_resize_mutex);
static unsigned int pcb_nrpages;
- LIST_HEAD(pagepool);
+ struct page *pagepool = NULL;
int delta, cpu, ret, i;
mutex_lock(&pcb_resize_mutex);
@@ -102,13 +102,13 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
vunmap(old_ptr);
free_pagearray:
while (i)
- list_add(&oldpages[--i]->lru, &pagepool);
+ erofs_pagepool_add(&pagepool, oldpages[--i]);
kfree(oldpages);
if (ret)
break;
}
pcb_nrpages = nrpages;
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
out:
mutex_unlock(&pcb_resize_mutex);
return ret;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 11b88559f8bf..6a969b1e0ee6 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -225,6 +225,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
case Z_EROFS_COMPRESSION_LZ4:
ret = z_erofs_load_lz4_config(sb, dsb, data, size);
break;
+ case Z_EROFS_COMPRESSION_LZMA:
+ ret = z_erofs_load_lzma_config(sb, dsb, data, size);
+ break;
default:
DBG_BUGON(1);
ret = -EFAULT;
@@ -252,6 +255,79 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
}
#endif
+static int erofs_init_devices(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ unsigned int ondisk_extradevs;
+ erofs_off_t pos;
+ struct page *page = NULL;
+ struct erofs_device_info *dif;
+ struct erofs_deviceslot *dis;
+ void *ptr;
+ int id, err = 0;
+
+ sbi->total_blocks = sbi->primarydevice_blocks;
+ if (!erofs_sb_has_device_table(sbi))
+ ondisk_extradevs = 0;
+ else
+ ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
+
+ if (ondisk_extradevs != sbi->devs->extra_devices) {
+ erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
+ ondisk_extradevs, sbi->devs->extra_devices);
+ return -EINVAL;
+ }
+ if (!ondisk_extradevs)
+ return 0;
+
+ sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
+ pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
+ down_read(&sbi->devs->rwsem);
+ idr_for_each_entry(&sbi->devs->tree, dif, id) {
+ erofs_blk_t blk = erofs_blknr(pos);
+ struct block_device *bdev;
+
+ if (!page || page->index != blk) {
+ if (page) {
+ kunmap(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
+ page = erofs_get_meta_page(sb, blk);
+ if (IS_ERR(page)) {
+ up_read(&sbi->devs->rwsem);
+ return PTR_ERR(page);
+ }
+ ptr = kmap(page);
+ }
+ dis = ptr + erofs_blkoff(pos);
+
+ bdev = blkdev_get_by_path(dif->path,
+ FMODE_READ | FMODE_EXCL,
+ sb->s_type);
+ if (IS_ERR(bdev)) {
+ err = PTR_ERR(bdev);
+ goto err_out;
+ }
+ dif->bdev = bdev;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev);
+ dif->blocks = le32_to_cpu(dis->blocks);
+ dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ sbi->total_blocks += dif->blocks;
+ pos += EROFS_DEVT_SLOT_SIZE;
+ }
+err_out:
+ up_read(&sbi->devs->rwsem);
+ if (page) {
+ kunmap(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ return err;
+}
+
static int erofs_read_superblock(struct super_block *sb)
{
struct erofs_sb_info *sbi;
@@ -303,7 +379,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->sb_size);
goto out;
}
- sbi->blocks = le32_to_cpu(dsb->blocks);
+ sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -330,6 +406,11 @@ static int erofs_read_superblock(struct super_block *sb)
ret = erofs_load_compr_cfgs(sb, dsb);
else
ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ if (ret < 0)
+ goto out;
+
+ /* handle multiple devices */
+ ret = erofs_init_devices(sb, dsb);
out:
kunmap(page);
put_page(page);
@@ -340,15 +421,15 @@ out:
static void erofs_default_options(struct erofs_fs_context *ctx)
{
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
- ctx->max_sync_decompress_pages = 3;
- ctx->readahead_sync_decompress = false;
+ ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+ ctx->opt.max_sync_decompress_pages = 3;
+ ctx->opt.readahead_sync_decompress = false;
#endif
#ifdef CONFIG_EROFS_FS_XATTR
- set_opt(ctx, XATTR_USER);
+ set_opt(&ctx->opt, XATTR_USER);
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- set_opt(ctx, POSIX_ACL);
+ set_opt(&ctx->opt, POSIX_ACL);
#endif
}
@@ -358,6 +439,7 @@ enum {
Opt_cache_strategy,
Opt_dax,
Opt_dax_enum,
+ Opt_device,
Opt_err
};
@@ -381,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
erofs_param_cache_strategy),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
+ fsparam_string("device", Opt_device),
{}
};
@@ -392,12 +475,12 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
switch (mode) {
case EROFS_MOUNT_DAX_ALWAYS:
warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- set_opt(ctx, DAX_ALWAYS);
- clear_opt(ctx, DAX_NEVER);
+ set_opt(&ctx->opt, DAX_ALWAYS);
+ clear_opt(&ctx->opt, DAX_NEVER);
return true;
case EROFS_MOUNT_DAX_NEVER:
- set_opt(ctx, DAX_NEVER);
- clear_opt(ctx, DAX_ALWAYS);
+ set_opt(&ctx->opt, DAX_NEVER);
+ clear_opt(&ctx->opt, DAX_ALWAYS);
return true;
default:
DBG_BUGON(1);
@@ -412,9 +495,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
static int erofs_fc_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
- struct erofs_fs_context *ctx __maybe_unused = fc->fs_private;
+ struct erofs_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- int opt;
+ struct erofs_device_info *dif;
+ int opt, ret;
opt = fs_parse(fc, erofs_fs_parameters, param, &result);
if (opt < 0)
@@ -424,9 +508,9 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_user_xattr:
#ifdef CONFIG_EROFS_FS_XATTR
if (result.boolean)
- set_opt(ctx, XATTR_USER);
+ set_opt(&ctx->opt, XATTR_USER);
else
- clear_opt(ctx, XATTR_USER);
+ clear_opt(&ctx->opt, XATTR_USER);
#else
errorfc(fc, "{,no}user_xattr options not supported");
#endif
@@ -434,16 +518,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_acl:
#ifdef CONFIG_EROFS_FS_POSIX_ACL
if (result.boolean)
- set_opt(ctx, POSIX_ACL);
+ set_opt(&ctx->opt, POSIX_ACL);
else
- clear_opt(ctx, POSIX_ACL);
+ clear_opt(&ctx->opt, POSIX_ACL);
#else
errorfc(fc, "{,no}acl options not supported");
#endif
break;
case Opt_cache_strategy:
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->cache_strategy = result.uint_32;
+ ctx->opt.cache_strategy = result.uint_32;
#else
errorfc(fc, "compression not supported, cache_strategy ignored");
#endif
@@ -456,6 +540,25 @@ static int erofs_fc_parse_param(struct fs_context *fc,
if (!erofs_fc_set_dax_mode(fc, result.uint_32))
return -EINVAL;
break;
+ case Opt_device:
+ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+ if (!dif)
+ return -ENOMEM;
+ dif->path = kstrdup(param->string, GFP_KERNEL);
+ if (!dif->path) {
+ kfree(dif);
+ return -ENOMEM;
+ }
+ down_write(&ctx->devs->rwsem);
+ ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
+ up_write(&ctx->devs->rwsem);
+ if (ret < 0) {
+ kfree(dif->path);
+ kfree(dif);
+ return ret;
+ }
+ ++ctx->devs->extra_devices;
+ break;
default:
return -ENOPARAM;
}
@@ -540,15 +643,19 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
sb->s_fs_info = sbi;
+ sbi->opt = ctx->opt;
sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
+ sbi->devs = ctx->devs;
+ ctx->devs = NULL;
+
err = erofs_read_superblock(sb);
if (err)
return err;
- if (test_opt(ctx, DAX_ALWAYS) &&
+ if (test_opt(&sbi->opt, DAX_ALWAYS) &&
!dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) {
errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
- clear_opt(ctx, DAX_ALWAYS);
+ clear_opt(&sbi->opt, DAX_ALWAYS);
}
sb->s_flags |= SB_RDONLY | SB_NOATIME;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -557,13 +664,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_op = &erofs_sops;
sb->s_xattr = erofs_xattr_handlers;
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(&sbi->opt, POSIX_ACL))
sb->s_flags |= SB_POSIXACL;
else
sb->s_flags &= ~SB_POSIXACL;
- sbi->ctx = *ctx;
-
#ifdef CONFIG_EROFS_FS_ZIP
xa_init(&sbi->managed_pslots);
#endif
@@ -607,20 +712,44 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
DBG_BUGON(!sb_rdonly(sb));
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(&ctx->opt, POSIX_ACL))
fc->sb_flags |= SB_POSIXACL;
else
fc->sb_flags &= ~SB_POSIXACL;
- sbi->ctx = *ctx;
+ sbi->opt = ctx->opt;
fc->sb_flags |= SB_RDONLY;
return 0;
}
+static int erofs_release_device_info(int id, void *ptr, void *data)
+{
+ struct erofs_device_info *dif = ptr;
+
+ fs_put_dax(dif->dax_dev);
+ if (dif->bdev)
+ blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
+ kfree(dif->path);
+ kfree(dif);
+ return 0;
+}
+
+static void erofs_free_dev_context(struct erofs_dev_context *devs)
+{
+ if (!devs)
+ return;
+ idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
+ idr_destroy(&devs->tree);
+ kfree(devs);
+}
+
static void erofs_fc_free(struct fs_context *fc)
{
- kfree(fc->fs_private);
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ erofs_free_dev_context(ctx->devs);
+ kfree(ctx);
}
static const struct fs_context_operations erofs_context_ops = {
@@ -632,15 +761,21 @@ static const struct fs_context_operations erofs_context_ops = {
static int erofs_init_fs_context(struct fs_context *fc)
{
- fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL);
- if (!fc->fs_private)
- return -ENOMEM;
+ struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- /* set default mount options */
- erofs_default_options(fc->fs_private);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
+ if (!ctx->devs) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ fc->fs_private = ctx;
+ idr_init(&ctx->devs->tree);
+ init_rwsem(&ctx->devs->rwsem);
+ erofs_default_options(ctx);
fc->ops = &erofs_context_ops;
-
return 0;
}
@@ -659,6 +794,8 @@ static void erofs_kill_sb(struct super_block *sb)
sbi = EROFS_SB(sb);
if (!sbi)
return;
+
+ erofs_free_dev_context(sbi->devs);
fs_put_dax(sbi->dax_dev);
kfree(sbi);
sb->s_fs_info = NULL;
@@ -706,6 +843,10 @@ static int __init erofs_module_init(void)
if (err)
goto shrinker_err;
+ err = z_erofs_lzma_init();
+ if (err)
+ goto lzma_err;
+
erofs_pcpubuf_init();
err = z_erofs_init_zip_subsystem();
if (err)
@@ -720,6 +861,8 @@ static int __init erofs_module_init(void)
fs_err:
z_erofs_exit_zip_subsystem();
zip_err:
+ z_erofs_lzma_exit();
+lzma_err:
erofs_exit_shrinker();
shrinker_err:
kmem_cache_destroy(erofs_inode_cachep);
@@ -730,11 +873,13 @@ icache_err:
static void __exit erofs_module_exit(void)
{
unregister_filesystem(&erofs_fs_type);
- z_erofs_exit_zip_subsystem();
- erofs_exit_shrinker();
- /* Ensure all RCU free inodes are safe before cache is destroyed. */
+ /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
rcu_barrier();
+
+ z_erofs_exit_zip_subsystem();
+ z_erofs_lzma_exit();
+ erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
erofs_pcpubuf_exit();
}
@@ -748,7 +893,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = sb->s_magic;
buf->f_bsize = EROFS_BLKSIZ;
- buf->f_blocks = sbi->blocks;
+ buf->f_blocks = sbi->total_blocks;
buf->f_bfree = buf->f_bavail = 0;
buf->f_files = ULLONG_MAX;
@@ -763,31 +908,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
static int erofs_show_options(struct seq_file *seq, struct dentry *root)
{
struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
- struct erofs_fs_context *ctx = &sbi->ctx;
+ struct erofs_mount_opts *opt = &sbi->opt;
#ifdef CONFIG_EROFS_FS_XATTR
- if (test_opt(ctx, XATTR_USER))
+ if (test_opt(opt, XATTR_USER))
seq_puts(seq, ",user_xattr");
else
seq_puts(seq, ",nouser_xattr");
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(opt, POSIX_ACL))
seq_puts(seq, ",acl");
else
seq_puts(seq, ",noacl");
#endif
#ifdef CONFIG_EROFS_FS_ZIP
- if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
+ if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
seq_puts(seq, ",cache_strategy=disabled");
- else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
seq_puts(seq, ",cache_strategy=readahead");
- else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
seq_puts(seq, ",cache_strategy=readaround");
#endif
- if (test_opt(ctx, DAX_ALWAYS))
+ if (test_opt(opt, DAX_ALWAYS))
seq_puts(seq, ",dax=always");
- if (test_opt(ctx, DAX_NEVER))
+ if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
return 0;
}
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index bd86067a63f7..84da2c280012 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -6,20 +6,29 @@
#include "internal.h"
#include <linux/pagevec.h>
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
{
- struct page *page;
+ struct page *page = *pagepool;
- if (!list_empty(pool)) {
- page = lru_to_page(pool);
+ if (page) {
DBG_BUGON(page_ref_count(page) != 1);
- list_del(&page->lru);
+ *pagepool = (struct page *)page_private(page);
} else {
page = alloc_page(gfp);
}
return page;
}
+void erofs_release_pages(struct page **pagepool)
+{
+ while (*pagepool) {
+ struct page *page = *pagepool;
+
+ *pagepool = (struct page *)page_private(page);
+ put_page(page);
+ }
+}
+
#ifdef CONFIG_EROFS_FS_ZIP
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
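
The list_head based pool is gone: the pool is now just the head of a
singly-linked stack of spare pages chained through page_private(). A minimal
usage sketch of the new calling convention (the wrapper function is made up
for illustration; the three helpers are the ones added above):

    static void pagepool_usage_sketch(void)
    {
            struct page *pagepool = NULL, *page;

            /* pops a pooled page chained via page_private(), else alloc_page() */
            page = erofs_allocpage(&pagepool, GFP_KERNEL | __GFP_NOFAIL);

            /* ... use it as a bounce or short-lived page ... */

            /* push it back so a later erofs_allocpage() in the same request reuses it */
            erofs_pagepool_add(&pagepool, page);

            /* when the request completes, put every page still sitting in the pool */
            erofs_release_pages(&pagepool);
    }

The pcpubuf.c and zdata.c changes in this patch follow exactly this pattern.
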
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 778f2c52295d..01c581e93c5f 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -429,7 +429,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
static bool erofs_xattr_user_list(struct dentry *dentry)
{
- return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER);
+ return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER);
}
static bool erofs_xattr_trusted_list(struct dentry *dentry)
@@ -476,7 +476,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
switch (handler->flags) {
case EROFS_XATTR_INDEX_USER:
- if (!test_opt(&sbi->ctx, XATTR_USER))
+ if (!test_opt(&sbi->opt, XATTR_USER))
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 11c7a1aaebad..bcb1b91b234f 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -236,7 +236,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock);
static void preload_compressed_pages(struct z_erofs_collector *clt,
struct address_space *mc,
enum z_erofs_cache_alloctype type,
- struct list_head *pagepool)
+ struct page **pagepool)
{
struct z_erofs_pcluster *pcl = clt->pcl;
bool standalone = true;
@@ -287,12 +287,10 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
continue;
- if (page) {
+ if (page)
put_page(page);
- } else if (newpage) {
- set_page_private(newpage, 0);
- list_add(&newpage->lru, pagepool);
- }
+ else if (newpage)
+ erofs_pagepool_add(pagepool, newpage);
}
/*
@@ -476,6 +474,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
struct erofs_workgroup *grp;
int err;
+ if (!(map->m_flags & EROFS_MAP_ENCODED)) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+
/* no available pcluster, let's allocate one */
pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
if (IS_ERR(pcl))
@@ -483,16 +486,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
atomic_set(&pcl->obj.refcount, 1);
pcl->obj.index = map->m_pa >> PAGE_SHIFT;
-
+ pcl->algorithmformat = map->m_algorithmformat;
pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
(map->m_flags & EROFS_MAP_FULL_MAPPED ?
Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
- if (map->m_flags & EROFS_MAP_ZIPPED)
- pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
- else
- pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
-
/* new pclusters should be claimed as type 1, primary and followed */
pcl->next = clt->owned_head;
clt->mode = COLLECT_PRIMARY_FOLLOWED;
@@ -643,7 +641,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, struct list_head *pagepool)
+ struct page *page, struct page **pagepool)
{
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -695,7 +693,7 @@ restart_now:
goto err_out;
/* preload all compressed pages (maybe downgrade role if necessary) */
- if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la))
+ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la))
cache_strategy = TRYALLOC;
else
cache_strategy = DONTALLOC;
@@ -796,7 +794,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
/* Use workqueue and sync decompression for atomic contexts only */
if (in_atomic() || irqs_disabled()) {
queue_work(z_erofs_workqueue, &io->u.work);
- sbi->ctx.readahead_sync_decompress = true;
+ sbi->opt.readahead_sync_decompress = true;
return;
}
z_erofs_decompressqueue_work(&io->u.work);
@@ -836,7 +834,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
static int z_erofs_decompress_pcluster(struct super_block *sb,
struct z_erofs_pcluster *pcl,
- struct list_head *pagepool)
+ struct page **pagepool)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
struct z_erofs_pagevec_ctor ctor;
@@ -1036,7 +1034,7 @@ out:
}
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
- struct list_head *pagepool)
+ struct page **pagepool)
{
z_erofs_next_pcluster_t owned = io->head;
@@ -1060,18 +1058,18 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
{
struct z_erofs_decompressqueue *bgq =
container_of(work, struct z_erofs_decompressqueue, u.work);
- LIST_HEAD(pagepool);
+ struct page *pagepool = NULL;
DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
z_erofs_decompress_queue(bgq, &pagepool);
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
kvfree(bgq);
}
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
unsigned int nr,
- struct list_head *pagepool,
+ struct page **pagepool,
struct address_space *mc,
gfp_t gfp)
{
@@ -1173,7 +1171,7 @@ repeat:
out_allocpage:
page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
- list_add(&page->lru, pagepool);
+ erofs_pagepool_add(pagepool, page);
cond_resched();
goto repeat;
}
@@ -1257,7 +1255,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
static void z_erofs_submit_queue(struct super_block *sb,
struct z_erofs_decompress_frontend *f,
- struct list_head *pagepool,
+ struct page **pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg)
{
@@ -1266,8 +1264,9 @@ static void z_erofs_submit_queue(struct super_block *sb,
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
- /* since bio will be NULL, no need to initialize last_index */
+ /* bio is NULL initially, so no need to initialize last_{index,bdev} */
pgoff_t last_index;
+ struct block_device *last_bdev;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
@@ -1279,6 +1278,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
q[JQ_SUBMIT]->head = owned_head;
do {
+ struct erofs_map_dev mdev;
struct z_erofs_pcluster *pcl;
pgoff_t cur, end;
unsigned int i = 0;
@@ -1290,7 +1290,13 @@ static void z_erofs_submit_queue(struct super_block *sb,
pcl = container_of(owned_head, struct z_erofs_pcluster, next);
- cur = pcl->obj.index;
+ /* no device id here, thus it will always succeed */
+ mdev = (struct erofs_map_dev) {
+ .m_pa = blknr_to_addr(pcl->obj.index),
+ };
+ (void)erofs_map_dev(sb, &mdev);
+
+ cur = erofs_blknr(mdev.m_pa);
end = cur + pcl->pclusterpages;
/* close the main owned chain at first */
@@ -1306,7 +1312,8 @@ static void z_erofs_submit_queue(struct super_block *sb,
if (!page)
continue;
- if (bio && cur != last_index + 1) {
+ if (bio && (cur != last_index + 1 ||
+ last_bdev != mdev.m_bdev)) {
submit_bio_retry:
submit_bio(bio);
bio = NULL;
@@ -1314,9 +1321,10 @@ submit_bio_retry:
if (!bio) {
bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
-
bio->bi_end_io = z_erofs_decompressqueue_endio;
- bio_set_dev(bio, sb->s_bdev);
+
+ bio_set_dev(bio, mdev.m_bdev);
+ last_bdev = mdev.m_bdev;
bio->bi_iter.bi_sector = (sector_t)cur <<
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
@@ -1355,7 +1363,7 @@ submit_bio_retry:
static void z_erofs_runqueue(struct super_block *sb,
struct z_erofs_decompress_frontend *f,
- struct list_head *pagepool, bool force_fg)
+ struct page **pagepool, bool force_fg)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
@@ -1377,18 +1385,87 @@ static void z_erofs_runqueue(struct super_block *sb,
z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}
+/*
+ * Since partial uptodate is still unimplemented, we have to use
+ * approximate readmore strategies as a start.
+ */
+static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+ struct readahead_control *rac,
+ erofs_off_t end,
+ struct page **pagepool,
+ bool backmost)
+{
+ struct inode *inode = f->inode;
+ struct erofs_map_blocks *map = &f->map;
+ erofs_off_t cur;
+ int err;
+
+ if (backmost) {
+ map->m_la = end;
+ err = z_erofs_map_blocks_iter(inode, map,
+ EROFS_GET_BLOCKS_READMORE);
+ if (err)
+ return;
+
+ /* expand ra for the trailing edge if readahead */
+ if (rac) {
+ loff_t newstart = readahead_pos(rac);
+
+ cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
+ readahead_expand(rac, newstart, cur - newstart);
+ return;
+ }
+ end = round_up(end, PAGE_SIZE);
+ } else {
+ end = round_up(map->m_la, PAGE_SIZE);
+
+ if (!map->m_llen)
+ return;
+ }
+
+ cur = map->m_la + map->m_llen - 1;
+ while (cur >= end) {
+ pgoff_t index = cur >> PAGE_SHIFT;
+ struct page *page;
+
+ page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
+ if (!page)
+ goto skip;
+
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto skip;
+ }
+
+ err = z_erofs_do_read_page(f, page, pagepool);
+ if (err)
+ erofs_err(inode->i_sb,
+ "readmore error at page %lu @ nid %llu",
+ index, EROFS_I(inode)->nid);
+ put_page(page);
+skip:
+ if (cur < PAGE_SIZE)
+ break;
+ cur = (index << PAGE_SHIFT) - 1;
+ }
+}
+
static int z_erofs_readpage(struct file *file, struct page *page)
{
struct inode *const inode = page->mapping->host;
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ struct page *pagepool = NULL;
int err;
- LIST_HEAD(pagepool);
trace_erofs_readpage(page, false);
-
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
+ z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
+ &pagepool, true);
err = z_erofs_do_read_page(&f, page, &pagepool);
+ z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
+
(void)z_erofs_collector_end(&f.clt);
/* if some compressed cluster ready, need submit them anyway */
@@ -1400,8 +1477,7 @@ static int z_erofs_readpage(struct file *file, struct page *page)
if (f.map.mpage)
put_page(f.map.mpage);
- /* clean up the remaining free pages */
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
return err;
}
@@ -1409,29 +1485,19 @@ static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
-
- unsigned int nr_pages = readahead_count(rac);
- bool sync = (sbi->ctx.readahead_sync_decompress &&
- nr_pages <= sbi->ctx.max_sync_decompress_pages);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *page, *head = NULL;
- LIST_HEAD(pagepool);
-
- trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ struct page *pagepool = NULL, *head = NULL, *page;
+ unsigned int nr_pages;
f.readahead = true;
f.headoffset = readahead_pos(rac);
- while ((page = readahead_page(rac))) {
- prefetchw(&page->flags);
-
- /*
- * A pure asynchronous readahead is indicated if
- * a PG_readahead marked page is hitted at first.
- * Let's also do asynchronous decompression for this case.
- */
- sync &= !(PageReadahead(page) && !head);
+ z_erofs_pcluster_readmore(&f, rac, f.headoffset +
+ readahead_length(rac) - 1, &pagepool, true);
+ nr_pages = readahead_count(rac);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ while ((page = readahead_page(rac))) {
set_page_private(page, (unsigned long)head);
head = page;
}
@@ -1450,16 +1516,15 @@ static void z_erofs_readahead(struct readahead_control *rac)
page->index, EROFS_I(inode)->nid);
put_page(page);
}
-
+ z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
(void)z_erofs_collector_end(&f.clt);
- z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);
-
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool,
+ sbi->opt.readahead_sync_decompress &&
+ nr_pages <= sbi->opt.max_sync_decompress_pages);
if (f.map.mpage)
put_page(f.map.mpage);
-
- /* clean up the remaining free pages */
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
}
const struct address_space_operations z_erofs_aops = {
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 3a008f1b9f78..879df5362777 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -94,13 +94,6 @@ struct z_erofs_decompressqueue {
} u;
};
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
- struct page *page)
-{
- return page->mapping == MNGD_MAPPING(sbi);
-}
-
#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS)
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 7a6df35fdc91..660489a7fb64 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -28,7 +28,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
- int err;
+ int err, headnr;
erofs_off_t pos;
struct page *page;
void *kaddr;
@@ -68,9 +68,11 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel",
- vi->z_algorithmtype[0], vi->nid);
+ headnr = 0;
+ if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
+ vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
+ erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
+ headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
err = -EOPNOTSUPP;
goto unmap_done;
}
@@ -111,7 +113,7 @@ struct z_erofs_maprecorder {
unsigned long lcn;
/* compression extent information gathered */
- u8 type;
+ u8 type, headtype;
u16 clusterofs;
u16 delta[2];
erofs_blk_t pblk, compressedlcs;
@@ -178,7 +180,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
m->clusterofs = 1 << vi->z_logical_clusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
- if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+ if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+ Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -189,7 +192,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
break;
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
m->clusterofs = le16_to_cpu(di->di_clusterofs);
m->pblk = le32_to_cpu(di->di_u.blkaddr);
break;
@@ -446,9 +450,9 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
}
return z_erofs_extent_lookback(m, m->delta[0]);
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- map->m_flags &= ~EROFS_MAP_ZIPPED;
- fallthrough;
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ m->headtype = m->type;
map->m_la = (lcn << lclusterbits) | m->clusterofs;
break;
default:
@@ -471,13 +475,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
int err;
DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
- m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
- if (!(map->m_flags & EROFS_MAP_ZIPPED) ||
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 &&
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2);
+ DBG_BUGON(m->type != m->headtype);
+
+ if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
+ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
map->m_plen = 1 << lclusterbits;
return 0;
}
-
lcn = m->lcn + 1;
if (m->compressedlcs)
goto out;
@@ -499,7 +508,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
switch (m->type) {
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
/*
* if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
* rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
@@ -554,7 +564,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
DBG_BUGON(!m->delta[1] &&
m->clusterofs != 1 << lclusterbits);
} else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
- m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) {
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 ||
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
/* go on until the next HEAD lcluster */
if (lcn != headlcn)
break;
@@ -609,16 +620,15 @@ int z_erofs_map_blocks_iter(struct inode *inode,
if (err)
goto unmap_out;
- map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */
+ map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
switch (m.type) {
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- if (endoff >= m.clusterofs)
- map->m_flags &= ~EROFS_MAP_ZIPPED;
- fallthrough;
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
if (endoff >= m.clusterofs) {
+ m.headtype = m.type;
map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
break;
}
@@ -650,13 +660,22 @@ int z_erofs_map_blocks_iter(struct inode *inode,
map->m_llen = end - map->m_la;
map->m_pa = blknr_to_addr(m.pblk);
- map->m_flags |= EROFS_MAP_MAPPED;
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
goto out;
- if (flags & EROFS_GET_BLOCKS_FIEMAP) {
+ if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN)
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+ else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2)
+ map->m_algorithmformat = vi->z_algorithmtype[1];
+ else
+ map->m_algorithmformat = vi->z_algorithmtype[0];
+
+ if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
+ ((flags & EROFS_GET_BLOCKS_READMORE) &&
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
+ map->m_llen >= EROFS_BLKSIZ)) {
err = z_erofs_get_extent_decompressedlen(&m);
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
diff --git a/include/linux/xz.h b/include/linux/xz.h
index 9884c8440188..7285ca5d56e9 100644
--- a/include/linux/xz.h
+++ b/include/linux/xz.h
@@ -234,6 +234,112 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s);
XZ_EXTERN void xz_dec_end(struct xz_dec *s);
/*
+ * Decompressor for MicroLZMA, an LZMA variant with a very minimal header.
+ * See xz_dec_microlzma_alloc() below for details.
+ *
+ * These functions aren't used or available in preboot code and thus aren't
+ * marked with XZ_EXTERN. This avoids warnings about static functions that
+ * are never defined.
+ */
+/**
+ * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state
+ */
+struct xz_dec_microlzma;
+
+/**
+ * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder
+ * @mode: XZ_SINGLE or XZ_PREALLOC
+ * @dict_size: LZMA dictionary size. This must be at least 4 KiB and
+ * at most 3 GiB.
+ *
+ * In contrast to xz_dec_init(), this function only allocates the memory
+ * and remembers the dictionary size. xz_dec_microlzma_reset() must be used
+ * before calling xz_dec_microlzma_run().
+ *
+ * The amount of allocated memory is a little less than 30 KiB with XZ_SINGLE.
+ * With XZ_PREALLOC, a dictionary buffer of dict_size bytes is also allocated.
+ *
+ * On success, xz_dec_microlzma_alloc() returns a pointer to
+ * struct xz_dec_microlzma. If memory allocation fails or
+ * dict_size is invalid, NULL is returned.
+ *
+ * The compressed format supported by this decoder is a raw LZMA stream
+ * whose first byte (always 0x00) has been replaced with the bitwise negation
+ * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is
+ * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00.
+ * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream
+ * marker must not be used. The unused values are reserved for future use.
+ * This MicroLZMA header format was created for use in EROFS but may be used
+ * by others too.
+ */
+extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
+ uint32_t dict_size);
+
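For reference, the 0xA2 example above follows from the standard LZMA properties encoding; the sketch below is only an illustration (the helper name is not part of this patch), assuming the conventional props = (pb * 5 + lp) * 9 + lc layout of the properties byte:

static inline uint8_t microlzma_first_byte(uint32_t lc, uint32_t lp,
					   uint32_t pb)
{
	/* standard LZMA properties byte: (pb * 5 + lp) * 9 + lc */
	uint8_t props = (uint8_t)((pb * 5 + lp) * 9 + lc);

	/* MicroLZMA stores its bitwise negation, e.g. lc/lp/pb = 3/0/2 -> 0xA2 */
	return (uint8_t)~props;
}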
+/**
+ * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state
+ * @s: Decoder state allocated using xz_dec_microlzma_alloc()
+ * @comp_size: Compressed size of the input stream
+ * @uncomp_size: Uncompressed size of the input stream. A value smaller
+ * than the real uncompressed size of the input stream can
+ * be specified if uncomp_size_is_exact is set to false.
+ * uncomp_size must not be set to a value larger than the
+ * expected real uncompressed size because it would eventually
+ * result in XZ_DATA_ERROR.
+ * @uncomp_size_is_exact: This is an int instead of bool to avoid
+ * requiring stdbool.h. This should normally be set to true.
+ * When this is set to false, error detection is weaker.
+ */
+extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s,
+ uint32_t comp_size, uint32_t uncomp_size,
+ int uncomp_size_is_exact);
+
+/**
+ * xz_dec_microlzma_run() - Run the MicroLZMA decoder
+ * @s: Decoder state initialized using xz_dec_microlzma_reset()
+ * @b: Input and output buffers
+ *
+ * This works similarly to xz_dec_run() with a few important differences.
+ * Only the differences are documented here.
+ *
+ * The only possible return values are XZ_OK, XZ_STREAM_END, and
+ * XZ_DATA_ERROR. This function cannot return XZ_BUF_ERROR: if no progress
+ * is possible due to lack of input data or output space, this function will
+ * keep returning XZ_OK. Thus, the calling code must be written so that it
+ * will eventually provide input and output space matching (or exceeding)
+ * comp_size and uncomp_size arguments given to xz_dec_microlzma_reset().
+ * If the caller cannot do this (for example, if the input file is truncated
+ * or otherwise corrupt), the caller must detect this error by itself to
+ * avoid an infinite loop.
+ *
+ * If the compressed data seems to be corrupt, XZ_DATA_ERROR is returned.
+ * This can also happen when incorrect dictionary, uncompressed, or
+ * compressed sizes have been specified.
+ *
+ * With XZ_PREALLOC only: As an extra feature, b->out may be NULL to skip over
+ * uncompressed data. This way the caller doesn't need to provide a temporary
+ * output buffer for the bytes that will be ignored.
+ *
+ * With XZ_SINGLE only: In contrast to xz_dec_run(), the return value XZ_OK
+ * is also possible and thus XZ_SINGLE is actually a limited multi-call mode.
+ * After XZ_OK the bytes decoded so far may be read from the output buffer.
+ * It is possible to continue decoding but the variables b->out and b->out_pos
+ * MUST NOT be changed by the caller. Increasing the value of b->out_size is
+ * allowed to make more output space available; one doesn't need to provide
+ * space for the whole uncompressed data on the first call. The input buffer
+ * may be changed normally like with XZ_PREALLOC. This way input data can be
+ * provided from non-contiguous memory.
+ */
+extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s,
+ struct xz_buf *b);
+
+/**
+ * xz_dec_microlzma_end() - Free the memory allocated for the decoder state
+ * @s: Decoder state allocated using xz_dec_microlzma_alloc().
+ * If s is NULL, this function does nothing.
+ */
+extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s);
+
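A hedged usage sketch of the API declared above, in XZ_SINGLE mode: the function name, the 64 KiB dictionary size and the error handling are illustrative assumptions rather than anything taken from this patch, and the dictionary size must still cover whatever the encoder actually used.

#include <linux/errno.h>
#include <linux/xz.h>

static int microlzma_decompress_once(const uint8_t *in, size_t in_size,
				     uint8_t *out, size_t out_size)
{
	struct xz_dec_microlzma *s;
	struct xz_buf b = {
		.in = in, .in_pos = 0, .in_size = in_size,
		.out = out, .out_pos = 0, .out_size = out_size,
	};
	enum xz_ret ret;

	/* dictionary size is an assumption; it must match the encoder's */
	s = xz_dec_microlzma_alloc(XZ_SINGLE, 64 * 1024);
	if (!s)
		return -ENOMEM;

	/* sizes must describe this exact stream; see the notes above */
	xz_dec_microlzma_reset(s, (uint32_t)in_size, (uint32_t)out_size, true);
	ret = xz_dec_microlzma_run(s, &b);
	xz_dec_microlzma_end(s);

	return ret == XZ_STREAM_END ? 0 : -EINVAL;
}

With XZ_PREALLOC the same call sequence applies, except that the dictionary buffer is allocated internally and b.out may be NULL to skip over unwanted output, as described in the xz_dec_microlzma_run() documentation above.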
+/*
* Standalone build (userspace build or in-kernel build for boot time use)
* needs a CRC32 implementation. For normal in-kernel use, kernel's own
* CRC32 module is used instead, and users of this module don't need to
diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h
index db4f2cec8360..16ae7b666810 100644
--- a/include/trace/events/erofs.h
+++ b/include/trace/events/erofs.h
@@ -24,7 +24,7 @@ struct erofs_map_blocks;
#define show_mflags(flags) __print_flags(flags, "", \
{ EROFS_MAP_MAPPED, "M" }, \
{ EROFS_MAP_META, "I" }, \
- { EROFS_MAP_ZIPPED, "Z" })
+ { EROFS_MAP_ENCODED, "E" })
TRACE_EVENT(erofs_lookup,
diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c
index a2f38e23004a..9f4262ee33a5 100644
--- a/lib/decompress_unxz.c
+++ b/lib/decompress_unxz.c
@@ -20,8 +20,8 @@
*
* The worst case for in-place decompression is that the beginning of
* the file is compressed extremely well, and the rest of the file is
- * uncompressible. Thus, we must look for worst-case expansion when the
- * compressor is encoding uncompressible data.
+ * incompressible. Thus, we must look for worst-case expansion when the
+ * compressor is encoding incompressible data.
*
* The structure of the .xz file in case of a compressed kernel is as follows.
* Sizes (as bytes) of the fields are in parenthesis.
@@ -58,7 +58,7 @@
* uncompressed size of the payload is in practice never less than the
* payload size itself. The LZMA2 format would allow uncompressed size
* to be less than the payload size, but no sane compressor creates such
- * files. LZMA2 supports storing uncompressible data in uncompressed form,
+ * files. LZMA2 supports storing incompressible data in uncompressed form,
* so there's never a need to create payloads whose uncompressed size is
* smaller than the compressed size.
*
@@ -167,8 +167,8 @@
* memeq and memzero are not used much and any remotely sane implementation
* is fast enough. memcpy/memmove speed matters in multi-call mode, but
* the kernel image is decompressed in single-call mode, in which only
- * memcpy speed can matter and only if there is a lot of uncompressible data
- * (LZMA2 stores uncompressible chunks in uncompressed form). Thus, the
+ * memmove speed can matter and only if there is a lot of incompressible data
+ * (LZMA2 stores incompressible chunks in uncompressed form). Thus, the
* functions below should just be kept small; it's probably not worth
* optimizing for speed.
*/
diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig
index 5cb50245a878..adce22ac18d6 100644
--- a/lib/xz/Kconfig
+++ b/lib/xz/Kconfig
@@ -39,6 +39,19 @@ config XZ_DEC_SPARC
default y
select XZ_DEC_BCJ
+config XZ_DEC_MICROLZMA
+ bool "MicroLZMA decoder"
+ default n
+ help
+ MicroLZMA is a header format variant where the first byte
+ of a raw LZMA stream (without the end-of-stream marker) has
+ been replaced with the bitwise negation of the lc/lp/pb
+ properties byte. MicroLZMA was created for use in EROFS but
+ can also be used elsewhere when keeping header overhead to a
+ minimum is important.
+
+ Unless you know that you need this, say N.
+
endif
config XZ_DEC_BCJ
diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c
index 7a6781e3f47b..27ce34520e78 100644
--- a/lib/xz/xz_dec_lzma2.c
+++ b/lib/xz/xz_dec_lzma2.c
@@ -248,6 +248,10 @@ struct lzma2_dec {
* before the first LZMA chunk.
*/
bool need_props;
+
+#ifdef XZ_DEC_MICROLZMA
+ bool pedantic_microlzma;
+#endif
};
struct xz_dec_lzma2 {
@@ -387,7 +391,14 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b,
*left -= copy_size;
- memcpy(dict->buf + dict->pos, b->in + b->in_pos, copy_size);
+ /*
+ * If doing in-place decompression in single-call mode and the
+ * uncompressed size of the file is larger than the caller
+ * thought (i.e. it is invalid input!), the buffers below may
+ * overlap and cause undefined behavior with memcpy().
+ * With valid inputs memcpy() would be fine here.
+ */
+ memmove(dict->buf + dict->pos, b->in + b->in_pos, copy_size);
dict->pos += copy_size;
if (dict->full < dict->pos)
@@ -397,7 +408,11 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b,
if (dict->pos == dict->end)
dict->pos = 0;
- memcpy(b->out + b->out_pos, b->in + b->in_pos,
+ /*
+ * Like above but for multi-call mode: use memmove()
+ * to avoid undefined behavior with invalid input.
+ */
+ memmove(b->out + b->out_pos, b->in + b->in_pos,
copy_size);
}
@@ -408,6 +423,12 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b,
}
}
+#ifdef XZ_DEC_MICROLZMA
+# define DICT_FLUSH_SUPPORTS_SKIPPING true
+#else
+# define DICT_FLUSH_SUPPORTS_SKIPPING false
+#endif
+
/*
* Flush pending data from dictionary to b->out. It is assumed that there is
* enough space in b->out. This is guaranteed because caller uses dict_limit()
@@ -421,8 +442,19 @@ static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b)
if (dict->pos == dict->end)
dict->pos = 0;
- memcpy(b->out + b->out_pos, dict->buf + dict->start,
- copy_size);
+ /*
+ * These buffers cannot overlap even if doing in-place
+ * decompression because in multi-call mode dict->buf
+ * has been allocated by us in this file; it's not
+ * provided by the caller like in single-call mode.
+ *
+ * With MicroLZMA, b->out can be NULL to skip bytes that
+ * the caller doesn't need. This cannot be done with XZ
+ * because it would break BCJ filters.
+ */
+ if (!DICT_FLUSH_SUPPORTS_SKIPPING || b->out != NULL)
+ memcpy(b->out + b->out_pos, dict->buf + dict->start,
+ copy_size);
}
dict->start = dict->pos;
@@ -488,7 +520,7 @@ static __always_inline void rc_normalize(struct rc_dec *rc)
* functions so that the compiler is supposed to be able to more easily avoid
* an extra branch. In this particular version of the LZMA decoder, this
* doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3
- * on x86). Using a non-splitted version results in nicer looking code too.
+ * on x86). Using a non-split version results in nicer looking code too.
*
* NOTE: This must return an int. Do not make it return a bool or the speed
* of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care,
@@ -774,6 +806,7 @@ static void lzma_reset(struct xz_dec_lzma2 *s)
s->lzma.rep1 = 0;
s->lzma.rep2 = 0;
s->lzma.rep3 = 0;
+ s->lzma.len = 0;
/*
* All probabilities are initialized to the same value. This hack
@@ -1157,8 +1190,6 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props)
}
}
- s->lzma.len = 0;
-
s->lzma2.sequence = SEQ_CONTROL;
s->lzma2.need_dict_reset = true;
@@ -1174,3 +1205,140 @@ XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s)
kfree(s);
}
+
+#ifdef XZ_DEC_MICROLZMA
+/* This is a wrapper struct to have a nice struct name in the public API. */
+struct xz_dec_microlzma {
+ struct xz_dec_lzma2 s;
+};
+
+enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s_ptr,
+ struct xz_buf *b)
+{
+ struct xz_dec_lzma2 *s = &s_ptr->s;
+
+ /*
+ * sequence is SEQ_PROPERTIES before the first input byte,
+ * SEQ_LZMA_PREPARE until a total of five bytes have been read,
+ * and SEQ_LZMA_RUN for the rest of the input stream.
+ */
+ if (s->lzma2.sequence != SEQ_LZMA_RUN) {
+ if (s->lzma2.sequence == SEQ_PROPERTIES) {
+ /* One byte is needed for the props. */
+ if (b->in_pos >= b->in_size)
+ return XZ_OK;
+
+ /*
+ * Don't increment b->in_pos here. The same byte is
+ * also passed to rc_read_init() which will ignore it.
+ */
+ if (!lzma_props(s, ~b->in[b->in_pos]))
+ return XZ_DATA_ERROR;
+
+ s->lzma2.sequence = SEQ_LZMA_PREPARE;
+ }
+
+ /*
+ * xz_dec_microlzma_reset() doesn't validate the compressed
+ * size so we do it here. We have to limit the maximum size
+ * to avoid integer overflows in lzma2_lzma(). 3 GiB is a nice
+ * round number and much more than users of this code should
+ * ever need.
+ */
+ if (s->lzma2.compressed < RC_INIT_BYTES
+ || s->lzma2.compressed > (3U << 30))
+ return XZ_DATA_ERROR;
+
+ if (!rc_read_init(&s->rc, b))
+ return XZ_OK;
+
+ s->lzma2.compressed -= RC_INIT_BYTES;
+ s->lzma2.sequence = SEQ_LZMA_RUN;
+
+ dict_reset(&s->dict, b);
+ }
+
+ /* This is to allow increasing b->out_size between calls. */
+ if (DEC_IS_SINGLE(s->dict.mode))
+ s->dict.end = b->out_size - b->out_pos;
+
+ while (true) {
+ dict_limit(&s->dict, min_t(size_t, b->out_size - b->out_pos,
+ s->lzma2.uncompressed));
+
+ if (!lzma2_lzma(s, b))
+ return XZ_DATA_ERROR;
+
+ s->lzma2.uncompressed -= dict_flush(&s->dict, b);
+
+ if (s->lzma2.uncompressed == 0) {
+ if (s->lzma2.pedantic_microlzma) {
+ if (s->lzma2.compressed > 0 || s->lzma.len > 0
+ || !rc_is_finished(&s->rc))
+ return XZ_DATA_ERROR;
+ }
+
+ return XZ_STREAM_END;
+ }
+
+ if (b->out_pos == b->out_size)
+ return XZ_OK;
+
+ if (b->in_pos == b->in_size
+ && s->temp.size < s->lzma2.compressed)
+ return XZ_OK;
+ }
+}
+
+struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
+ uint32_t dict_size)
+{
+ struct xz_dec_microlzma *s;
+
+ /* Restrict dict_size to the same range as in the LZMA2 code. */
+ if (dict_size < 4096 || dict_size > (3U << 30))
+ return NULL;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (s == NULL)
+ return NULL;
+
+ s->s.dict.mode = mode;
+ s->s.dict.size = dict_size;
+
+ if (DEC_IS_MULTI(mode)) {
+ s->s.dict.end = dict_size;
+
+ s->s.dict.buf = vmalloc(dict_size);
+ if (s->s.dict.buf == NULL) {
+ kfree(s);
+ return NULL;
+ }
+ }
+
+ return s;
+}
+
+void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size,
+ uint32_t uncomp_size, int uncomp_size_is_exact)
+{
+ /*
+ * comp_size is validated in xz_dec_microlzma_run().
+ * uncomp_size can safely be anything.
+ */
+ s->s.lzma2.compressed = comp_size;
+ s->s.lzma2.uncompressed = uncomp_size;
+ s->s.lzma2.pedantic_microlzma = uncomp_size_is_exact;
+
+ s->s.lzma2.sequence = SEQ_PROPERTIES;
+ s->s.temp.size = 0;
+}
+
+void xz_dec_microlzma_end(struct xz_dec_microlzma *s)
+{
+ if (DEC_IS_MULTI(s->s.dict.mode))
+ vfree(s->s.dict.buf);
+
+ kfree(s);
+}
+#endif
diff --git a/lib/xz/xz_dec_stream.c b/lib/xz/xz_dec_stream.c
index fea86deaaa01..683570b93a8c 100644
--- a/lib/xz/xz_dec_stream.c
+++ b/lib/xz/xz_dec_stream.c
@@ -402,12 +402,12 @@ static enum xz_ret dec_stream_header(struct xz_dec *s)
* we will accept other check types too, but then the check won't
* be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given.
*/
+ if (s->temp.buf[HEADER_MAGIC_SIZE + 1] > XZ_CHECK_MAX)
+ return XZ_OPTIONS_ERROR;
+
s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1];
#ifdef XZ_DEC_ANY_CHECK
- if (s->check_type > XZ_CHECK_MAX)
- return XZ_OPTIONS_ERROR;
-
if (s->check_type > XZ_CHECK_CRC32)
return XZ_UNSUPPORTED_CHECK;
#else
diff --git a/lib/xz/xz_dec_syms.c b/lib/xz/xz_dec_syms.c
index 32eb3c03aede..61098c67a413 100644
--- a/lib/xz/xz_dec_syms.c
+++ b/lib/xz/xz_dec_syms.c
@@ -15,8 +15,15 @@ EXPORT_SYMBOL(xz_dec_reset);
EXPORT_SYMBOL(xz_dec_run);
EXPORT_SYMBOL(xz_dec_end);
+#ifdef CONFIG_XZ_DEC_MICROLZMA
+EXPORT_SYMBOL(xz_dec_microlzma_alloc);
+EXPORT_SYMBOL(xz_dec_microlzma_reset);
+EXPORT_SYMBOL(xz_dec_microlzma_run);
+EXPORT_SYMBOL(xz_dec_microlzma_end);
+#endif
+
MODULE_DESCRIPTION("XZ decompressor");
-MODULE_VERSION("1.0");
+MODULE_VERSION("1.1");
MODULE_AUTHOR("Lasse Collin <lasse.collin@tukaani.org> and Igor Pavlov");
/*
diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h
index 09360ebb510e..bf1e94ec7873 100644
--- a/lib/xz/xz_private.h
+++ b/lib/xz/xz_private.h
@@ -37,6 +37,9 @@
# ifdef CONFIG_XZ_DEC_SPARC
# define XZ_DEC_SPARC
# endif
+# ifdef CONFIG_XZ_DEC_MICROLZMA
+# define XZ_DEC_MICROLZMA
+# endif
# define memeq(a, b, size) (memcmp(a, b, size) == 0)
# define memzero(buf, size) memset(buf, 0, size)
# endif