Diffstat (limited to 'drivers/staging/erofs')
-rw-r--r--  drivers/staging/erofs/Kconfig                       |  141
-rw-r--r--  drivers/staging/erofs/Makefile                      |   13
-rw-r--r--  drivers/staging/erofs/TODO                          |   45
-rw-r--r--  drivers/staging/erofs/data.c                        |  385
-rw-r--r--  drivers/staging/erofs/dir.c                         |  145
-rw-r--r--  drivers/staging/erofs/erofs_fs.h                    |  266
-rw-r--r--  drivers/staging/erofs/include/linux/tagptr.h        |  110
-rw-r--r--  drivers/staging/erofs/include/trace/events/erofs.h  |  240
-rw-r--r--  drivers/staging/erofs/inode.c                       |  283
-rw-r--r--  drivers/staging/erofs/internal.h                    |  556
-rw-r--r--  drivers/staging/erofs/lz4defs.h                     |  227
-rw-r--r--  drivers/staging/erofs/namei.c                       |  251
-rw-r--r--  drivers/staging/erofs/super.c                       |  649
-rw-r--r--  drivers/staging/erofs/unzip_lz4.c                   |  251
-rw-r--r--  drivers/staging/erofs/unzip_pagevec.h               |  172
-rw-r--r--  drivers/staging/erofs/unzip_vle.c                   | 1656
-rw-r--r--  drivers/staging/erofs/unzip_vle.h                   |  239
-rw-r--r--  drivers/staging/erofs/unzip_vle_lz4.c               |  209
-rw-r--r--  drivers/staging/erofs/utils.c                       |  271
-rw-r--r--  drivers/staging/erofs/xattr.c                       |  577
-rw-r--r--  drivers/staging/erofs/xattr.h                       |   93
21 files changed, 6779 insertions, 0 deletions
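For reviewers skimming the diffstat, the core of the uncompressed read path added below boils down to two address computations: internal.h maps an inode number (nid) to its on-disk byte offset relative to meta_blkaddr via iloc()/blknr_to_addr(), and data.c's erofs_map_blocks_flatmode() maps a logical file offset to a physical address starting at the inode's raw_blkaddr. The stand-alone sketch below mirrors those formulas outside the kernel; the concrete constants (4KiB blocks, 32-byte v1 inode slots) are illustrative assumptions, since the real values are read from the superblock in super.c, which is not reproduced in this excerpt.

/*
 * Illustrative sketch only -- not part of the patch. It mirrors the
 * address arithmetic of iloc()/blknr_to_addr() in internal.h and
 * erofs_map_blocks_flatmode() in data.c under assumed parameters:
 * 4KiB blocks (LOG_BLOCK_SIZE = 12) and 32-byte v1 inode slots
 * (islotbits = 5).
 */
#include <stdint.h>
#include <stdio.h>

#define LOG_BLOCK_SIZE	12			/* assumed: block size == 4KiB page */
#define EROFS_BLKSIZ	(1ULL << LOG_BLOCK_SIZE)

static uint64_t blknr_to_addr(uint32_t nr)
{
	return (uint64_t)nr * EROFS_BLKSIZ;	/* block number -> byte address */
}

/* byte offset of the on-disk inode @nid, cf. iloc() in internal.h */
static uint64_t iloc(uint32_t meta_blkaddr, uint64_t nid, unsigned int islotbits)
{
	return blknr_to_addr(meta_blkaddr) + (nid << islotbits);
}

/* flat-mode data mapping, cf. erofs_map_blocks_flatmode() in data.c */
static uint64_t flatmode_pa(uint32_t raw_blkaddr, uint64_t file_offset)
{
	return blknr_to_addr(raw_blkaddr) + file_offset;
}

int main(void)
{
	/* hypothetical layout: metadata area at block 2, file data at block 100 */
	printf("inode nid 16 lives at byte %llu\n",
	       (unsigned long long)iloc(2, 16, 5));
	printf("file offset 8192 maps to byte %llu\n",
	       (unsigned long long)flatmode_pa(100, 8192));
	return 0;
}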
diff --git a/drivers/staging/erofs/Kconfig b/drivers/staging/erofs/Kconfig new file mode 100644 index 000000000000..96f614934df1 --- /dev/null +++ b/drivers/staging/erofs/Kconfig @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: GPL-2.0 + +config EROFS_FS + tristate "EROFS filesystem support" + depends on BROKEN + help + EROFS(Enhanced Read-Only File System) is a lightweight + read-only file system with modern designs (eg. page-sized + blocks, inline xattrs/data, etc.) for scenarios which need + high-performance read-only requirements, eg. firmwares in + mobile phone or LIVECDs. + + It also provides VLE compression support, focusing on + random read improvements, keeping relatively lower + compression ratios, which is useful for high-performance + devices with limited memory and ROM space. + + If unsure, say N. + +config EROFS_FS_DEBUG + bool "EROFS debugging feature" + depends on EROFS_FS + help + Print EROFS debugging messages and enable more BUG_ONs + which check the filesystem consistency aggressively. + + For daily use, say N. + +config EROFS_FS_XATTR + bool "EROFS extended attributes" + depends on EROFS_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config EROFS_FS_POSIX_ACL + bool "EROFS Access Control Lists" + depends on EROFS_FS_XATTR + select FS_POSIX_ACL + default y + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N. + +config EROFS_FS_SECURITY + bool "EROFS Security Labels" + depends on EROFS_FS_XATTR + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the erofs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. + +config EROFS_FS_USE_VM_MAP_RAM + bool "EROFS VM_MAP_RAM Support" + depends on EROFS_FS + help + use vm_map_ram/vm_unmap_ram instead of vmap/vunmap. + + If you don't know what these are, say N. + +config EROFS_FAULT_INJECTION + bool "EROFS fault injection facility" + depends on EROFS_FS + help + Test EROFS to inject faults such as ENOMEM, EIO, and so on. + If unsure, say N. + +config EROFS_FS_ZIP + bool "EROFS Data Compresssion Support" + depends on EROFS_FS + help + Currently we support VLE Compression only. + Play at your own risk. + + If you don't want to use compression feature, say N. + +config EROFS_FS_CLUSTER_PAGE_LIMIT + int "EROFS Cluster Pages Hard Limit" + depends on EROFS_FS_ZIP + range 1 256 + default "1" + help + Indicates VLE compressed pages hard limit of a + compressed cluster. + + For example, if files of a image are compressed + into 8k-unit, the hard limit should not be less + than 2. Otherwise, the image cannot be mounted + correctly on this kernel. + +choice + prompt "EROFS VLE Data Decompression mode" + depends on EROFS_FS_ZIP + default EROFS_FS_ZIP_CACHE_BIPOLAR + help + EROFS supports three options for VLE decompression. + "In-place Decompression Only" consumes the minimum memory + with lowest random read. 
+ + "Bipolar Cached Decompression" consumes the maximum memory + with highest random read. + + If unsure, select "Bipolar Cached Decompression" + +config EROFS_FS_ZIP_NO_CACHE + bool "In-place Decompression Only" + help + Read compressed data into page cache and do in-place + decompression directly. + +config EROFS_FS_ZIP_CACHE_UNIPOLAR + bool "Unipolar Cached Decompression" + help + For each request, it caches the last compressed page + for further reading. + It still decompresses in place for the rest compressed pages. + +config EROFS_FS_ZIP_CACHE_BIPOLAR + bool "Bipolar Cached Decompression" + help + For each request, it caches the both end compressed pages + for further reading. + It still decompresses in place for the rest compressed pages. + + Recommended for performance priority. + +endchoice + diff --git a/drivers/staging/erofs/Makefile b/drivers/staging/erofs/Makefile new file mode 100644 index 000000000000..9a766eb7ed75 --- /dev/null +++ b/drivers/staging/erofs/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 + +EROFS_VERSION = "1.0pre1" + +ccflags-y += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\" + +obj-$(CONFIG_EROFS_FS) += erofs.o +# staging requirement: to be self-contained in its own directory +ccflags-y += -I$(src)/include +erofs-objs := super.o inode.o data.o namei.o dir.o utils.o +erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o +erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_lz4.o unzip_vle_lz4.o + diff --git a/drivers/staging/erofs/TODO b/drivers/staging/erofs/TODO new file mode 100644 index 000000000000..f99ddb842f99 --- /dev/null +++ b/drivers/staging/erofs/TODO @@ -0,0 +1,45 @@ + +EROFS is still working in progress, thus it is not suitable +for all productive uses. play at your own risk :) + +TODO List: + - add the missing error handling code + (mainly existed in xattr and decompression submodules); + + - finalize erofs ondisk format design (which means that + minor on-disk revisions could happen later); + + - documentation and detailed technical analysis; + + - general code review and clean up + (including confusing variable names and code snippets); + + - support larger compressed clustersizes for selection + (currently erofs only works as expected with the page-sized + compressed cluster configuration, usually 4KB); + + - support more lossless data compression algorithms + in addition to LZ4 algorithms in VLE approach; + + - data deduplication and other useful features. + +erofs-mkfs (preview version) binaries for i386 / x86_64 are available at: + + https://github.com/hsiangkao/erofs_mkfs_binary + +It is still in progress opening mkfs source code to public, +in any case an open-source mkfs will be released in the near future. + + +Code, suggestions, etc, are welcome. Please feel free to +ask and send patches, + +To: + linux-erofs mailing list <linux-erofs@lists.ozlabs.org> + Gao Xiang <gaoxiang25@huawei.com> + Chao Yu <yuchao0@huawei.com> + +Cc: (for linux-kernel upstream patches) + Greg Kroah-Hartman <gregkh@linuxfoundation.org> + linux-staging mailing list <devel@driverdev.osuosl.org> + diff --git a/drivers/staging/erofs/data.c b/drivers/staging/erofs/data.c new file mode 100644 index 000000000000..ac263a180253 --- /dev/null +++ b/drivers/staging/erofs/data.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/data.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#include "internal.h" +#include <linux/prefetch.h> + +#include <trace/events/erofs.h> + +static inline void read_endio(struct bio *bio) +{ + int i; + struct bio_vec *bvec; + const blk_status_t err = bio->bi_status; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + /* page is already locked */ + BUG_ON(PageUptodate(page)); + + if (unlikely(err)) + SetPageError(page); + else + SetPageUptodate(page); + + unlock_page(page); + /* page could be reclaimed now */ + } + bio_put(bio); +} + +/* prio -- true is used for dir */ +struct page *erofs_get_meta_page(struct super_block *sb, + erofs_blk_t blkaddr, bool prio) +{ + struct inode *bd_inode = sb->s_bdev->bd_inode; + struct address_space *mapping = bd_inode->i_mapping; + struct page *page; + +repeat: + page = find_or_create_page(mapping, blkaddr, + /* + * Prefer looping in the allocator rather than here, + * at least that code knows what it's doing. + */ + mapping_gfp_constraint(mapping, ~__GFP_FS) | __GFP_NOFAIL); + + BUG_ON(!page || !PageLocked(page)); + + if (!PageUptodate(page)) { + struct bio *bio; + int err; + + bio = prepare_bio(sb, blkaddr, 1, read_endio); + err = bio_add_page(bio, page, PAGE_SIZE, 0); + BUG_ON(err != PAGE_SIZE); + + __submit_bio(bio, REQ_OP_READ, + REQ_META | (prio ? REQ_PRIO : 0)); + + lock_page(page); + + /* the page has been truncated by others? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto repeat; + } + + /* more likely a read error */ + if (unlikely(!PageUptodate(page))) { + unlock_page(page); + put_page(page); + + page = ERR_PTR(-EIO); + } + } + return page; +} + +static int erofs_map_blocks_flatmode(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + erofs_blk_t nblocks, lastblk; + u64 offset = map->m_la; + struct erofs_vnode *vi = EROFS_V(inode); + + trace_erofs_map_blocks_flatmode_enter(inode, map, flags); + BUG_ON(is_inode_layout_compression(inode)); + + nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); + lastblk = nblocks - is_inode_layout_inline(inode); + + if (unlikely(offset >= inode->i_size)) { + /* leave out-of-bound access unmapped */ + map->m_flags = 0; + map->m_plen = 0; + goto out; + } + + /* there is no hole in flatmode */ + map->m_flags = EROFS_MAP_MAPPED; + + if (offset < blknr_to_addr(lastblk)) { + map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; + map->m_plen = blknr_to_addr(lastblk) - offset; + } else if (is_inode_layout_inline(inode)) { + /* 2 - inode inline B: inode, [xattrs], inline last blk... 
*/ + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + + map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(map->m_la); + map->m_plen = inode->i_size - offset; + + /* inline data should locate in one meta block */ + BUG_ON(erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE); + map->m_flags |= EROFS_MAP_META; + } else { + errln("internal error @ nid: %llu (size %llu), m_la 0x%llx", + vi->nid, inode->i_size, map->m_la); + BUG(); + } + +out: + map->m_llen = map->m_plen; + trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); + return 0; +} + +#ifdef CONFIG_EROFS_FS_ZIP +extern int z_erofs_map_blocks_iter(struct inode *, + struct erofs_map_blocks *, struct page **, int); +#endif + +int erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + struct page **mpage_ret, int flags) +{ + /* by default, reading raw data never use erofs_map_blocks_iter */ + if (unlikely(!is_inode_layout_compression(inode))) { + if (*mpage_ret != NULL) + put_page(*mpage_ret); + *mpage_ret = NULL; + + return erofs_map_blocks(inode, map, flags); + } + +#ifdef CONFIG_EROFS_FS_ZIP + return z_erofs_map_blocks_iter(inode, map, mpage_ret, flags); +#else + /* data compression is not available */ + return -ENOTSUPP; +#endif +} + +int erofs_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, int flags) +{ + if (unlikely(is_inode_layout_compression(inode))) { + struct page *mpage = NULL; + int err; + + err = erofs_map_blocks_iter(inode, map, &mpage, flags); + if (mpage != NULL) + put_page(mpage); + return err; + } + return erofs_map_blocks_flatmode(inode, map, flags); +} + +static inline struct bio *erofs_read_raw_page( + struct bio *bio, + struct address_space *mapping, + struct page *page, + erofs_off_t *last_block, + unsigned nblocks, + bool ra) +{ + struct inode *inode = mapping->host; + erofs_off_t current_block = (erofs_off_t)page->index; + int err; + + BUG_ON(!nblocks); + + if (PageUptodate(page)) { + err = 0; + goto has_updated; + } + + if (cleancache_get_page(page) == 0) { + err = 0; + SetPageUptodate(page); + goto has_updated; + } + + /* note that for readpage case, bio also equals to NULL */ + if (bio != NULL && + /* not continuous */ + *last_block + 1 != current_block) { +submit_bio_retry: + __submit_bio(bio, REQ_OP_READ, 0); + bio = NULL; + } + + if (bio == NULL) { + struct erofs_map_blocks map = { + .m_la = blknr_to_addr(current_block), + }; + erofs_blk_t blknr; + unsigned blkoff; + + err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (unlikely(err)) + goto err_out; + + /* zero out the holed page */ + if (unlikely(!(map.m_flags & EROFS_MAP_MAPPED))) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + + /* imply err = 0, see erofs_map_blocks */ + goto has_updated; + } + + /* for RAW access mode, m_plen must be equal to m_llen */ + BUG_ON(map.m_plen != map.m_llen); + + blknr = erofs_blknr(map.m_pa); + blkoff = erofs_blkoff(map.m_pa); + + /* deal with inline page */ + if (map.m_flags & EROFS_MAP_META) { + void *vsrc, *vto; + struct page *ipage; + + BUG_ON(map.m_plen > PAGE_SIZE); + + ipage = erofs_get_meta_page(inode->i_sb, blknr, 0); + + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto err_out; + } + + vsrc = kmap_atomic(ipage); + vto = kmap_atomic(page); + memcpy(vto, vsrc + blkoff, map.m_plen); + memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen); + kunmap_atomic(vto); + kunmap_atomic(vsrc); + flush_dcache_page(page); + + SetPageUptodate(page); + /* TODO: could we unlock the page earlier? 
*/ + unlock_page(ipage); + put_page(ipage); + + /* imply err = 0, see erofs_map_blocks */ + goto has_updated; + } + + /* pa must be block-aligned for raw reading */ + BUG_ON(erofs_blkoff(map.m_pa) != 0); + + /* max # of continuous pages */ + if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) + nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); + if (nblocks > BIO_MAX_PAGES) + nblocks = BIO_MAX_PAGES; + + bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio); + } + + err = bio_add_page(bio, page, PAGE_SIZE, 0); + /* out of the extent or bio is full */ + if (err < PAGE_SIZE) + goto submit_bio_retry; + + *last_block = current_block; + + /* shift in advance in case of it followed by too many gaps */ + if (unlikely(bio->bi_vcnt >= bio->bi_max_vecs)) { + /* err should reassign to 0 after submitting */ + err = 0; + goto submit_bio_out; + } + + return bio; + +err_out: + /* for sync reading, set page error immediately */ + if (!ra) { + SetPageError(page); + ClearPageUptodate(page); + } +has_updated: + unlock_page(page); + + /* if updated manually, continuous pages has a gap */ + if (bio != NULL) +submit_bio_out: + __submit_bio(bio, REQ_OP_READ, 0); + + return unlikely(err) ? ERR_PTR(err) : NULL; +} + +/* + * since we dont have write or truncate flows, so no inode + * locking needs to be held at the moment. + */ +static int erofs_raw_access_readpage(struct file *file, struct page *page) +{ + erofs_off_t last_block; + struct bio *bio; + + trace_erofs_readpage(page, true); + + bio = erofs_read_raw_page(NULL, page->mapping, + page, &last_block, 1, false); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + BUG_ON(bio != NULL); /* since we have only one bio -- must be NULL */ + return 0; +} + +static int erofs_raw_access_readpages(struct file *filp, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + erofs_off_t last_block; + struct bio *bio = NULL; + gfp_t gfp = readahead_gfp_mask(mapping); + struct page *page = list_last_entry(pages, struct page, lru); + + trace_erofs_readpages(mapping->host, page, nr_pages, true); + + for (; nr_pages; --nr_pages) { + page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + + if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { + bio = erofs_read_raw_page(bio, mapping, page, + &last_block, nr_pages, true); + + /* all the page errors are ignored when readahead */ + if (IS_ERR(bio)) { + pr_err("%s, readahead error at page %lu of nid %llu\n", + __func__, page->index, + EROFS_V(mapping->host)->nid); + + bio = NULL; + } + } + + /* pages could still be locked */ + put_page(page); + } + BUG_ON(!list_empty(pages)); + + /* the rare case (end in gaps) */ + if (unlikely(bio != NULL)) + __submit_bio(bio, REQ_OP_READ, 0); + return 0; +} + +/* for uncompressed (aligned) files and raw access for other files */ +const struct address_space_operations erofs_raw_access_aops = { + .readpage = erofs_raw_access_readpage, + .readpages = erofs_raw_access_readpages, +}; + diff --git a/drivers/staging/erofs/dir.c b/drivers/staging/erofs/dir.c new file mode 100644 index 000000000000..be6ae3b1bdbe --- /dev/null +++ b/drivers/staging/erofs/dir.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/dir.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. 
See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#include "internal.h" + +static const unsigned char erofs_filetype_table[EROFS_FT_MAX] = { + [EROFS_FT_UNKNOWN] = DT_UNKNOWN, + [EROFS_FT_REG_FILE] = DT_REG, + [EROFS_FT_DIR] = DT_DIR, + [EROFS_FT_CHRDEV] = DT_CHR, + [EROFS_FT_BLKDEV] = DT_BLK, + [EROFS_FT_FIFO] = DT_FIFO, + [EROFS_FT_SOCK] = DT_SOCK, + [EROFS_FT_SYMLINK] = DT_LNK, +}; + +static int erofs_fill_dentries(struct dir_context *ctx, + void *dentry_blk, unsigned *ofs, + unsigned nameoff, unsigned maxsize) +{ + struct erofs_dirent *de = dentry_blk; + const struct erofs_dirent *end = dentry_blk + nameoff; + + de = dentry_blk + *ofs; + while (de < end) { + const char *de_name; + int de_namelen; + unsigned char d_type; +#ifdef CONFIG_EROFS_FS_DEBUG + unsigned dbg_namelen; + unsigned char dbg_namebuf[EROFS_NAME_LEN]; +#endif + + if (unlikely(de->file_type < EROFS_FT_MAX)) + d_type = erofs_filetype_table[de->file_type]; + else + d_type = DT_UNKNOWN; + + nameoff = le16_to_cpu(de->nameoff); + de_name = (char *)dentry_blk + nameoff; + + de_namelen = unlikely(de + 1 >= end) ? + /* last directory entry */ + strnlen(de_name, maxsize - nameoff) : + le16_to_cpu(de[1].nameoff) - nameoff; + + /* the corrupted directory found */ + BUG_ON(de_namelen < 0); + +#ifdef CONFIG_EROFS_FS_DEBUG + dbg_namelen = min(EROFS_NAME_LEN - 1, de_namelen); + memcpy(dbg_namebuf, de_name, dbg_namelen); + dbg_namebuf[dbg_namelen] = '\0'; + + debugln("%s, found de_name %s de_len %d d_type %d", __func__, + dbg_namebuf, de_namelen, d_type); +#endif + + if (!dir_emit(ctx, de_name, de_namelen, + le64_to_cpu(de->nid), d_type)) + /* stoped by some reason */ + return 1; + ++de; + *ofs += sizeof(struct erofs_dirent); + } + *ofs = maxsize; + return 0; +} + +static int erofs_readdir(struct file *f, struct dir_context *ctx) +{ + struct inode *dir = file_inode(f); + struct address_space *mapping = dir->i_mapping; + const size_t dirsize = i_size_read(dir); + unsigned i = ctx->pos / EROFS_BLKSIZ; + unsigned ofs = ctx->pos % EROFS_BLKSIZ; + int err = 0; + bool initial = true; + + while (ctx->pos < dirsize) { + struct page *dentry_page; + struct erofs_dirent *de; + unsigned nameoff, maxsize; + + dentry_page = read_mapping_page(mapping, i, NULL); + if (IS_ERR(dentry_page)) + continue; + + lock_page(dentry_page); + de = (struct erofs_dirent *)kmap(dentry_page); + + nameoff = le16_to_cpu(de->nameoff); + + if (unlikely(nameoff < sizeof(struct erofs_dirent) || + nameoff >= PAGE_SIZE)) { + errln("%s, invalid de[0].nameoff %u", + __func__, nameoff); + + err = -EIO; + goto skip_this; + } + + maxsize = min_t(unsigned, dirsize - ctx->pos + ofs, PAGE_SIZE); + + /* search dirents at the arbitrary position */ + if (unlikely(initial)) { + initial = false; + + ofs = roundup(ofs, sizeof(struct erofs_dirent)); + if (unlikely(ofs >= nameoff)) + goto skip_this; + } + + err = erofs_fill_dentries(ctx, de, &ofs, nameoff, maxsize); +skip_this: + kunmap(dentry_page); + + unlock_page(dentry_page); + put_page(dentry_page); + + ctx->pos = blknr_to_addr(i) + ofs; + + if (unlikely(err)) + break; + ++i; + ofs = 0; + } + return err < 0 ? 
err : 0; +} + +const struct file_operations erofs_dir_fops = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = erofs_readdir, +}; + diff --git a/drivers/staging/erofs/erofs_fs.h b/drivers/staging/erofs/erofs_fs.h new file mode 100644 index 000000000000..2f8e2bf70941 --- /dev/null +++ b/drivers/staging/erofs/erofs_fs.h @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Apache-2.0 + * + * linux/drivers/staging/erofs/erofs_fs.h + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is dual-licensed; you may select either the GNU General Public + * License version 2 or Apache License, Version 2.0. See the file COPYING + * in the main directory of the Linux distribution for more details. + */ +#ifndef __EROFS_FS_H +#define __EROFS_FS_H + +/* Enhanced(Extended) ROM File System */ +#define EROFS_SUPER_MAGIC_V1 0xE0F5E1E2 +#define EROFS_SUPER_OFFSET 1024 + +struct erofs_super_block { +/* 0 */__le32 magic; /* in the little endian */ +/* 4 */__le32 checksum; /* crc32c(super_block) */ +/* 8 */__le32 features; +/* 12 */__u8 blkszbits; /* support block_size == PAGE_SIZE only */ +/* 13 */__u8 reserved; + +/* 14 */__le16 root_nid; +/* 16 */__le64 inos; /* total valid ino # (== f_files - f_favail) */ + +/* 24 */__le64 build_time; /* inode v1 time derivation */ +/* 32 */__le32 build_time_nsec; +/* 36 */__le32 blocks; /* used for statfs */ +/* 40 */__le32 meta_blkaddr; +/* 44 */__le32 xattr_blkaddr; +/* 48 */__u8 uuid[16]; /* 128-bit uuid for volume */ +/* 64 */__u8 volume_name[16]; /* volume name */ + +/* 80 */__u8 reserved2[48]; /* 128 bytes */ +} __packed; + +#define __EROFS_BIT(_prefix, _cur, _pre) enum { \ + _prefix ## _cur ## _BIT = _prefix ## _pre ## _BIT + \ + _prefix ## _pre ## _BITS } + +/* + * erofs inode data mapping: + * 0 - inode plain without inline data A: + * inode, [xattrs], ... | ... | no-holed data + * 1 - inode VLE compression B: + * inode, [xattrs], extents ... | ... + * 2 - inode plain with inline data C: + * inode, [xattrs], last_inline_data, ... | ... 
| no-holed data + * 3~7 - reserved + */ +enum { + EROFS_INODE_LAYOUT_PLAIN, + EROFS_INODE_LAYOUT_COMPRESSION, + EROFS_INODE_LAYOUT_INLINE, + EROFS_INODE_LAYOUT_MAX +}; +#define EROFS_I_VERSION_BITS 1 +#define EROFS_I_DATA_MAPPING_BITS 3 + +#define EROFS_I_VERSION_BIT 0 +__EROFS_BIT(EROFS_I_, DATA_MAPPING, VERSION); + +struct erofs_inode_v1 { +/* 0 */__le16 i_advise; + +/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ +/* 2 */__le16 i_xattr_icount; +/* 4 */__le16 i_mode; +/* 6 */__le16 i_nlink; +/* 8 */__le32 i_size; +/* 12 */__le32 i_reserved; +/* 16 */union { + /* file total compressed blocks for data mapping 1 */ + __le32 compressed_blocks; + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + } i_u __packed; +/* 20 */__le32 i_ino; /* only used for 32-bit stat compatibility */ +/* 24 */__le16 i_uid; +/* 26 */__le16 i_gid; +/* 28 */__le32 i_checksum; +} __packed; + +/* 32 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_V1 0 +/* 64 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_V2 1 + +struct erofs_inode_v2 { + __le16 i_advise; + + /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ + __le16 i_xattr_icount; + __le16 i_mode; + __le16 i_reserved; /* 8 bytes */ + __le64 i_size; /* 16 bytes */ + union { + /* file total compressed blocks for data mapping 1 */ + __le32 compressed_blocks; + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + } i_u __packed; + + /* only used for 32-bit stat compatibility */ + __le32 i_ino; /* 24 bytes */ + + __le32 i_uid; + __le32 i_gid; + __le64 i_ctime; /* 32 bytes */ + __le32 i_ctime_nsec; + __le32 i_nlink; + __u8 i_reserved2[12]; + __le32 i_checksum; /* 64 bytes */ +} __packed; + +#define EROFS_MAX_SHARED_XATTRS (128) +/* h_shared_count between 129 ... 255 are special # */ +#define EROFS_SHARED_XATTR_EXTENT (255) + +/* + * inline xattrs (n == i_xattr_icount): + * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes + * 12 bytes / \ + * / \ + * /-----------------------\ + * | erofs_xattr_entries+ | + * +-----------------------+ + * inline xattrs must starts in erofs_xattr_ibody_header, + * for read-only fs, no need to introduce h_refcount + */ +struct erofs_xattr_ibody_header { + __le32 h_checksum; + __u8 h_shared_count; + __u8 h_reserved[7]; + __le32 h_shared_xattrs[0]; /* shared xattr id array */ +} __packed; + +/* Name indexes */ +#define EROFS_XATTR_INDEX_USER 1 +#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EROFS_XATTR_INDEX_TRUSTED 4 +#define EROFS_XATTR_INDEX_LUSTRE 5 +#define EROFS_XATTR_INDEX_SECURITY 6 + +/* xattr entry (for both inline & shared xattrs) */ +struct erofs_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_size; /* size of attribute value */ + /* followed by e_name and e_value */ + char e_name[0]; /* attribute name */ +} __packed; + +#define ondisk_xattr_ibody_size(count) ({\ + u32 __count = le16_to_cpu(count); \ + ((__count) == 0) ? 
0 : \ + sizeof(struct erofs_xattr_ibody_header) + \ + sizeof(__u32) * ((__count) - 1); }) + +#define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry)) +#define EROFS_XATTR_ENTRY_SIZE(entry) EROFS_XATTR_ALIGN( \ + sizeof(struct erofs_xattr_entry) + \ + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size)) + +/* have to be aligned with 8 bytes on disk */ +struct erofs_extent_header { + __le32 eh_checksum; + __le32 eh_reserved[3]; +} __packed; + +/* + * Z_EROFS Variable-sized Logical Extent cluster type: + * 0 - literal (uncompressed) cluster + * 1 - compressed cluster (for the head logical cluster) + * 2 - compressed cluster (for the other logical clusters) + * + * In detail, + * 0 - literal (uncompressed) cluster, + * di_advise = 0 + * di_clusterofs = the literal data offset of the cluster + * di_blkaddr = the blkaddr of the literal cluster + * + * 1 - compressed cluster (for the head logical cluster) + * di_advise = 1 + * di_clusterofs = the decompressed data offset of the cluster + * di_blkaddr = the blkaddr of the compressed cluster + * + * 2 - compressed cluster (for the other logical clusters) + * di_advise = 2 + * di_clusterofs = + * the decompressed data offset in its own head cluster + * di_u.delta[0] = distance to its corresponding head cluster + * di_u.delta[1] = distance to its corresponding tail cluster + * (di_advise could be 0, 1 or 2) + */ +#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 +#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 + +struct z_erofs_vle_decompressed_index { + __le16 di_advise; + /* where to decompress in the head cluster */ + __le16 di_clusterofs; + + union { + /* for the head cluster */ + __le32 blkaddr; + /* + * for the rest clusters + * eg. for 4k page-sized cluster, maximum 4K*64k = 256M) + * [0] - pointing to the head cluster + * [1] - pointing to the tail cluster + */ + __le16 delta[2]; + } di_u __packed; /* 8 bytes */ +} __packed; + +#define Z_EROFS_VLE_EXTENT_ALIGN(size) round_up(size, \ + sizeof(struct z_erofs_vle_decompressed_index)) + +/* dirent sorts in alphabet order, thus we can do binary search */ +struct erofs_dirent { + __le64 nid; /* 0, node number */ + __le16 nameoff; /* 8, start offset of file name */ + __u8 file_type; /* 10, file type */ + __u8 reserved; /* 11, reserved */ +} __packed; + +/* file types used in inode_info->flags */ +enum { + EROFS_FT_UNKNOWN, + EROFS_FT_REG_FILE, + EROFS_FT_DIR, + EROFS_FT_CHRDEV, + EROFS_FT_BLKDEV, + EROFS_FT_FIFO, + EROFS_FT_SOCK, + EROFS_FT_SYMLINK, + EROFS_FT_MAX +}; + +#define EROFS_NAME_LEN 255 + +/* check the EROFS on-disk layout strictly at compile time */ +static inline void erofs_check_ondisk_layout_definitions(void) +{ + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); + BUILD_BUG_ON(sizeof(struct erofs_inode_v1) != 32); + BUILD_BUG_ON(sizeof(struct erofs_inode_v2) != 64); + BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); + BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); + BUILD_BUG_ON(sizeof(struct erofs_extent_header) != 16); + BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); + BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); +} + +#endif + diff --git a/drivers/staging/erofs/include/linux/tagptr.h b/drivers/staging/erofs/include/linux/tagptr.h new file mode 100644 index 000000000000..ccd106dbd48e --- /dev/null +++ b/drivers/staging/erofs/include/linux/tagptr.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Tagged pointer implementation + * + * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com> + */ +#ifndef 
_LINUX_TAGPTR_H +#define _LINUX_TAGPTR_H + +#include <linux/types.h> +#include <linux/build_bug.h> + +/* + * the name of tagged pointer types are tagptr{1, 2, 3...}_t + * avoid directly using the internal structs __tagptr{1, 2, 3...} + */ +#define __MAKE_TAGPTR(n) \ +typedef struct __tagptr##n { \ + uintptr_t v; \ +} tagptr##n##_t; + +__MAKE_TAGPTR(1) +__MAKE_TAGPTR(2) +__MAKE_TAGPTR(3) +__MAKE_TAGPTR(4) + +#undef __MAKE_TAGPTR + +extern void __compiletime_error("bad tagptr tags") + __bad_tagptr_tags(void); + +extern void __compiletime_error("bad tagptr type") + __bad_tagptr_type(void); + +/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */ +#define __tagptr_mask_1(ptr, n) \ + __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \ + (1UL << (n)) - 1 : + +#define __tagptr_mask(ptr) (\ + __tagptr_mask_1(ptr, 1) ( \ + __tagptr_mask_1(ptr, 2) ( \ + __tagptr_mask_1(ptr, 3) ( \ + __tagptr_mask_1(ptr, 4) ( \ + __bad_tagptr_type(), 0))))) + +/* generate a tagged pointer from a raw value */ +#define tagptr_init(type, val) \ + ((typeof(type)){ .v = (uintptr_t)(val) }) + +/* + * directly cast a tagged pointer to the native pointer type, which + * could be used for backward compatibility of existing code. + */ +#define tagptr_cast_ptr(tptr) ((void *)(tptr).v) + +/* encode tagged pointers */ +#define tagptr_fold(type, ptr, _tags) ({ \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \ + __bad_tagptr_tags(); \ +tagptr_init(type, (uintptr_t)(ptr) | tags); }) + +/* decode tagged pointers */ +#define tagptr_unfold_ptr(tptr) \ + ((void *)((tptr).v & ~__tagptr_mask(tptr))) + +#define tagptr_unfold_tags(tptr) \ + ((tptr).v & __tagptr_mask(tptr)) + +/* operations for the tagger pointer */ +#define tagptr_eq(_tptr1, _tptr2) ({ \ + typeof(_tptr1) tptr1 = (_tptr1); \ + typeof(_tptr2) tptr2 = (_tptr2); \ + (void)(&tptr1 == &tptr2); \ +(tptr1).v == (tptr2).v; }) + +/* lock-free CAS operation */ +#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + typeof(_o) o = (_o); \ + typeof(_n) n = (_n); \ + (void)(&o == &n); \ + (void)(&o == ptptr); \ +tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) + +/* wrap WRITE_ONCE if atomic update is needed */ +#define tagptr_replace_tags(_ptptr, tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \ +*ptptr; }) + +#define tagptr_set_tags(_ptptr, _tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ + __bad_tagptr_tags(); \ + ptptr->v |= tags; \ +*ptptr; }) + +#define tagptr_clear_tags(_ptptr, _tags) ({ \ + typeof(_ptptr) ptptr = (_ptptr); \ + const typeof(_tags) tags = (_tags); \ + if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ + __bad_tagptr_tags(); \ + ptptr->v &= ~tags; \ +*ptptr; }) + +#endif + diff --git a/drivers/staging/erofs/include/trace/events/erofs.h b/drivers/staging/erofs/include/trace/events/erofs.h new file mode 100644 index 000000000000..5aead93a762f --- /dev/null +++ b/drivers/staging/erofs/include/trace/events/erofs.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM erofs + +#if !defined(_TRACE_EROFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EROFS_H + +#include <linux/tracepoint.h> + +#define show_dev(dev) MAJOR(dev), MINOR(dev) +#define show_dev_nid(entry) show_dev(entry->dev), entry->nid + +#define 
show_file_type(type) \ + __print_symbolic(type, \ + { 0, "FILE" }, \ + { 1, "DIR" }) + +#define show_map_flags(flags) __print_flags(flags, "|", \ + { EROFS_GET_BLOCKS_RAW, "RAW" }) + +#define show_mflags(flags) __print_flags(flags, "", \ + { EROFS_MAP_MAPPED, "M" }, \ + { EROFS_MAP_META, "I" }, \ + { EROFS_MAP_ZIPPED, "Z" }) + +TRACE_EVENT(erofs_lookup, + + TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags), + + TP_ARGS(dir, dentry, flags), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(erofs_nid_t, nid ) + __field(const char *, name ) + __field(unsigned int, flags ) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->nid = EROFS_V(dir)->nid; + __entry->name = dentry->d_name.name; + __entry->flags = flags; + ), + + TP_printk("dev = (%d,%d), pnid = %llu, name:%s, flags:%x", + show_dev_nid(__entry), + __entry->name, + __entry->flags) +); + +TRACE_EVENT(erofs_fill_inode, + TP_PROTO(struct inode *inode, int isdir), + TP_ARGS(inode, isdir), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(erofs_nid_t, nid ) + __field(erofs_blk_t, blkaddr ) + __field(unsigned int, ofs ) + __field(int, isdir ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->nid = EROFS_V(inode)->nid; + __entry->blkaddr = erofs_blknr(iloc(EROFS_I_SB(inode), __entry->nid)); + __entry->ofs = erofs_blkoff(iloc(EROFS_I_SB(inode), __entry->nid)); + __entry->isdir = isdir; + ), + + TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u, isdir %d", + show_dev_nid(__entry), + __entry->blkaddr, __entry->ofs, + __entry->isdir) +); + +TRACE_EVENT(erofs_readpage, + + TP_PROTO(struct page *page, bool raw), + + TP_ARGS(page, raw), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(erofs_nid_t, nid ) + __field(int, dir ) + __field(pgoff_t, index ) + __field(int, uptodate) + __field(bool, raw ) + ), + + TP_fast_assign( + __entry->dev = page->mapping->host->i_sb->s_dev; + __entry->nid = EROFS_V(page->mapping->host)->nid; + __entry->dir = S_ISDIR(page->mapping->host->i_mode); + __entry->index = page->index; + __entry->uptodate = PageUptodate(page); + __entry->raw = raw; + ), + + TP_printk("dev = (%d,%d), nid = %llu, %s, index = %lu, uptodate = %d " + "raw = %d", + show_dev_nid(__entry), + show_file_type(__entry->dir), + (unsigned long)__entry->index, + __entry->uptodate, + __entry->raw) +); + +TRACE_EVENT(erofs_readpages, + + TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage, + bool raw), + + TP_ARGS(inode, page, nrpage, raw), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(erofs_nid_t, nid ) + __field(pgoff_t, start ) + __field(unsigned int, nrpage ) + __field(bool, raw ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->nid = EROFS_V(inode)->nid; + __entry->start = page->index; + __entry->nrpage = nrpage; + __entry->raw = raw; + ), + + TP_printk("dev = (%d,%d), nid = %llu, start = %lu nrpage = %u raw = %d", + show_dev_nid(__entry), + (unsigned long)__entry->start, + __entry->nrpage, + __entry->raw) +); + +DECLARE_EVENT_CLASS(erofs__map_blocks_enter, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned int flags), + + TP_ARGS(inode, map, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( erofs_nid_t, nid ) + __field( erofs_off_t, la ) + __field( u64, llen ) + __field( unsigned int, flags ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->nid = EROFS_V(inode)->nid; + __entry->la = map->m_la; + __entry->llen = map->m_llen; + __entry->flags = flags; + ), + + 
TP_printk("dev = (%d,%d), nid = %llu, la %llu llen %llu flags %s", + show_dev_nid(__entry), + __entry->la, __entry->llen, show_map_flags(__entry->flags)) +); + +DEFINE_EVENT(erofs__map_blocks_enter, erofs_map_blocks_flatmode_enter, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned flags), + + TP_ARGS(inode, map, flags) +); + +DECLARE_EVENT_CLASS(erofs__map_blocks_exit, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned int flags, int ret), + + TP_ARGS(inode, map, flags, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( erofs_nid_t, nid ) + __field( unsigned int, flags ) + __field( erofs_off_t, la ) + __field( erofs_off_t, pa ) + __field( u64, llen ) + __field( u64, plen ) + __field( unsigned int, mflags ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->nid = EROFS_V(inode)->nid; + __entry->flags = flags; + __entry->la = map->m_la; + __entry->pa = map->m_pa; + __entry->llen = map->m_llen; + __entry->plen = map->m_plen; + __entry->mflags = map->m_flags; + __entry->ret = ret; + ), + + TP_printk("dev = (%d,%d), nid = %llu, flags %s " + "la %llu pa %llu llen %llu plen %llu mflags %s ret %d", + show_dev_nid(__entry), show_map_flags(__entry->flags), + __entry->la, __entry->pa, __entry->llen, __entry->plen, + show_mflags(__entry->mflags), __entry->ret) +); + +DEFINE_EVENT(erofs__map_blocks_exit, erofs_map_blocks_flatmode_exit, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned flags, int ret), + + TP_ARGS(inode, map, flags, ret) +); + +TRACE_EVENT(erofs_destroy_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( erofs_nid_t, nid ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->nid = EROFS_V(inode)->nid; + ), + + TP_printk("dev = (%d,%d), nid = %llu", show_dev_nid(__entry)) +); + +#endif /* _TRACE_EROFS_H */ + + /* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/drivers/staging/erofs/inode.c b/drivers/staging/erofs/inode.c new file mode 100644 index 000000000000..fbf6ff25cd1b --- /dev/null +++ b/drivers/staging/erofs/inode.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/inode.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. 
+ */ +#include "xattr.h" + +#include <trace/events/erofs.h> + +/* no locking */ +static int read_inode(struct inode *inode, void *data) +{ + struct erofs_vnode *vi = EROFS_V(inode); + struct erofs_inode_v1 *v1 = data; + const unsigned advise = le16_to_cpu(v1->i_advise); + + vi->data_mapping_mode = __inode_data_mapping(advise); + + if (unlikely(vi->data_mapping_mode >= EROFS_INODE_LAYOUT_MAX)) { + errln("unknown data mapping mode %u of nid %llu", + vi->data_mapping_mode, vi->nid); + DBG_BUGON(1); + return -EIO; + } + + if (__inode_version(advise) == EROFS_INODE_LAYOUT_V2) { + struct erofs_inode_v2 *v2 = data; + + vi->inode_isize = sizeof(struct erofs_inode_v2); + vi->xattr_isize = ondisk_xattr_ibody_size(v2->i_xattr_icount); + + inode->i_mode = le16_to_cpu(v2->i_mode); + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + vi->raw_blkaddr = le32_to_cpu(v2->i_u.raw_blkaddr); + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + inode->i_rdev = + new_decode_dev(le32_to_cpu(v2->i_u.rdev)); + } else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_rdev = 0; + } else { + return -EIO; + } + + i_uid_write(inode, le32_to_cpu(v2->i_uid)); + i_gid_write(inode, le32_to_cpu(v2->i_gid)); + set_nlink(inode, le32_to_cpu(v2->i_nlink)); + + /* ns timestamp */ + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = + le64_to_cpu(v2->i_ctime); + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = + le32_to_cpu(v2->i_ctime_nsec); + + inode->i_size = le64_to_cpu(v2->i_size); + } else if (__inode_version(advise) == EROFS_INODE_LAYOUT_V1) { + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + + vi->inode_isize = sizeof(struct erofs_inode_v1); + vi->xattr_isize = ondisk_xattr_ibody_size(v1->i_xattr_icount); + + inode->i_mode = le16_to_cpu(v1->i_mode); + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + vi->raw_blkaddr = le32_to_cpu(v1->i_u.raw_blkaddr); + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + inode->i_rdev = + new_decode_dev(le32_to_cpu(v1->i_u.rdev)); + } else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_rdev = 0; + } else { + return -EIO; + } + + i_uid_write(inode, le16_to_cpu(v1->i_uid)); + i_gid_write(inode, le16_to_cpu(v1->i_gid)); + set_nlink(inode, le16_to_cpu(v1->i_nlink)); + + /* use build time to derive all file time */ + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = + sbi->build_time; + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = + sbi->build_time_nsec; + + inode->i_size = le32_to_cpu(v1->i_size); + } else { + errln("unsupported on-disk inode version %u of nid %llu", + __inode_version(advise), vi->nid); + DBG_BUGON(1); + return -EIO; + } + + /* measure inode.i_blocks as the generic filesystem */ + inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; + return 0; +} + +/* + * try_lock can be required since locking order is: + * file data(fs_inode) + * meta(bd_inode) + * but the majority of the callers is "iget", + * in that case we are pretty sure no deadlock since + * no data operations exist. However I tend to + * try_lock since it takes no much overhead and + * will success immediately. 
+ */ +static int fill_inline_data(struct inode *inode, void *data, unsigned m_pofs) +{ + struct erofs_vnode *vi = EROFS_V(inode); + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + int mode = vi->data_mapping_mode; + + DBG_BUGON(mode >= EROFS_INODE_LAYOUT_MAX); + + /* should be inode inline C */ + if (mode != EROFS_INODE_LAYOUT_INLINE) + return 0; + + /* fast symlink (following ext4) */ + if (S_ISLNK(inode->i_mode) && inode->i_size < PAGE_SIZE) { + char *lnk = erofs_kmalloc(sbi, inode->i_size + 1, GFP_KERNEL); + + if (unlikely(lnk == NULL)) + return -ENOMEM; + + m_pofs += vi->inode_isize + vi->xattr_isize; + BUG_ON(m_pofs + inode->i_size > PAGE_SIZE); + + /* get in-page inline data */ + memcpy(lnk, data + m_pofs, inode->i_size); + lnk[inode->i_size] = '\0'; + + inode->i_link = lnk; + set_inode_fast_symlink(inode); + } + return -EAGAIN; +} + +static int fill_inode(struct inode *inode, int isdir) +{ + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + struct erofs_vnode *vi = EROFS_V(inode); + struct page *page; + void *data; + int err; + erofs_blk_t blkaddr; + unsigned ofs; + + trace_erofs_fill_inode(inode, isdir); + + blkaddr = erofs_blknr(iloc(sbi, vi->nid)); + ofs = erofs_blkoff(iloc(sbi, vi->nid)); + + debugln("%s, reading inode nid %llu at %u of blkaddr %u", + __func__, vi->nid, ofs, blkaddr); + + page = erofs_get_meta_page(inode->i_sb, blkaddr, isdir); + + if (IS_ERR(page)) { + errln("failed to get inode (nid: %llu) page, err %ld", + vi->nid, PTR_ERR(page)); + return PTR_ERR(page); + } + + BUG_ON(!PageUptodate(page)); + data = page_address(page); + + err = read_inode(inode, data + ofs); + if (!err) { + /* setup the new inode */ + if (S_ISREG(inode->i_mode)) { +#ifdef CONFIG_EROFS_FS_XATTR + if (vi->xattr_isize) + inode->i_op = &erofs_generic_xattr_iops; +#endif + inode->i_fop = &generic_ro_fops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = +#ifdef CONFIG_EROFS_FS_XATTR + vi->xattr_isize ? 
&erofs_dir_xattr_iops : +#endif + &erofs_dir_iops; + inode->i_fop = &erofs_dir_fops; + } else if (S_ISLNK(inode->i_mode)) { + /* by default, page_get_link is used for symlink */ + inode->i_op = +#ifdef CONFIG_EROFS_FS_XATTR + &erofs_symlink_xattr_iops, +#else + &page_symlink_inode_operations; +#endif + inode_nohighmem(inode); + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { +#ifdef CONFIG_EROFS_FS_XATTR + inode->i_op = &erofs_special_inode_operations; +#endif + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } else { + err = -EIO; + goto out_unlock; + } + + if (is_inode_layout_compression(inode)) { +#ifdef CONFIG_EROFS_FS_ZIP + inode->i_mapping->a_ops = + &z_erofs_vle_normalaccess_aops; +#else + err = -ENOTSUPP; +#endif + goto out_unlock; + } + + inode->i_mapping->a_ops = &erofs_raw_access_aops; + + /* fill last page if inline data is available */ + fill_inline_data(inode, data, ofs); + } + +out_unlock: + unlock_page(page); + put_page(page); + return err; +} + +struct inode *erofs_iget(struct super_block *sb, + erofs_nid_t nid, bool isdir) +{ + struct inode *inode = iget_locked(sb, nid); + + if (unlikely(inode == NULL)) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + int err; + struct erofs_vnode *vi = EROFS_V(inode); + vi->nid = nid; + + err = fill_inode(inode, isdir); + if (likely(!err)) + unlock_new_inode(inode); + else { + iget_failed(inode); + inode = ERR_PTR(err); + } + } + return inode; +} + +#ifdef CONFIG_EROFS_FS_XATTR +const struct inode_operations erofs_generic_xattr_iops = { + .listxattr = erofs_listxattr, +}; +#endif + +#ifdef CONFIG_EROFS_FS_XATTR +const struct inode_operations erofs_symlink_xattr_iops = { + .get_link = page_get_link, + .listxattr = erofs_listxattr, +}; +#endif + +const struct inode_operations erofs_special_inode_operations = { +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif +}; + +#ifdef CONFIG_EROFS_FS_XATTR +const struct inode_operations erofs_fast_symlink_xattr_iops = { + .get_link = simple_get_link, + .listxattr = erofs_listxattr, +}; +#endif + diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h new file mode 100644 index 000000000000..367b39fe46e5 --- /dev/null +++ b/drivers/staging/erofs/internal.h @@ -0,0 +1,556 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * linux/drivers/staging/erofs/internal.h + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#ifndef __INTERNAL_H +#define __INTERNAL_H + +#include <linux/fs.h> +#include <linux/dcache.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/cleancache.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include "erofs_fs.h" + +/* redefine pr_fmt "erofs: " */ +#undef pr_fmt +#define pr_fmt(fmt) "erofs: " fmt + +#define errln(x, ...) pr_err(x "\n", ##__VA_ARGS__) +#define infoln(x, ...) pr_info(x "\n", ##__VA_ARGS__) +#ifdef CONFIG_EROFS_FS_DEBUG +#define debugln(x, ...) pr_debug(x "\n", ##__VA_ARGS__) + +#define dbg_might_sleep might_sleep +#define DBG_BUGON BUG_ON +#else +#define debugln(x, ...) ((void)0) + +#define dbg_might_sleep() ((void)0) +#define DBG_BUGON(...) 
((void)0) +#endif + +#ifdef CONFIG_EROFS_FAULT_INJECTION +enum { + FAULT_KMALLOC, + FAULT_MAX, +}; + +extern char *erofs_fault_name[FAULT_MAX]; +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) + +struct erofs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; +#endif + +#ifdef CONFIG_EROFS_FS_ZIP_CACHE_BIPOLAR +#define EROFS_FS_ZIP_CACHE_LVL (2) +#elif defined(EROFS_FS_ZIP_CACHE_UNIPOLAR) +#define EROFS_FS_ZIP_CACHE_LVL (1) +#else +#define EROFS_FS_ZIP_CACHE_LVL (0) +#endif + +#if (!defined(EROFS_FS_HAS_MANAGED_CACHE) && (EROFS_FS_ZIP_CACHE_LVL > 0)) +#define EROFS_FS_HAS_MANAGED_CACHE +#endif + +/* EROFS_SUPER_MAGIC_V1 to represent the whole file system */ +#define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1 + +typedef u64 erofs_nid_t; + +struct erofs_sb_info { + /* list for all registered superblocks, mainly for shrinker */ + struct list_head list; + struct mutex umount_mutex; + + u32 blocks; + u32 meta_blkaddr; +#ifdef CONFIG_EROFS_FS_XATTR + u32 xattr_blkaddr; +#endif + + /* inode slot unit size in bit shift */ + unsigned char islotbits; +#ifdef CONFIG_EROFS_FS_ZIP + /* cluster size in bit shift */ + unsigned char clusterbits; + + /* the dedicated workstation for compression */ + struct radix_tree_root workstn_tree; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + struct inode *managed_cache; +#endif + +#endif + + u32 build_time_nsec; + u64 build_time; + + /* what we really care is nid, rather than ino.. */ + erofs_nid_t root_nid; + /* used for statfs, f_files - f_favail */ + u64 inos; + + u8 uuid[16]; /* 128-bit uuid for volume */ + u8 volume_name[16]; /* volume name */ + char *dev_name; + + unsigned int mount_opt; + unsigned int shrinker_run_no; + +#ifdef CONFIG_EROFS_FAULT_INJECTION + struct erofs_fault_info fault_info; /* For fault injection */ +#endif +}; + +#ifdef CONFIG_EROFS_FAULT_INJECTION +#define erofs_show_injection_info(type) \ + infoln("inject %s in %s of %pS", erofs_fault_name[type], \ + __func__, __builtin_return_address(0)) + +static inline bool time_to_inject(struct erofs_sb_info *sbi, int type) +{ + struct erofs_fault_info *ffi = &sbi->fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + return true; + } + return false; +} +#endif + +static inline void *erofs_kmalloc(struct erofs_sb_info *sbi, + size_t size, gfp_t flags) +{ +#ifdef CONFIG_EROFS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_KMALLOC)) { + erofs_show_injection_info(FAULT_KMALLOC); + return NULL; + } +#endif + return kmalloc(size, flags); +} + +#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) +#define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info) + +/* Mount flags set via mount options or defaults */ +#define EROFS_MOUNT_XATTR_USER 0x00000010 +#define EROFS_MOUNT_POSIX_ACL 0x00000020 +#define EROFS_MOUNT_FAULT_INJECTION 0x00000040 + +#define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt & EROFS_MOUNT_##option) + +#ifdef CONFIG_EROFS_FS_ZIP +#define erofs_workstn_lock(sbi) xa_lock(&(sbi)->workstn_tree) +#define erofs_workstn_unlock(sbi) xa_unlock(&(sbi)->workstn_tree) + +/* basic unit of the workstation of a super_block */ +struct erofs_workgroup { + /* the workgroup index in the workstation */ + 
pgoff_t index; + + /* overall workgroup reference count */ + atomic_t refcount; +}; + +#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) + +static inline bool erofs_workgroup_try_to_freeze( + struct erofs_workgroup *grp, int v) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + if (v != atomic_cmpxchg(&grp->refcount, + v, EROFS_LOCKED_MAGIC)) + return false; + preempt_disable(); +#else + preempt_disable(); + if (atomic_read(&grp->refcount) != v) { + preempt_enable(); + return false; + } +#endif + return true; +} + +static inline void erofs_workgroup_unfreeze( + struct erofs_workgroup *grp, int v) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + atomic_set(&grp->refcount, v); +#endif + preempt_enable(); +} + +static inline bool erofs_workgroup_get(struct erofs_workgroup *grp, int *ocnt) +{ + const int locked = (int)EROFS_LOCKED_MAGIC; + int o; + +repeat: + o = atomic_read(&grp->refcount); + + /* spin if it is temporarily locked at the reclaim path */ + if (unlikely(o == locked)) { +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + do + cpu_relax(); + while (atomic_read(&grp->refcount) == locked); +#endif + goto repeat; + } + + if (unlikely(o <= 0)) + return -1; + + if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o)) + goto repeat; + + *ocnt = o; + return 0; +} + +#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount) + +extern int erofs_workgroup_put(struct erofs_workgroup *grp); + +extern struct erofs_workgroup *erofs_find_workgroup( + struct super_block *sb, pgoff_t index, bool *tag); + +extern int erofs_register_workgroup(struct super_block *sb, + struct erofs_workgroup *grp, bool tag); + +extern unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, + unsigned long nr_shrink, bool cleanup); + +static inline void erofs_workstation_cleanup_all(struct super_block *sb) +{ + erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true); +} + +#ifdef EROFS_FS_HAS_MANAGED_CACHE +#define EROFS_UNALLOCATED_CACHED_PAGE ((void *)0x5F0EF00D) + +extern int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *egrp); +extern int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page); +#endif + +#endif + +/* we strictly follow PAGE_SIZE and no buffer head yet */ +#define LOG_BLOCK_SIZE PAGE_SHIFT + +#undef LOG_SECTORS_PER_BLOCK +#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) + +#undef SECTORS_PER_BLOCK +#define SECTORS_PER_BLOCK (1 << SECTORS_PER_BLOCK) + +#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE) + +#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ) +#error erofs cannot be used in this platform +#endif + +#define ROOT_NID(sb) ((sb)->root_nid) + +#ifdef CONFIG_EROFS_FS_ZIP +/* hard limit of pages per compressed cluster */ +#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT) + +/* page count of a compressed cluster */ +#define erofs_clusterpages(sbi) ((1 << (sbi)->clusterbits) / PAGE_SIZE) +#endif + +typedef u64 erofs_off_t; + +/* data type for filesystem-wide blocks number */ +typedef u32 erofs_blk_t; + +#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) +#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) +#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) + +static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) +{ + return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); +} + +#define inode_set_inited_xattr(inode) (EROFS_V(inode)->flags |= 1) +#define inode_has_inited_xattr(inode) (EROFS_V(inode)->flags & 1) + +struct erofs_vnode { 
+ erofs_nid_t nid; + unsigned int flags; + + unsigned char data_mapping_mode; + /* inline size in bytes */ + unsigned char inode_isize; + unsigned short xattr_isize; + + unsigned xattr_shared_count; + unsigned *xattr_shared_xattrs; + + erofs_blk_t raw_blkaddr; + + /* the corresponding vfs inode */ + struct inode vfs_inode; +}; + +#define EROFS_V(ptr) \ + container_of(ptr, struct erofs_vnode, vfs_inode) + +#define __inode_advise(x, bit, bits) \ + (((x) >> (bit)) & ((1 << (bits)) - 1)) + +#define __inode_version(advise) \ + __inode_advise(advise, EROFS_I_VERSION_BIT, \ + EROFS_I_VERSION_BITS) + +#define __inode_data_mapping(advise) \ + __inode_advise(advise, EROFS_I_DATA_MAPPING_BIT,\ + EROFS_I_DATA_MAPPING_BITS) + +static inline unsigned long inode_datablocks(struct inode *inode) +{ + /* since i_size cannot be changed */ + return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); +} + +static inline bool is_inode_layout_plain(struct inode *inode) +{ + return EROFS_V(inode)->data_mapping_mode == EROFS_INODE_LAYOUT_PLAIN; +} + +static inline bool is_inode_layout_compression(struct inode *inode) +{ + return EROFS_V(inode)->data_mapping_mode == + EROFS_INODE_LAYOUT_COMPRESSION; +} + +static inline bool is_inode_layout_inline(struct inode *inode) +{ + return EROFS_V(inode)->data_mapping_mode == EROFS_INODE_LAYOUT_INLINE; +} + +extern const struct super_operations erofs_sops; +extern const struct inode_operations erofs_dir_iops; +extern const struct file_operations erofs_dir_fops; + +extern const struct address_space_operations erofs_raw_access_aops; +#ifdef CONFIG_EROFS_FS_ZIP +extern const struct address_space_operations z_erofs_vle_normalaccess_aops; +#endif + +/* + * Logical to physical block mapping, used by erofs_map_blocks() + * + * Different with other file systems, it is used for 2 access modes: + * + * 1) RAW access mode: + * + * Users pass a valid (m_lblk, m_lofs -- usually 0) pair, + * and get the valid m_pblk, m_pofs and the longest m_len(in bytes). + * + * Note that m_lblk in the RAW access mode refers to the number of + * the compressed ondisk block rather than the uncompressed + * in-memory block for the compressed file. + * + * m_pofs equals to m_lofs except for the inline data page. + * + * 2) Normal access mode: + * + * If the inode is not compressed, it has no difference with + * the RAW access mode. However, if the inode is compressed, + * users should pass a valid (m_lblk, m_lofs) pair, and get + * the needed m_pblk, m_pofs, m_len to get the compressed data + * and the updated m_lblk, m_lofs which indicates the start + * of the corresponding uncompressed data in the file. 
+ */ +enum { + BH_Zipped = BH_PrivateStart, +}; + +/* Has a disk mapping */ +#define EROFS_MAP_MAPPED (1 << BH_Mapped) +/* Located in metadata (could be copied from bd_inode) */ +#define EROFS_MAP_META (1 << BH_Meta) +/* The extent has been compressed */ +#define EROFS_MAP_ZIPPED (1 << BH_Zipped) + +struct erofs_map_blocks { + erofs_off_t m_pa, m_la; + u64 m_plen, m_llen; + + unsigned int m_flags; +}; + +/* Flags used by erofs_map_blocks() */ +#define EROFS_GET_BLOCKS_RAW 0x0001 + +/* data.c */ +static inline struct bio *prepare_bio( + struct super_block *sb, + erofs_blk_t blkaddr, unsigned nr_pages, + bio_end_io_t endio) +{ + gfp_t gfp = GFP_NOIO; + struct bio *bio = bio_alloc(gfp, nr_pages); + + if (unlikely(bio == NULL) && + (current->flags & PF_MEMALLOC)) { + do { + nr_pages /= 2; + if (unlikely(!nr_pages)) { + bio = bio_alloc(gfp | __GFP_NOFAIL, 1); + BUG_ON(bio == NULL); + break; + } + bio = bio_alloc(gfp, nr_pages); + } while (bio == NULL); + } + + bio->bi_end_io = endio; + bio_set_dev(bio, sb->s_bdev); + bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK; + return bio; +} + +static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags) +{ + bio_set_op_attrs(bio, op, op_flags); + submit_bio(bio); +} + +extern struct page *erofs_get_meta_page(struct super_block *sb, + erofs_blk_t blkaddr, bool prio); +extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); +extern int erofs_map_blocks_iter(struct inode *, struct erofs_map_blocks *, + struct page **, int); + +struct erofs_map_blocks_iter { + struct erofs_map_blocks map; + struct page *mpage; +}; + + +static inline struct page *erofs_get_inline_page(struct inode *inode, + erofs_blk_t blkaddr) +{ + return erofs_get_meta_page(inode->i_sb, + blkaddr, S_ISDIR(inode->i_mode)); +} + +/* inode.c */ +extern struct inode *erofs_iget(struct super_block *sb, + erofs_nid_t nid, bool dir); + +/* dir.c */ +int erofs_namei(struct inode *dir, struct qstr *name, + erofs_nid_t *nid, unsigned *d_type); + +/* xattr.c */ +#ifdef CONFIG_EROFS_FS_XATTR +extern const struct xattr_handler *erofs_xattr_handlers[]; +#endif + +/* symlink */ +#ifdef CONFIG_EROFS_FS_XATTR +extern const struct inode_operations erofs_symlink_xattr_iops; +extern const struct inode_operations erofs_fast_symlink_xattr_iops; +extern const struct inode_operations erofs_special_inode_operations; +#endif + +static inline void set_inode_fast_symlink(struct inode *inode) +{ +#ifdef CONFIG_EROFS_FS_XATTR + inode->i_op = &erofs_fast_symlink_xattr_iops; +#else + inode->i_op = &simple_symlink_inode_operations; +#endif +} + +static inline bool is_inode_fast_symlink(struct inode *inode) +{ +#ifdef CONFIG_EROFS_FS_XATTR + return inode->i_op == &erofs_fast_symlink_xattr_iops; +#else + return inode->i_op == &simple_symlink_inode_operations; +#endif +} + +static inline void *erofs_vmap(struct page **pages, unsigned int count) +{ +#ifdef CONFIG_EROFS_FS_USE_VM_MAP_RAM + int i = 0; + + while (1) { + void *addr = vm_map_ram(pages, count, -1, PAGE_KERNEL); + /* retry two more times (totally 3 times) */ + if (addr != NULL || ++i >= 3) + return addr; + vm_unmap_aliases(); + } + return NULL; +#else + return vmap(pages, count, VM_MAP, PAGE_KERNEL); +#endif +} + +static inline void erofs_vunmap(const void *mem, unsigned int count) +{ +#ifdef CONFIG_EROFS_FS_USE_VM_MAP_RAM + vm_unmap_ram(mem, count); +#else + vunmap(mem); +#endif +} + +/* utils.c */ +extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); + +extern void erofs_register_super(struct 
super_block *sb); +extern void erofs_unregister_super(struct super_block *sb); + +extern unsigned long erofs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +extern unsigned long erofs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); + +#ifndef lru_to_page +#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) +#endif + +#endif + diff --git a/drivers/staging/erofs/lz4defs.h b/drivers/staging/erofs/lz4defs.h new file mode 100644 index 000000000000..00a0b58a0871 --- /dev/null +++ b/drivers/staging/erofs/lz4defs.h @@ -0,0 +1,227 @@ +#ifndef __LZ4DEFS_H__ +#define __LZ4DEFS_H__ + +/* + * lz4defs.h -- common and architecture specific defines for the kernel usage + + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2016, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 + * + * Changed for kernel usage by: + * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> + */ + +#include <asm/unaligned.h> +#include <linux/string.h> /* memset, memcpy */ + +#define FORCE_INLINE __always_inline + +/*-************************************ + * Basic Types + **************************************/ +#include <linux/types.h> + +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; +typedef uintptr_t uptrval; + +/*-************************************ + * Architecture specifics + **************************************/ +#if defined(CONFIG_64BIT) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +#if defined(__LITTLE_ENDIAN) +#define LZ4_LITTLE_ENDIAN 1 +#else +#define LZ4_LITTLE_ENDIAN 0 +#endif + +/*-************************************ + * Constants + **************************************/ +#define MINMATCH 4 + +#define WILDCOPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (WILDCOPYLENGTH + MINMATCH) + +/* Increase this value ==> compression run slower on incompressible data */ +#define LZ4_SKIPTRIGGER 6 + +#define HASH_UNIT sizeof(size_t) + +#define KB (1 << 10) +#define MB (1 << 20) +#define GB (1U << 30) + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) +#define STEPSIZE sizeof(size_t) + +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) + +/*-************************************ + * Reading and writing into memory + **************************************/ +static FORCE_INLINE U16 LZ4_read16(const void *ptr) +{ + return get_unaligned((const U16 *)ptr); +} + +static FORCE_INLINE U32 LZ4_read32(const void *ptr) +{ + return get_unaligned((const U32 *)ptr); +} + +static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr) +{ + return get_unaligned((const size_t *)ptr); +} + +static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value) +{ + put_unaligned(value, (U16 *)memPtr); +} + +static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value) +{ + put_unaligned(value, (U32 *)memPtr); +} + +static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr) +{ + return get_unaligned_le16(memPtr); +} + +static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value) +{ + return put_unaligned_le16(value, memPtr); +} + +static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) +{ +#if LZ4_ARCH64 + U64 a = get_unaligned((const U64 *)src); + + put_unaligned(a, (U64 *)dst); +#else + U32 a = get_unaligned((const U32 *)src); + U32 b = get_unaligned((const U32 *)src + 1); + + put_unaligned(a, (U32 *)dst); + put_unaligned(b, (U32 *)dst + 1); +#endif +} + +/* + * customized variant of memcpy, + * which can overwrite up to 7 bytes beyond dstEnd + */ +static FORCE_INLINE void LZ4_wildCopy(void *dstPtr, + const void *srcPtr, void *dstEnd) +{ + BYTE *d = (BYTE *)dstPtr; + const BYTE *s = (const BYTE *)srcPtr; + BYTE *const e = (BYTE *)dstEnd; + + do { + LZ4_copy8(d, s); + d += 8; + s += 8; + } while (d < e); +} + +static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val) +{ +#if LZ4_LITTLE_ENDIAN + return __ffs(val) >> 3; +#else + return (BITS_PER_LONG - 1 - __fls(val)) >> 3; +#endif +} + +static FORCE_INLINE unsigned int LZ4_count( + const BYTE *pIn, + const BYTE *pMatch, + const BYTE *pInLimit) +{ + const BYTE *const pStart = pIn; + + while (likely(pIn < pInLimit - (STEPSIZE - 1))) { 
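+		/*
+		 * Compare one machine word (STEPSIZE bytes) per iteration:
+		 * the first non-zero XOR below tells, via LZ4_NbCommonBytes(),
+		 * how many leading bytes of that word still match.
+		 */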
+ size_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + + if (!diff) { + pIn += STEPSIZE; + pMatch += STEPSIZE; + continue; + } + + pIn += LZ4_NbCommonBytes(diff); + + return (unsigned int)(pIn - pStart); + } + +#if LZ4_ARCH64 + if ((pIn < (pInLimit - 3)) + && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { + pIn += 4; + pMatch += 4; + } +#endif + + if ((pIn < (pInLimit - 1)) + && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { + pIn += 2; + pMatch += 2; + } + + if ((pIn < pInLimit) && (*pMatch == *pIn)) + pIn++; + + return (unsigned int)(pIn - pStart); +} + +typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive; +typedef enum { byPtr, byU32, byU16 } tableType_t; + +typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { full = 0, partial = 1 } earlyEnd_directive; + +#endif diff --git a/drivers/staging/erofs/namei.c b/drivers/staging/erofs/namei.c new file mode 100644 index 000000000000..546a47156101 --- /dev/null +++ b/drivers/staging/erofs/namei.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/namei.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#include "internal.h" +#include "xattr.h" + +#include <trace/events/erofs.h> + +/* based on the value of qn->len is accurate */ +static inline int dirnamecmp(struct qstr *qn, + struct qstr *qd, unsigned *matched) +{ + unsigned i = *matched, len = min(qn->len, qd->len); +loop: + if (unlikely(i >= len)) { + *matched = i; + if (qn->len < qd->len) { + /* + * actually (qn->len == qd->len) + * when qd->name[i] == '\0' + */ + return qd->name[i] == '\0' ? 0 : -1; + } + return (qn->len > qd->len); + } + + if (qn->name[i] != qd->name[i]) { + *matched = i; + return qn->name[i] > qd->name[i] ? 1 : -1; + } + + ++i; + goto loop; +} + +static struct erofs_dirent *find_target_dirent( + struct qstr *name, + u8 *data, int maxsize) +{ + unsigned ndirents, head, back; + unsigned startprfx, endprfx; + struct erofs_dirent *const de = (struct erofs_dirent *)data; + + /* make sure that maxsize is valid */ + BUG_ON(maxsize < sizeof(struct erofs_dirent)); + + ndirents = le16_to_cpu(de->nameoff) / sizeof(*de); + + /* corrupted dir (may be unnecessary...) */ + BUG_ON(!ndirents); + + head = 0; + back = ndirents - 1; + startprfx = endprfx = 0; + + while (head <= back) { + unsigned mid = head + (back - head) / 2; + unsigned nameoff = le16_to_cpu(de[mid].nameoff); + unsigned matched = min(startprfx, endprfx); + + struct qstr dname = QSTR_INIT(data + nameoff, + unlikely(mid >= ndirents - 1) ? 
+ maxsize - nameoff : + le16_to_cpu(de[mid + 1].nameoff) - nameoff); + + /* string comparison without already matched prefix */ + int ret = dirnamecmp(name, &dname, &matched); + + if (unlikely(!ret)) + return de + mid; + else if (ret > 0) { + head = mid + 1; + startprfx = matched; + } else if (unlikely(mid < 1)) /* fix "mid" overflow */ + break; + else { + back = mid - 1; + endprfx = matched; + } + } + + return ERR_PTR(-ENOENT); +} + +static struct page *find_target_block_classic( + struct inode *dir, + struct qstr *name, int *_diff) +{ + unsigned startprfx, endprfx; + unsigned head, back; + struct address_space *const mapping = dir->i_mapping; + struct page *candidate = ERR_PTR(-ENOENT); + + startprfx = endprfx = 0; + head = 0; + back = inode_datablocks(dir) - 1; + + while (head <= back) { + unsigned mid = head + (back - head) / 2; + struct page *page = read_mapping_page(mapping, mid, NULL); + + if (IS_ERR(page)) { +exact_out: + if (!IS_ERR(candidate)) /* valid candidate */ + put_page(candidate); + return page; + } else { + int diff; + unsigned ndirents, matched; + struct qstr dname; + struct erofs_dirent *de = kmap_atomic(page); + unsigned nameoff = le16_to_cpu(de->nameoff); + + ndirents = nameoff / sizeof(*de); + + /* corrupted dir (should have one entry at least) */ + BUG_ON(!ndirents || nameoff > PAGE_SIZE); + + matched = min(startprfx, endprfx); + + dname.name = (u8 *)de + nameoff; + dname.len = ndirents == 1 ? + /* since the rest of the last page is 0 */ + EROFS_BLKSIZ - nameoff + : le16_to_cpu(de[1].nameoff) - nameoff; + + /* string comparison without already matched prefix */ + diff = dirnamecmp(name, &dname, &matched); + kunmap_atomic(de); + + if (unlikely(!diff)) { + *_diff = 0; + goto exact_out; + } else if (diff > 0) { + head = mid + 1; + startprfx = matched; + + if (likely(!IS_ERR(candidate))) + put_page(candidate); + candidate = page; + } else { + put_page(page); + + if (unlikely(mid < 1)) /* fix "mid" overflow */ + break; + + back = mid - 1; + endprfx = matched; + } + } + } + *_diff = 1; + return candidate; +} + +int erofs_namei(struct inode *dir, + struct qstr *name, + erofs_nid_t *nid, unsigned *d_type) +{ + int diff; + struct page *page; + u8 *data; + struct erofs_dirent *de; + + if (unlikely(!dir->i_size)) + return -ENOENT; + + diff = 1; + page = find_target_block_classic(dir, name, &diff); + + if (unlikely(IS_ERR(page))) + return PTR_ERR(page); + + data = kmap_atomic(page); + /* the target page has been mapped */ + de = likely(diff) ? 
+ /* since the rest of the last page is 0 */ + find_target_dirent(name, data, EROFS_BLKSIZ) : + (struct erofs_dirent *)data; + + if (likely(!IS_ERR(de))) { + *nid = le64_to_cpu(de->nid); + *d_type = de->file_type; + } + + kunmap_atomic(data); + put_page(page); + + return PTR_ERR_OR_ZERO(de); +} + +/* NOTE: i_mutex is already held by vfs */ +static struct dentry *erofs_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + int err; + erofs_nid_t nid; + unsigned d_type; + struct inode *inode; + + DBG_BUGON(!d_really_is_negative(dentry)); + /* dentry must be unhashed in lookup, no need to worry about */ + DBG_BUGON(!d_unhashed(dentry)); + + trace_erofs_lookup(dir, dentry, flags); + + /* file name exceeds fs limit */ + if (unlikely(dentry->d_name.len > EROFS_NAME_LEN)) + return ERR_PTR(-ENAMETOOLONG); + + /* false uninitialized warnings on gcc 4.8.x */ + err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); + + if (err == -ENOENT) { + /* negative dentry */ + inode = NULL; + goto negative_out; + } else if (unlikely(err)) + return ERR_PTR(err); + + debugln("%s, %s (nid %llu) found, d_type %u", __func__, + dentry->d_name.name, nid, d_type); + + inode = erofs_iget(dir->i_sb, nid, d_type == EROFS_FT_DIR); + if (IS_ERR(inode)) + return ERR_CAST(inode); + +negative_out: + return d_splice_alias(inode, dentry); +} + +const struct inode_operations erofs_dir_iops = { + .lookup = erofs_lookup, +}; + +const struct inode_operations erofs_dir_xattr_iops = { + .lookup = erofs_lookup, +#ifdef CONFIG_EROFS_FS_XATTR + .listxattr = erofs_listxattr, +#endif +}; + diff --git a/drivers/staging/erofs/super.c b/drivers/staging/erofs/super.c new file mode 100644 index 000000000000..1aec509c805f --- /dev/null +++ b/drivers/staging/erofs/super.c @@ -0,0 +1,649 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/super.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#include <linux/module.h> +#include <linux/buffer_head.h> +#include <linux/statfs.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include "internal.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/erofs.h> + +static struct kmem_cache *erofs_inode_cachep __read_mostly; + +static void init_once(void *ptr) +{ + struct erofs_vnode *vi = ptr; + + inode_init_once(&vi->vfs_inode); +} + +static int erofs_init_inode_cache(void) +{ + erofs_inode_cachep = kmem_cache_create("erofs_inode", + sizeof(struct erofs_vnode), 0, + SLAB_RECLAIM_ACCOUNT, init_once); + + return erofs_inode_cachep != NULL ? 0 : -ENOMEM; +} + +static void erofs_exit_inode_cache(void) +{ + BUG_ON(erofs_inode_cachep == NULL); + kmem_cache_destroy(erofs_inode_cachep); +} + +static struct inode *alloc_inode(struct super_block *sb) +{ + struct erofs_vnode *vi = + kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL); + + if (vi == NULL) + return NULL; + + /* zero out everything except vfs_inode */ + memset(vi, 0, offsetof(struct erofs_vnode, vfs_inode)); + return &vi->vfs_inode; +} + +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct erofs_vnode *vi = EROFS_V(inode); + + /* be careful RCU symlink path (see ext4_inode_info->i_data)! 
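+	 * i_link of a fast symlink may still be dereferenced by the RCU
+	 * path walk, so it can only be freed here, after the grace period.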
*/ + if (is_inode_fast_symlink(inode)) + kfree(inode->i_link); + + kfree(vi->xattr_shared_xattrs); + + kmem_cache_free(erofs_inode_cachep, vi); +} + +static void destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, i_callback); +} + +static int superblock_read(struct super_block *sb) +{ + struct erofs_sb_info *sbi; + struct buffer_head *bh; + struct erofs_super_block *layout; + unsigned blkszbits; + int ret; + + bh = sb_bread(sb, 0); + + if (bh == NULL) { + errln("cannot read erofs superblock"); + return -EIO; + } + + sbi = EROFS_SB(sb); + layout = (struct erofs_super_block *)((u8 *)bh->b_data + + EROFS_SUPER_OFFSET); + + ret = -EINVAL; + if (le32_to_cpu(layout->magic) != EROFS_SUPER_MAGIC_V1) { + errln("cannot find valid erofs superblock"); + goto out; + } + + blkszbits = layout->blkszbits; + /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ + if (unlikely(blkszbits != LOG_BLOCK_SIZE)) { + errln("blksize %u isn't supported on this platform", + 1 << blkszbits); + goto out; + } + + sbi->blocks = le32_to_cpu(layout->blocks); + sbi->meta_blkaddr = le32_to_cpu(layout->meta_blkaddr); +#ifdef CONFIG_EROFS_FS_XATTR + sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr); +#endif + sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1; +#ifdef CONFIG_EROFS_FS_ZIP + sbi->clusterbits = 12; + + if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES) + errln("clusterbits %u is not supported on this kernel", + sbi->clusterbits); +#endif + + sbi->root_nid = le16_to_cpu(layout->root_nid); + sbi->inos = le64_to_cpu(layout->inos); + + sbi->build_time = le64_to_cpu(layout->build_time); + sbi->build_time_nsec = le32_to_cpu(layout->build_time_nsec); + + memcpy(&sb->s_uuid, layout->uuid, sizeof(layout->uuid)); + memcpy(sbi->volume_name, layout->volume_name, + sizeof(layout->volume_name)); + + ret = 0; +out: + brelse(bh); + return ret; +} + +#ifdef CONFIG_EROFS_FAULT_INJECTION +char *erofs_fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", +}; + +static void erofs_build_fault_attr(struct erofs_sb_info *sbi, + unsigned int rate) +{ + struct erofs_fault_info *ffi = &sbi->fault_info; + + if (rate) { + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + ffi->inject_type = (1 << FAULT_MAX) - 1; + } else { + memset(ffi, 0, sizeof(struct erofs_fault_info)); + } +} +#endif + +static void default_options(struct erofs_sb_info *sbi) +{ +#ifdef CONFIG_EROFS_FS_XATTR + set_opt(sbi, XATTR_USER); +#endif + +#ifdef CONFIG_EROFS_FS_POSIX_ACL + set_opt(sbi, POSIX_ACL); +#endif +} + +enum { + Opt_user_xattr, + Opt_nouser_xattr, + Opt_acl, + Opt_noacl, + Opt_fault_injection, + Opt_err +}; + +static match_table_t erofs_tokens = { + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_fault_injection, "fault_injection=%u"}, + {Opt_err, NULL} +}; + +static int parse_options(struct super_block *sb, char *options) +{ + substring_t args[MAX_OPT_ARGS]; + char *p; + int arg = 0; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + args[0].to = args[0].from = NULL; + token = match_token(p, erofs_tokens, args); + + switch (token) { +#ifdef CONFIG_EROFS_FS_XATTR + case Opt_user_xattr: + set_opt(EROFS_SB(sb), XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(EROFS_SB(sb), XATTR_USER); + break; +#else + case Opt_user_xattr: + infoln("user_xattr options not supported"); + break; + case Opt_nouser_xattr: + infoln("nouser_xattr options not supported"); + 
break; +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + case Opt_acl: + set_opt(EROFS_SB(sb), POSIX_ACL); + break; + case Opt_noacl: + clear_opt(EROFS_SB(sb), POSIX_ACL); + break; +#else + case Opt_acl: + infoln("acl options not supported"); + break; + case Opt_noacl: + infoln("noacl options not supported"); + break; +#endif + case Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_EROFS_FAULT_INJECTION + erofs_build_fault_attr(EROFS_SB(sb), arg); + set_opt(EROFS_SB(sb), FAULT_INJECTION); +#else + infoln("FAULT_INJECTION was not selected"); +#endif + break; + default: + errln("Unrecognized mount option \"%s\" " + "or missing value", p); + return -EINVAL; + } + } + return 0; +} + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + +static const struct address_space_operations managed_cache_aops; + +static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask) +{ + int ret = 1; /* 0 - busy */ + struct address_space *const mapping = page->mapping; + + BUG_ON(!PageLocked(page)); + BUG_ON(mapping->a_ops != &managed_cache_aops); + + if (PagePrivate(page)) + ret = erofs_try_to_free_cached_page(mapping, page); + + return ret; +} + +static void managed_cache_invalidatepage(struct page *page, + unsigned int offset, unsigned int length) +{ + const unsigned int stop = length + offset; + + BUG_ON(!PageLocked(page)); + + /* Check for overflow */ + BUG_ON(stop > PAGE_SIZE || stop < length); + + if (offset == 0 && stop == PAGE_SIZE) + while (!managed_cache_releasepage(page, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations managed_cache_aops = { + .releasepage = managed_cache_releasepage, + .invalidatepage = managed_cache_invalidatepage, +}; + +static struct inode *erofs_init_managed_cache(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + + if (unlikely(inode == NULL)) + return ERR_PTR(-ENOMEM); + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + + inode->i_mapping->a_ops = &managed_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | + __GFP_MOVABLE | __GFP_NOFAIL); + return inode; +} + +#endif + +static int erofs_read_super(struct super_block *sb, + const char *dev_name, void *data, int silent) +{ + struct inode *inode; + struct erofs_sb_info *sbi; + int err = -EINVAL; + + infoln("read_super, device -> %s", dev_name); + infoln("options -> %s", (char *)data); + + if (unlikely(!sb_set_blocksize(sb, EROFS_BLKSIZ))) { + errln("failed to set erofs blksize"); + goto err; + } + + sbi = kzalloc(sizeof(struct erofs_sb_info), GFP_KERNEL); + if (unlikely(sbi == NULL)) { + err = -ENOMEM; + goto err; + } + sb->s_fs_info = sbi; + + err = superblock_read(sb); + if (err) + goto err_sbread; + + sb->s_magic = EROFS_SUPER_MAGIC; + sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + + sb->s_op = &erofs_sops; + +#ifdef CONFIG_EROFS_FS_XATTR + sb->s_xattr = erofs_xattr_handlers; +#endif + + /* set erofs default mount options */ + default_options(sbi); + + err = parse_options(sb, data); + if (err) + goto err_parseopt; + + if (!silent) + infoln("root inode @ nid %llu", ROOT_NID(sbi)); + +#ifdef CONFIG_EROFS_FS_ZIP + INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC); +#endif + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + sbi->managed_cache = erofs_init_managed_cache(sb); + if (IS_ERR(sbi->managed_cache)) { + err = PTR_ERR(sbi->managed_cache); + goto err_init_managed_cache; + } +#endif + + /* get the root inode */ + inode = erofs_iget(sb, ROOT_NID(sbi), true); + if 
(IS_ERR(inode)) { + err = PTR_ERR(inode); + goto err_iget; + } + + if (!S_ISDIR(inode->i_mode)) { + errln("rootino(nid %llu) is not a directory(i_mode %o)", + ROOT_NID(sbi), inode->i_mode); + err = -EINVAL; + goto err_isdir; + } + + sb->s_root = d_make_root(inode); + if (sb->s_root == NULL) { + err = -ENOMEM; + goto err_makeroot; + } + + /* save the device name to sbi */ + sbi->dev_name = __getname(); + if (sbi->dev_name == NULL) { + err = -ENOMEM; + goto err_devname; + } + + snprintf(sbi->dev_name, PATH_MAX, "%s", dev_name); + sbi->dev_name[PATH_MAX - 1] = '\0'; + + erofs_register_super(sb); + + /* + * We already have a positive dentry, which was instantiated + * by d_make_root. Just need to d_rehash it. + */ + d_rehash(sb->s_root); + + if (!silent) + infoln("mounted on %s with opts: %s.", dev_name, + (char *)data); + return 0; + /* + * please add a label for each exit point and use + * the following name convention, thus new features + * can be integrated easily without renaming labels. + */ +err_devname: + dput(sb->s_root); +err_makeroot: +err_isdir: + if (sb->s_root == NULL) + iput(inode); +err_iget: +#ifdef EROFS_FS_HAS_MANAGED_CACHE + iput(sbi->managed_cache); +err_init_managed_cache: +#endif +err_parseopt: +err_sbread: + sb->s_fs_info = NULL; + kfree(sbi); +err: + return err; +} + +/* + * could be triggered after deactivate_locked_super() + * is called, thus including umount and failed to initialize. + */ +static void erofs_put_super(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + /* for cases which are failed in "read_super" */ + if (sbi == NULL) + return; + + WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); + + infoln("unmounted for %s", sbi->dev_name); + __putname(sbi->dev_name); + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + iput(sbi->managed_cache); +#endif + + mutex_lock(&sbi->umount_mutex); + +#ifdef CONFIG_EROFS_FS_ZIP + erofs_workstation_cleanup_all(sb); +#endif + + erofs_unregister_super(sb); + mutex_unlock(&sbi->umount_mutex); + + kfree(sbi); + sb->s_fs_info = NULL; +} + + +struct erofs_mount_private { + const char *dev_name; + char *options; +}; + +/* support mount_bdev() with options */ +static int erofs_fill_super(struct super_block *sb, + void *_priv, int silent) +{ + struct erofs_mount_private *priv = _priv; + + return erofs_read_super(sb, priv->dev_name, + priv->options, silent); +} + +static struct dentry *erofs_mount( + struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct erofs_mount_private priv = { + .dev_name = dev_name, + .options = data + }; + + return mount_bdev(fs_type, flags, dev_name, + &priv, erofs_fill_super); +} + +static void erofs_kill_sb(struct super_block *sb) +{ + kill_block_super(sb); +} + +static struct shrinker erofs_shrinker_info = { + .scan_objects = erofs_shrink_scan, + .count_objects = erofs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + +static struct file_system_type erofs_fs_type = { + .owner = THIS_MODULE, + .name = "erofs", + .mount = erofs_mount, + .kill_sb = erofs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("erofs"); + +#ifdef CONFIG_EROFS_FS_ZIP +extern int z_erofs_init_zip_subsystem(void); +extern void z_erofs_exit_zip_subsystem(void); +#endif + +static int __init erofs_module_init(void) +{ + int err; + + erofs_check_ondisk_layout_definitions(); + infoln("initializing erofs " EROFS_VERSION); + + err = erofs_init_inode_cache(); + if (err) + goto icache_err; + + err = register_shrinker(&erofs_shrinker_info); + if (err) + goto shrinker_err; + +#ifdef 
CONFIG_EROFS_FS_ZIP + err = z_erofs_init_zip_subsystem(); + if (err) + goto zip_err; +#endif + + err = register_filesystem(&erofs_fs_type); + if (err) + goto fs_err; + + infoln("successfully to initialize erofs"); + return 0; + +fs_err: +#ifdef CONFIG_EROFS_FS_ZIP + z_erofs_exit_zip_subsystem(); +zip_err: +#endif + unregister_shrinker(&erofs_shrinker_info); +shrinker_err: + erofs_exit_inode_cache(); +icache_err: + return err; +} + +static void __exit erofs_module_exit(void) +{ + unregister_filesystem(&erofs_fs_type); +#ifdef CONFIG_EROFS_FS_ZIP + z_erofs_exit_zip_subsystem(); +#endif + unregister_shrinker(&erofs_shrinker_info); + erofs_exit_inode_cache(); + infoln("successfully finalize erofs"); +} + +/* get filesystem statistics */ +static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = sb->s_magic; + buf->f_bsize = EROFS_BLKSIZ; + buf->f_blocks = sbi->blocks; + buf->f_bfree = buf->f_bavail = 0; + + buf->f_files = ULLONG_MAX; + buf->f_ffree = ULLONG_MAX - sbi->inos; + + buf->f_namelen = EROFS_NAME_LEN; + + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +static int erofs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb); + +#ifdef CONFIG_EROFS_FS_XATTR + if (test_opt(sbi, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + else + seq_puts(seq, ",nouser_xattr"); +#endif +#ifdef CONFIG_EROFS_FS_POSIX_ACL + if (test_opt(sbi, POSIX_ACL)) + seq_puts(seq, ",acl"); + else + seq_puts(seq, ",noacl"); +#endif +#ifdef CONFIG_EROFS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) + seq_printf(seq, ",fault_injection=%u", + sbi->fault_info.inject_rate); +#endif + return 0; +} + +static int erofs_remount(struct super_block *sb, int *flags, char *data) +{ + BUG_ON(!sb_rdonly(sb)); + + *flags |= MS_RDONLY; + return 0; +} + +const struct super_operations erofs_sops = { + .put_super = erofs_put_super, + .alloc_inode = alloc_inode, + .destroy_inode = destroy_inode, + .statfs = erofs_statfs, + .show_options = erofs_show_options, + .remount_fs = erofs_remount, +}; + +module_init(erofs_module_init); +module_exit(erofs_module_exit); + +MODULE_DESCRIPTION("Enhanced ROM File System"); +MODULE_AUTHOR("Gao Xiang, Yu Chao, Miao Xie, CONSUMER BG, HUAWEI Inc."); +MODULE_LICENSE("GPL"); + diff --git a/drivers/staging/erofs/unzip_lz4.c b/drivers/staging/erofs/unzip_lz4.c new file mode 100644 index 000000000000..b1ea23f66c4e --- /dev/null +++ b/drivers/staging/erofs/unzip_lz4.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * linux/drivers/staging/erofs/unzip_lz4.c + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * Original code taken from 'linux/lib/lz4/lz4_decompress.c' + */ + +/* + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011 - 2016, Yann Collet. + * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php) + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 + * + * Changed for kernel usage by: + * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> + */ +#include "internal.h" +#include <asm/unaligned.h> +#include "lz4defs.h" + +/* + * no public solution to solve our requirement yet. + * see: <required buffer size for LZ4_decompress_safe_partial> + * https://groups.google.com/forum/#!topic/lz4c/_3kkz5N6n00 + */ +static FORCE_INLINE int customized_lz4_decompress_safe_partial( + const void * const source, + void * const dest, + int inputSize, + int outputSize) +{ + /* Local Variables */ + const BYTE *ip = (const BYTE *) source; + const BYTE * const iend = ip + inputSize; + + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + outputSize; + BYTE *cpy; + + static const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; + static const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 }; + + /* Empty output buffer */ + if (unlikely(outputSize == 0)) + return ((inputSize == 1) && (*ip == 0)) ? 
0 : -1; + + /* Main Loop : decode sequences */ + while (1) { + size_t length; + const BYTE *match; + size_t offset; + + /* get literal length */ + unsigned int const token = *ip++; + + length = token>>ML_BITS; + + if (length == RUN_MASK) { + unsigned int s; + + do { + s = *ip++; + length += s; + } while ((ip < iend - RUN_MASK) & (s == 255)); + + if (unlikely((size_t)(op + length) < (size_t)(op))) { + /* overflow detection */ + goto _output_error; + } + if (unlikely((size_t)(ip + length) < (size_t)(ip))) { + /* overflow detection */ + goto _output_error; + } + } + + /* copy literals */ + cpy = op + length; + if ((cpy > oend - WILDCOPYLENGTH) || + (ip + length > iend - (2 + 1 + LASTLITERALS))) { + if (cpy > oend) { + memcpy(op, ip, length = oend - op); + op += length; + break; + } + + if (unlikely(ip + length > iend)) { + /* + * Error : + * read attempt beyond + * end of input buffer + */ + goto _output_error; + } + + memcpy(op, ip, length); + ip += length; + op += length; + + if (ip > iend - 2) + break; + /* Necessarily EOF, due to parsing restrictions */ + /* break; */ + } else { + LZ4_wildCopy(op, ip, cpy); + ip += length; + op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + + if (unlikely(match < (const BYTE *)dest)) { + /* Error : offset outside buffers */ + goto _output_error; + } + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned int s; + + do { + s = *ip++; + + if (ip > iend - LASTLITERALS) + goto _output_error; + + length += s; + } while (s == 255); + + if (unlikely((size_t)(op + length) < (size_t)op)) { + /* overflow detection */ + goto _output_error; + } + } + + length += MINMATCH; + + /* copy match within block */ + cpy = op + length; + + if (unlikely(cpy >= oend - WILDCOPYLENGTH)) { + if (cpy >= oend) { + while (op < oend) + *op++ = *match++; + break; + } + goto __match; + } + + /* costs ~1%; silence an msan warning when offset == 0 */ + LZ4_write32(op, (U32)offset); + + if (unlikely(offset < 8)) { + const int dec64 = dec64table[offset]; + + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[offset]; + memcpy(op + 4, match, 4); + match -= dec64; + } else { + LZ4_copy8(op, match); + match += 8; + } + + op += 8; + + if (unlikely(cpy > oend - 12)) { + BYTE * const oCopyLimit = oend - (WILDCOPYLENGTH - 1); + + if (op < oCopyLimit) { + LZ4_wildCopy(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } +__match: + while (op < cpy) + *op++ = *match++; + } else { + LZ4_copy8(op, match); + + if (length > 16) + LZ4_wildCopy(op + 8, match + 8, cpy); + } + + op = cpy; /* correction */ + } + DBG_BUGON((void *)ip - source > inputSize); + DBG_BUGON((void *)op - dest > outputSize); + + /* Nb of output bytes decoded */ + return (int) ((void *)op - dest); + + /* Overflow error detected */ +_output_error: + return -ERANGE; +} + +int z_erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen) +{ + int ret = customized_lz4_decompress_safe_partial(in, + out, inlen, outlen); + + if (ret >= 0) + return ret; + + /* + * LZ4_decompress_safe will return an error code + * (< 0) if decompression failed + */ + errln("%s, failed to decompress, in[%p, %zu] outlen[%p, %zu]", + __func__, in, inlen, out, outlen); + WARN_ON(1); + print_hex_dump(KERN_DEBUG, "raw data [in]: ", DUMP_PREFIX_OFFSET, + 16, 1, in, inlen, true); + print_hex_dump(KERN_DEBUG, "raw data [out]: ", DUMP_PREFIX_OFFSET, + 16, 1, out, outlen, true); + return -EIO; +} + diff --git 
a/drivers/staging/erofs/unzip_pagevec.h b/drivers/staging/erofs/unzip_pagevec.h new file mode 100644 index 000000000000..0956615b86f7 --- /dev/null +++ b/drivers/staging/erofs/unzip_pagevec.h @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * linux/drivers/staging/erofs/unzip_pagevec.h + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#ifndef __EROFS_UNZIP_PAGEVEC_H +#define __EROFS_UNZIP_PAGEVEC_H + +#include <linux/tagptr.h> + +/* page type in pagevec for unzip subsystem */ +enum z_erofs_page_type { + /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ + Z_EROFS_PAGE_TYPE_EXCLUSIVE, + + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, + + Z_EROFS_VLE_PAGE_TYPE_HEAD, + Z_EROFS_VLE_PAGE_TYPE_MAX +}; + +extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") + __bad_page_type_exclusive(void); + +/* pagevec tagged pointer */ +typedef tagptr2_t erofs_vtptr_t; + +/* pagevec collector */ +struct z_erofs_pagevec_ctor { + struct page *curr, *next; + erofs_vtptr_t *pages; + + unsigned int nr, index; +}; + +static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, + bool atomic) +{ + if (ctor->curr == NULL) + return; + + if (atomic) + kunmap_atomic(ctor->pages); + else + kunmap(ctor->curr); +} + +static inline struct page * +z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, + unsigned nr) +{ + unsigned index; + + /* keep away from occupied pages */ + if (ctor->next != NULL) + return ctor->next; + + for (index = 0; index < nr; ++index) { + const erofs_vtptr_t t = ctor->pages[index]; + const unsigned tags = tagptr_unfold_tags(t); + + if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) + return tagptr_unfold_ptr(t); + } + + if (unlikely(nr >= ctor->nr)) + BUG(); + + return NULL; +} + +static inline void +z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, + bool atomic) +{ + struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); + + z_erofs_pagevec_ctor_exit(ctor, atomic); + + ctor->curr = next; + ctor->next = NULL; + ctor->pages = atomic ? 
+ kmap_atomic(ctor->curr) : kmap(ctor->curr); + + ctor->nr = PAGE_SIZE / sizeof(struct page *); + ctor->index = 0; +} + +static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, + unsigned nr, + erofs_vtptr_t *pages, unsigned i) +{ + ctor->nr = nr; + ctor->curr = ctor->next = NULL; + ctor->pages = pages; + + if (i >= nr) { + i -= nr; + z_erofs_pagevec_ctor_pagedown(ctor, false); + while (i > ctor->nr) { + i -= ctor->nr; + z_erofs_pagevec_ctor_pagedown(ctor, false); + } + } + + ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); + ctor->index = i; +} + +static inline bool +z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor, + struct page *page, + enum z_erofs_page_type type, + bool *occupied) +{ + *occupied = false; + if (unlikely(ctor->next == NULL && type)) + if (ctor->index + 1 == ctor->nr) + return false; + + if (unlikely(ctor->index >= ctor->nr)) + z_erofs_pagevec_ctor_pagedown(ctor, false); + + /* exclusive page type must be 0 */ + if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) + __bad_page_type_exclusive(); + + /* should remind that collector->next never equal to 1, 2 */ + if (type == (uintptr_t)ctor->next) { + ctor->next = page; + *occupied = true; + } + + ctor->pages[ctor->index++] = + tagptr_fold(erofs_vtptr_t, page, type); + return true; +} + +static inline struct page * +z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor, + enum z_erofs_page_type *type) +{ + erofs_vtptr_t t; + + if (unlikely(ctor->index >= ctor->nr)) { + BUG_ON(ctor->next == NULL); + z_erofs_pagevec_ctor_pagedown(ctor, true); + } + + t = ctor->pages[ctor->index]; + + *type = tagptr_unfold_tags(t); + + /* should remind that collector->next never equal to 1, 2 */ + if (*type == (uintptr_t)ctor->next) + ctor->next = tagptr_unfold_ptr(t); + + ctor->pages[ctor->index++] = + tagptr_fold(erofs_vtptr_t, NULL, 0); + + return tagptr_unfold_ptr(t); +} + +#endif + diff --git a/drivers/staging/erofs/unzip_vle.c b/drivers/staging/erofs/unzip_vle.c new file mode 100644 index 000000000000..8721f0a41d15 --- /dev/null +++ b/drivers/staging/erofs/unzip_vle.c @@ -0,0 +1,1656 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/unzip_vle.c + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#include "unzip_vle.h" +#include <linux/prefetch.h> + +static struct workqueue_struct *z_erofs_workqueue __read_mostly; +static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly; + +void z_erofs_exit_zip_subsystem(void) +{ + BUG_ON(z_erofs_workqueue == NULL); + BUG_ON(z_erofs_workgroup_cachep == NULL); + + destroy_workqueue(z_erofs_workqueue); + kmem_cache_destroy(z_erofs_workgroup_cachep); +} + +static inline int init_unzip_workqueue(void) +{ + const unsigned onlinecpus = num_possible_cpus(); + + /* + * we don't need too many threads, limiting threads + * could improve scheduling performance. + */ + z_erofs_workqueue = alloc_workqueue("erofs_unzipd", + WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE, + onlinecpus + onlinecpus / 4); + + return z_erofs_workqueue != NULL ? 
0 : -ENOMEM; +} + +int z_erofs_init_zip_subsystem(void) +{ + z_erofs_workgroup_cachep = + kmem_cache_create("erofs_compress", + Z_EROFS_WORKGROUP_SIZE, 0, + SLAB_RECLAIM_ACCOUNT, NULL); + + if (z_erofs_workgroup_cachep != NULL) { + if (!init_unzip_workqueue()) + return 0; + + kmem_cache_destroy(z_erofs_workgroup_cachep); + } + return -ENOMEM; +} + +enum z_erofs_vle_work_role { + Z_EROFS_VLE_WORK_SECONDARY, + Z_EROFS_VLE_WORK_PRIMARY, + /* + * The current work has at least been linked with the following + * processed chained works, which means if the processing page + * is the tail partial page of the work, the current work can + * safely use the whole page, as illustrated below: + * +--------------+-------------------------------------------+ + * | tail page | head page (of the previous work) | + * +--------------+-------------------------------------------+ + * /\ which belongs to the current work + * [ (*) this page can be used for the current work itself. ] + */ + Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED, + Z_EROFS_VLE_WORK_MAX +}; + +struct z_erofs_vle_work_builder { + enum z_erofs_vle_work_role role; + /* + * 'hosted = false' means that the current workgroup doesn't belong to + * the owned chained workgroups. In the other words, it is none of our + * business to submit this workgroup. + */ + bool hosted; + + struct z_erofs_vle_workgroup *grp; + struct z_erofs_vle_work *work; + struct z_erofs_pagevec_ctor vector; + + /* pages used for reading the compressed data */ + struct page **compressed_pages; + unsigned compressed_deficit; +}; + +#define VLE_WORK_BUILDER_INIT() \ + { .work = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED } + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + +static bool grab_managed_cache_pages(struct address_space *mapping, + erofs_blk_t start, + struct page **compressed_pages, + int clusterblks, + bool reserve_allocation) +{ + bool noio = true; + unsigned int i; + + /* TODO: optimize by introducing find_get_pages_range */ + for (i = 0; i < clusterblks; ++i) { + struct page *page, *found; + + if (READ_ONCE(compressed_pages[i]) != NULL) + continue; + + page = found = find_get_page(mapping, start + i); + if (found == NULL) { + noio = false; + if (!reserve_allocation) + continue; + page = EROFS_UNALLOCATED_CACHED_PAGE; + } + + if (NULL == cmpxchg(compressed_pages + i, NULL, page)) + continue; + + if (found != NULL) + put_page(found); + } + return noio; +} + +/* called by erofs_shrinker to get rid of all compressed_pages */ +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *egrp) +{ + struct z_erofs_vle_workgroup *const grp = + container_of(egrp, struct z_erofs_vle_workgroup, obj); + struct address_space *const mapping = sbi->managed_cache->i_mapping; + const int clusterpages = erofs_clusterpages(sbi); + int i; + + /* + * refcount of workgroup is now freezed as 1, + * therefore no need to worry about available decompression users. 
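+	 * (it was frozen via erofs_workgroup_try_to_freeze() on the
+	 * shrink path before this function is called)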
+ */ + for (i = 0; i < clusterpages; ++i) { + struct page *page = grp->compressed_pages[i]; + + if (page == NULL || page->mapping != mapping) + continue; + + /* block other users from reclaiming or migrating the page */ + if (!trylock_page(page)) + return -EBUSY; + + /* barrier is implied in the following 'unlock_page' */ + WRITE_ONCE(grp->compressed_pages[i], NULL); + + set_page_private(page, 0); + ClearPagePrivate(page); + + unlock_page(page); + put_page(page); + } + return 0; +} + +int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page) +{ + struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb); + const unsigned int clusterpages = erofs_clusterpages(sbi); + + struct z_erofs_vle_workgroup *grp; + int ret = 0; /* 0 - busy */ + + /* prevent the workgroup from being freed */ + rcu_read_lock(); + grp = (void *)page_private(page); + + if (erofs_workgroup_try_to_freeze(&grp->obj, 1)) { + unsigned int i; + + for (i = 0; i < clusterpages; ++i) { + if (grp->compressed_pages[i] == page) { + WRITE_ONCE(grp->compressed_pages[i], NULL); + ret = 1; + break; + } + } + erofs_workgroup_unfreeze(&grp->obj, 1); + } + rcu_read_unlock(); + + if (ret) { + ClearPagePrivate(page); + put_page(page); + } + return ret; +} +#endif + +/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ +static inline bool try_to_reuse_as_compressed_page( + struct z_erofs_vle_work_builder *b, + struct page *page) +{ + while (b->compressed_deficit) { + --b->compressed_deficit; + if (NULL == cmpxchg(b->compressed_pages++, NULL, page)) + return true; + } + + return false; +} + +/* callers must be with work->lock held */ +static int z_erofs_vle_work_add_page( + struct z_erofs_vle_work_builder *builder, + struct page *page, + enum z_erofs_page_type type) +{ + int ret; + bool occupied; + + /* give priority for the compressed data storage */ + if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY && + type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && + try_to_reuse_as_compressed_page(builder, page)) + return 0; + + ret = z_erofs_pagevec_ctor_enqueue(&builder->vector, + page, type, &occupied); + builder->work->vcnt += (unsigned)ret; + + return ret ? 0 : -EAGAIN; +} + +static inline bool try_to_claim_workgroup( + struct z_erofs_vle_workgroup *grp, + z_erofs_vle_owned_workgrp_t *owned_head, + bool *hosted) +{ + DBG_BUGON(*hosted == true); + + /* let's claim these following types of workgroup */ +retry: + if (grp->next == Z_EROFS_VLE_WORKGRP_NIL) { + /* type 1, nil workgroup */ + if (Z_EROFS_VLE_WORKGRP_NIL != cmpxchg(&grp->next, + Z_EROFS_VLE_WORKGRP_NIL, *owned_head)) + goto retry; + + *owned_head = grp; + *hosted = true; + } else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) { + /* + * type 2, link to the end of a existing open chain, + * be careful that its submission itself is governed + * by the original owned chain. 
+ */ + if (Z_EROFS_VLE_WORKGRP_TAIL != cmpxchg(&grp->next, + Z_EROFS_VLE_WORKGRP_TAIL, *owned_head)) + goto retry; + + *owned_head = Z_EROFS_VLE_WORKGRP_TAIL; + } else + return false; /* :( better luck next time */ + + return true; /* lucky, I am the followee :) */ +} + +static struct z_erofs_vle_work * +z_erofs_vle_work_lookup(struct super_block *sb, + pgoff_t idx, unsigned pageofs, + struct z_erofs_vle_workgroup **grp_ret, + enum z_erofs_vle_work_role *role, + z_erofs_vle_owned_workgrp_t *owned_head, + bool *hosted) +{ + bool tag, primary; + struct erofs_workgroup *egrp; + struct z_erofs_vle_workgroup *grp; + struct z_erofs_vle_work *work; + + egrp = erofs_find_workgroup(sb, idx, &tag); + if (egrp == NULL) { + *grp_ret = NULL; + return NULL; + } + + *grp_ret = grp = container_of(egrp, + struct z_erofs_vle_workgroup, obj); + +#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF + work = z_erofs_vle_grab_work(grp, pageofs); + primary = true; +#else + BUG(); +#endif + + DBG_BUGON(work->pageofs != pageofs); + + /* + * lock must be taken first to avoid grp->next == NIL between + * claiming workgroup and adding pages: + * grp->next != NIL + * grp->next = NIL + * mutex_unlock_all + * mutex_lock(&work->lock) + * add all pages to pagevec + * + * [correct locking case 1]: + * mutex_lock(grp->work[a]) + * ... + * mutex_lock(grp->work[b]) mutex_lock(grp->work[c]) + * ... *role = SECONDARY + * add all pages to pagevec + * ... + * mutex_unlock(grp->work[c]) + * mutex_lock(grp->work[c]) + * ... + * grp->next = NIL + * mutex_unlock_all + * + * [correct locking case 2]: + * mutex_lock(grp->work[b]) + * ... + * mutex_lock(grp->work[a]) + * ... + * mutex_lock(grp->work[c]) + * ... + * grp->next = NIL + * mutex_unlock_all + * mutex_lock(grp->work[a]) + * *role = PRIMARY_OWNER + * add all pages to pagevec + * ... + */ + mutex_lock(&work->lock); + + *hosted = false; + if (!primary) + *role = Z_EROFS_VLE_WORK_SECONDARY; + /* claim the workgroup if possible */ + else if (try_to_claim_workgroup(grp, owned_head, hosted)) + *role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED; + else + *role = Z_EROFS_VLE_WORK_PRIMARY; + + return work; +} + +static struct z_erofs_vle_work * +z_erofs_vle_work_register(struct super_block *sb, + struct z_erofs_vle_workgroup **grp_ret, + struct erofs_map_blocks *map, + pgoff_t index, unsigned pageofs, + enum z_erofs_vle_work_role *role, + z_erofs_vle_owned_workgrp_t *owned_head, + bool *hosted) +{ + bool newgrp = false; + struct z_erofs_vle_workgroup *grp = *grp_ret; + struct z_erofs_vle_work *work; + +#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF + BUG_ON(grp != NULL); +#else + if (grp != NULL) + goto skip; +#endif + /* no available workgroup, let's allocate one */ + grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS); + if (unlikely(grp == NULL)) + return ERR_PTR(-ENOMEM); + + grp->obj.index = index; + grp->llen = map->m_llen; + + z_erofs_vle_set_workgrp_fmt(grp, + (map->m_flags & EROFS_MAP_ZIPPED) ? 
+ Z_EROFS_VLE_WORKGRP_FMT_LZ4 : + Z_EROFS_VLE_WORKGRP_FMT_PLAIN); + atomic_set(&grp->obj.refcount, 1); + + /* new workgrps have been claimed as type 1 */ + WRITE_ONCE(grp->next, *owned_head); + /* primary and followed work for all new workgrps */ + *role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED; + /* it should be submitted by ourselves */ + *hosted = true; + + newgrp = true; +#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF +skip: + /* currently unimplemented */ + BUG(); +#else + work = z_erofs_vle_grab_primary_work(grp); +#endif + work->pageofs = pageofs; + + mutex_init(&work->lock); + + if (newgrp) { + int err = erofs_register_workgroup(sb, &grp->obj, 0); + + if (err) { + kmem_cache_free(z_erofs_workgroup_cachep, grp); + return ERR_PTR(-EAGAIN); + } + } + + *owned_head = *grp_ret = grp; + + mutex_lock(&work->lock); + return work; +} + +static inline void __update_workgrp_llen(struct z_erofs_vle_workgroup *grp, + unsigned int llen) +{ + while (1) { + unsigned int orig_llen = grp->llen; + + if (orig_llen >= llen || orig_llen == + cmpxchg(&grp->llen, orig_llen, llen)) + break; + } +} + +#define builder_is_followed(builder) \ + ((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED) + +static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_builder *builder, + struct super_block *sb, + struct erofs_map_blocks *map, + z_erofs_vle_owned_workgrp_t *owned_head) +{ + const unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb)); + const erofs_blk_t index = erofs_blknr(map->m_pa); + const unsigned pageofs = map->m_la & ~PAGE_MASK; + struct z_erofs_vle_workgroup *grp; + struct z_erofs_vle_work *work; + + DBG_BUGON(builder->work != NULL); + + /* must be Z_EROFS_WORK_TAIL or the next chained work */ + DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_NIL); + DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); + + DBG_BUGON(erofs_blkoff(map->m_pa)); + +repeat: + work = z_erofs_vle_work_lookup(sb, index, + pageofs, &grp, &builder->role, owned_head, &builder->hosted); + if (work != NULL) { + __update_workgrp_llen(grp, map->m_llen); + goto got_it; + } + + work = z_erofs_vle_work_register(sb, &grp, map, index, pageofs, + &builder->role, owned_head, &builder->hosted); + + if (unlikely(work == ERR_PTR(-EAGAIN))) + goto repeat; + + if (unlikely(IS_ERR(work))) + return PTR_ERR(work); +got_it: + z_erofs_pagevec_ctor_init(&builder->vector, + Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt); + + if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY) { + /* enable possibly in-place decompression */ + builder->compressed_pages = grp->compressed_pages; + builder->compressed_deficit = clusterpages; + } else { + builder->compressed_pages = NULL; + builder->compressed_deficit = 0; + } + + builder->grp = grp; + builder->work = work; + return 0; +} + +/* + * keep in mind that no referenced workgroups will be freed + * only after a RCU grace period, so rcu_read_lock() could + * prevent a workgroup from being freed. 
+ */ +static void z_erofs_rcu_callback(struct rcu_head *head) +{ + struct z_erofs_vle_work *work = container_of(head, + struct z_erofs_vle_work, rcu); + struct z_erofs_vle_workgroup *grp = + z_erofs_vle_work_workgroup(work, true); + + kmem_cache_free(z_erofs_workgroup_cachep, grp); +} + +void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) +{ + struct z_erofs_vle_workgroup *const vgrp = container_of(grp, + struct z_erofs_vle_workgroup, obj); + struct z_erofs_vle_work *const work = &vgrp->work; + + call_rcu(&work->rcu, z_erofs_rcu_callback); +} + +static void __z_erofs_vle_work_release(struct z_erofs_vle_workgroup *grp, + struct z_erofs_vle_work *work __maybe_unused) +{ + erofs_workgroup_put(&grp->obj); +} + +void z_erofs_vle_work_release(struct z_erofs_vle_work *work) +{ + struct z_erofs_vle_workgroup *grp = + z_erofs_vle_work_workgroup(work, true); + + __z_erofs_vle_work_release(grp, work); +} + +static inline bool +z_erofs_vle_work_iter_end(struct z_erofs_vle_work_builder *builder) +{ + struct z_erofs_vle_work *work = builder->work; + + if (work == NULL) + return false; + + z_erofs_pagevec_ctor_exit(&builder->vector, false); + mutex_unlock(&work->lock); + + /* + * if all pending pages are added, don't hold work reference + * any longer if the current work isn't hosted by ourselves. + */ + if (!builder->hosted) + __z_erofs_vle_work_release(builder->grp, work); + + builder->work = NULL; + builder->grp = NULL; + return true; +} + +static inline struct page *__stagingpage_alloc(struct list_head *pagepool, + gfp_t gfp) +{ + struct page *page = erofs_allocpage(pagepool, gfp); + + if (unlikely(page == NULL)) + return NULL; + + page->mapping = Z_EROFS_MAPPING_STAGING; + return page; +} + +struct z_erofs_vle_frontend { + struct inode *const inode; + + struct z_erofs_vle_work_builder builder; + struct erofs_map_blocks_iter m_iter; + + z_erofs_vle_owned_workgrp_t owned_head; + + bool initial; +#if (EROFS_FS_ZIP_CACHE_LVL >= 2) + erofs_off_t cachedzone_la; +#endif +}; + +#define VLE_FRONTEND_INIT(__i) { \ + .inode = __i, \ + .m_iter = { \ + { .m_llen = 0, .m_plen = 0 }, \ + .mpage = NULL \ + }, \ + .builder = VLE_WORK_BUILDER_INIT(), \ + .owned_head = Z_EROFS_VLE_WORKGRP_TAIL, \ + .initial = true, } + +static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe, + struct page *page, + struct list_head *page_pool) +{ + struct super_block *const sb = fe->inode->i_sb; + struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb); + struct erofs_map_blocks_iter *const m = &fe->m_iter; + struct erofs_map_blocks *const map = &m->map; + struct z_erofs_vle_work_builder *const builder = &fe->builder; + const loff_t offset = page_offset(page); + + bool tight = builder_is_followed(builder); + struct z_erofs_vle_work *work = builder->work; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + struct address_space *const mngda = sbi->managed_cache->i_mapping; + struct z_erofs_vle_workgroup *grp; + bool noio_outoforder; +#endif + + enum z_erofs_page_type page_type; + unsigned cur, end, spiltted, index; + int err; + + /* register locked file pages as online pages in pack */ + z_erofs_onlinepage_init(page); + + spiltted = 0; + end = PAGE_SIZE; +repeat: + cur = end - 1; + + /* lucky, within the range of the current map_blocks */ + if (offset + cur >= map->m_la && + offset + cur < map->m_la + map->m_llen) + goto hitted; + + /* go ahead the next map_blocks */ + debugln("%s: [out-of-range] pos %llu", __func__, offset + cur); + + if (!z_erofs_vle_work_iter_end(builder)) + fe->initial = false; + + map->m_la = offset + cur; 
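+	/* the lookup below fills "map" with the extent covering this m_la */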
+ map->m_llen = 0; + err = erofs_map_blocks_iter(fe->inode, map, &m->mpage, 0); + if (unlikely(err)) + goto err_out; + + /* deal with hole (FIXME! broken now) */ + if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) + goto hitted; + + DBG_BUGON(map->m_plen != 1 << sbi->clusterbits); + BUG_ON(erofs_blkoff(map->m_pa)); + + err = z_erofs_vle_work_iter_begin(builder, sb, map, &fe->owned_head); + if (unlikely(err)) + goto err_out; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + grp = fe->builder.grp; + + /* let's do out-of-order decompression for noio */ + noio_outoforder = grab_managed_cache_pages(mngda, + erofs_blknr(map->m_pa), + grp->compressed_pages, erofs_blknr(map->m_plen), + /* compressed page caching selection strategy */ + fe->initial | (EROFS_FS_ZIP_CACHE_LVL >= 2 ? + map->m_la < fe->cachedzone_la : 0)); + + if (noio_outoforder && builder_is_followed(builder)) + builder->role = Z_EROFS_VLE_WORK_PRIMARY; +#endif + + tight &= builder_is_followed(builder); + work = builder->work; +hitted: + cur = end - min_t(unsigned, offset + end - map->m_la, end); + if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) { + zero_user_segment(page, cur, end); + goto next_part; + } + + /* let's derive page type */ + page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : + (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : + (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); + +retry: + err = z_erofs_vle_work_add_page(builder, page, page_type); + /* should allocate an additional staging page for pagevec */ + if (err == -EAGAIN) { + struct page *const newpage = + __stagingpage_alloc(page_pool, GFP_NOFS); + + err = z_erofs_vle_work_add_page(builder, + newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE); + if (!err) + goto retry; + } + + if (unlikely(err)) + goto err_out; + + index = page->index - map->m_la / PAGE_SIZE; + + /* FIXME! avoid the last relundant fixup & endio */ + z_erofs_onlinepage_fixup(page, index, true); + ++spiltted; + + /* also update nr_pages and increase queued_pages */ + work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1); +next_part: + /* can be used for verification */ + map->m_llen = offset + cur - map->m_la; + + end = cur; + if (end > 0) + goto repeat; + + /* FIXME! 
avoid the last relundant fixup & endio */ + z_erofs_onlinepage_endio(page); + + debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu", + __func__, page, spiltted, map->m_llen); + return 0; + +err_out: + /* TODO: the missing error handing cases */ + return err; +} + +static void z_erofs_vle_unzip_kickoff(void *ptr, int bios) +{ + tagptr1_t t = tagptr_init(tagptr1_t, ptr); + struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t); + bool background = tagptr_unfold_tags(t); + + if (atomic_add_return(bios, &io->pending_bios)) + return; + + if (background) + queue_work(z_erofs_workqueue, &io->u.work); + else + wake_up(&io->u.wait); +} + +static inline void z_erofs_vle_read_endio(struct bio *bio) +{ + const blk_status_t err = bio->bi_status; + unsigned i; + struct bio_vec *bvec; +#ifdef EROFS_FS_HAS_MANAGED_CACHE + struct address_space *mngda = NULL; +#endif + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + bool cachemngd = false; + + DBG_BUGON(PageUptodate(page)); + BUG_ON(page->mapping == NULL); + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (unlikely(mngda == NULL && !z_erofs_is_stagingpage(page))) { + struct inode *const inode = page->mapping->host; + struct super_block *const sb = inode->i_sb; + + mngda = EROFS_SB(sb)->managed_cache->i_mapping; + } + + /* + * If mngda has not gotten, it equals NULL, + * however, page->mapping never be NULL if working properly. + */ + cachemngd = (page->mapping == mngda); +#endif + + if (unlikely(err)) + SetPageError(page); + else if (cachemngd) + SetPageUptodate(page); + + if (cachemngd) + unlock_page(page); + } + + z_erofs_vle_unzip_kickoff(bio->bi_private, -1); + bio_put(bio); +} + +static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES]; +static DEFINE_MUTEX(z_pagemap_global_lock); + +static int z_erofs_vle_unzip(struct super_block *sb, + struct z_erofs_vle_workgroup *grp, + struct list_head *page_pool) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); +#ifdef EROFS_FS_HAS_MANAGED_CACHE + struct address_space *const mngda = sbi->managed_cache->i_mapping; +#endif + const unsigned clusterpages = erofs_clusterpages(sbi); + + struct z_erofs_pagevec_ctor ctor; + unsigned nr_pages; +#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF + unsigned sparsemem_pages = 0; +#endif + struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES]; + struct page **pages, **compressed_pages, *page; + unsigned i, llen; + + enum z_erofs_page_type page_type; + bool overlapped; + struct z_erofs_vle_work *work; + void *vout; + int err; + + might_sleep(); +#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF + work = z_erofs_vle_grab_primary_work(grp); +#else + BUG(); +#endif + BUG_ON(!READ_ONCE(work->nr_pages)); + + mutex_lock(&work->lock); + nr_pages = work->nr_pages; + + if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES)) + pages = pages_onstack; + else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES && + mutex_trylock(&z_pagemap_global_lock)) + pages = z_pagemap_global; + else { +repeat: + pages = kvmalloc_array(nr_pages, + sizeof(struct page *), GFP_KERNEL); + + /* fallback to global pagemap for the lowmem scenario */ + if (unlikely(pages == NULL)) { + if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES) + goto repeat; + else { + mutex_lock(&z_pagemap_global_lock); + pages = z_pagemap_global; + } + } + } + + for (i = 0; i < nr_pages; ++i) + pages[i] = NULL; + + z_erofs_pagevec_ctor_init(&ctor, + Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0); + + for (i = 0; i < work->vcnt; ++i) { + unsigned pagenr; + + page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type); + + 
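+ /*
+ * Each queued page carries its sub-index within this work (kept in
+ * the onlinepage private word); it is used below as the slot in the
+ * decompressed output array pages[], with head pages pinned to slot 0.
+ */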
/* all pages in pagevec ought to be valid */ + DBG_BUGON(page == NULL); + DBG_BUGON(page->mapping == NULL); + + if (z_erofs_gather_if_stagingpage(page_pool, page)) + continue; + + if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) + pagenr = 0; + else + pagenr = z_erofs_onlinepage_index(page); + + BUG_ON(pagenr >= nr_pages); + +#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF + BUG_ON(pages[pagenr] != NULL); + ++sparsemem_pages; +#endif + pages[pagenr] = page; + } + + z_erofs_pagevec_ctor_exit(&ctor, true); + + overlapped = false; + compressed_pages = grp->compressed_pages; + + for (i = 0; i < clusterpages; ++i) { + unsigned pagenr; + + page = compressed_pages[i]; + + /* all compressed pages ought to be valid */ + DBG_BUGON(page == NULL); + DBG_BUGON(page->mapping == NULL); + + if (z_erofs_is_stagingpage(page)) + continue; +#ifdef EROFS_FS_HAS_MANAGED_CACHE + else if (page->mapping == mngda) { + BUG_ON(PageLocked(page)); + BUG_ON(!PageUptodate(page)); + continue; + } +#endif + + /* only non-head page could be reused as a compressed page */ + pagenr = z_erofs_onlinepage_index(page); + + BUG_ON(pagenr >= nr_pages); +#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF + BUG_ON(pages[pagenr] != NULL); + ++sparsemem_pages; +#endif + pages[pagenr] = page; + + overlapped = true; + } + + llen = (nr_pages << PAGE_SHIFT) - work->pageofs; + + if (z_erofs_vle_workgrp_fmt(grp) == Z_EROFS_VLE_WORKGRP_FMT_PLAIN) { + /* FIXME! this should be fixed in the future */ + BUG_ON(grp->llen != llen); + + err = z_erofs_vle_plain_copy(compressed_pages, clusterpages, + pages, nr_pages, work->pageofs); + goto out; + } + + if (llen > grp->llen) + llen = grp->llen; + + err = z_erofs_vle_unzip_fast_percpu(compressed_pages, + clusterpages, pages, llen, work->pageofs, + z_erofs_onlinepage_endio); + if (err != -ENOTSUPP) + goto out_percpu; + +#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF + if (sparsemem_pages >= nr_pages) { + BUG_ON(sparsemem_pages > nr_pages); + goto skip_allocpage; + } +#endif + + for (i = 0; i < nr_pages; ++i) { + if (pages[i] != NULL) + continue; + + pages[i] = __stagingpage_alloc(page_pool, GFP_NOFS); + } + +#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF +skip_allocpage: +#endif + vout = erofs_vmap(pages, nr_pages); + + err = z_erofs_vle_unzip_vmap(compressed_pages, + clusterpages, vout, llen, work->pageofs, overlapped); + + erofs_vunmap(vout, nr_pages); + +out: + for (i = 0; i < nr_pages; ++i) { + page = pages[i]; + DBG_BUGON(page->mapping == NULL); + + /* recycle all individual staging pages */ + if (z_erofs_gather_if_stagingpage(page_pool, page)) + continue; + + if (unlikely(err < 0)) + SetPageError(page); + + z_erofs_onlinepage_endio(page); + } + +out_percpu: + for (i = 0; i < clusterpages; ++i) { + page = compressed_pages[i]; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (page->mapping == mngda) + continue; +#endif + /* recycle all individual staging pages */ + (void)z_erofs_gather_if_stagingpage(page_pool, page); + + WRITE_ONCE(compressed_pages[i], NULL); + } + + if (pages == z_pagemap_global) + mutex_unlock(&z_pagemap_global_lock); + else if (unlikely(pages != pages_onstack)) + kvfree(pages); + + work->nr_pages = 0; + work->vcnt = 0; + + /* all work locks MUST be taken before the following line */ + + WRITE_ONCE(grp->next, Z_EROFS_VLE_WORKGRP_NIL); + + /* all work locks SHOULD be released right now */ + mutex_unlock(&work->lock); + + z_erofs_vle_work_release(work); + return err; +} + +static void z_erofs_vle_unzip_all(struct super_block *sb, + struct z_erofs_vle_unzip_io *io, + struct list_head *page_pool) +{ + z_erofs_vle_owned_workgrp_t owned = 
io->head; + + while (owned != Z_EROFS_VLE_WORKGRP_TAIL_CLOSED) { + struct z_erofs_vle_workgroup *grp; + + /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_TAIL); + + /* no possible that 'owned' equals NULL */ + DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_NIL); + + grp = owned; + owned = READ_ONCE(grp->next); + + z_erofs_vle_unzip(sb, grp, page_pool); + } +} + +static void z_erofs_vle_unzip_wq(struct work_struct *work) +{ + struct z_erofs_vle_unzip_io_sb *iosb = container_of(work, + struct z_erofs_vle_unzip_io_sb, io.u.work); + LIST_HEAD(page_pool); + + BUG_ON(iosb->io.head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); + z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool); + + put_pages_list(&page_pool); + kvfree(iosb); +} + +static inline struct z_erofs_vle_unzip_io * +prepare_io_handler(struct super_block *sb, + struct z_erofs_vle_unzip_io *io, + bool background) +{ + struct z_erofs_vle_unzip_io_sb *iosb; + + if (!background) { + /* waitqueue available for foreground io */ + BUG_ON(io == NULL); + + init_waitqueue_head(&io->u.wait); + atomic_set(&io->pending_bios, 0); + goto out; + } + + if (io != NULL) + BUG(); + else { + /* allocate extra io descriptor for background io */ + iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb), + GFP_KERNEL | __GFP_NOFAIL); + BUG_ON(iosb == NULL); + + io = &iosb->io; + } + + iosb->sb = sb; + INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq); +out: + io->head = Z_EROFS_VLE_WORKGRP_TAIL_CLOSED; + return io; +} + +#ifdef EROFS_FS_HAS_MANAGED_CACHE +/* true - unlocked (noio), false - locked (need submit io) */ +static inline bool recover_managed_page(struct z_erofs_vle_workgroup *grp, + struct page *page) +{ + wait_on_page_locked(page); + if (PagePrivate(page) && PageUptodate(page)) + return true; + + lock_page(page); + if (unlikely(!PagePrivate(page))) { + set_page_private(page, (unsigned long)grp); + SetPagePrivate(page); + } + if (unlikely(PageUptodate(page))) { + unlock_page(page); + return true; + } + return false; +} + +#define __FSIO_1 1 +#else +#define __FSIO_1 0 +#endif + +static bool z_erofs_vle_submit_all(struct super_block *sb, + z_erofs_vle_owned_workgrp_t owned_head, + struct list_head *pagepool, + struct z_erofs_vle_unzip_io *fg_io, + bool force_fg) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + const unsigned clusterpages = erofs_clusterpages(sbi); + const gfp_t gfp = GFP_NOFS; +#ifdef EROFS_FS_HAS_MANAGED_CACHE + struct address_space *const mngda = sbi->managed_cache->i_mapping; + struct z_erofs_vle_workgroup *lstgrp_noio = NULL, *lstgrp_io = NULL; +#endif + struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1]; + struct bio *bio; + tagptr1_t bi_private; + /* since bio will be NULL, no need to initialize last_index */ + pgoff_t uninitialized_var(last_index); + bool force_submit = false; + unsigned nr_bios; + + if (unlikely(owned_head == Z_EROFS_VLE_WORKGRP_TAIL)) + return false; + + /* + * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io + * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io + */ +#ifdef EROFS_FS_HAS_MANAGED_CACHE + ios[0] = prepare_io_handler(sb, fg_io + 0, false); +#endif + + if (force_fg) { + ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false); + bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0); + } else { + ios[__FSIO_1] = prepare_io_handler(sb, NULL, true); + bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 1); + } + + nr_bios = 0; + force_submit = false; + bio = NULL; + + /* by default, all need io submission */ + 
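+ /*
+ * With the managed cache enabled, the owned chain is split while it
+ * is walked below: workgroups whose compressed pages are all already
+ * uptodate in the cache (no I/O needed) are relinked onto ios[0],
+ * while the remaining ones stay chained on ios[1] for bio submission.
+ */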
ios[__FSIO_1]->head = owned_head; + + do { + struct z_erofs_vle_workgroup *grp; + struct page **compressed_pages, *oldpage, *page; + pgoff_t first_index; + unsigned i = 0; +#ifdef EROFS_FS_HAS_MANAGED_CACHE + unsigned int noio = 0; + bool cachemngd; +#endif + int err; + + /* no possible 'owned_head' equals the following */ + DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); + DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_NIL); + + grp = owned_head; + + /* close the main owned chain at first */ + owned_head = cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL, + Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); + + first_index = grp->obj.index; + compressed_pages = grp->compressed_pages; + + force_submit |= (first_index != last_index + 1); +repeat: + /* fulfill all compressed pages */ + oldpage = page = READ_ONCE(compressed_pages[i]); + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + cachemngd = false; + + if (page == EROFS_UNALLOCATED_CACHED_PAGE) { + cachemngd = true; + goto do_allocpage; + } else if (page != NULL) { + if (page->mapping != mngda) + BUG_ON(PageUptodate(page)); + else if (recover_managed_page(grp, page)) { + /* page is uptodate, skip io submission */ + force_submit = true; + ++noio; + goto skippage; + } + } else { +do_allocpage: +#else + if (page != NULL) + BUG_ON(PageUptodate(page)); + else { +#endif + page = __stagingpage_alloc(pagepool, gfp); + + if (oldpage != cmpxchg(compressed_pages + i, + oldpage, page)) { + list_add(&page->lru, pagepool); + goto repeat; +#ifdef EROFS_FS_HAS_MANAGED_CACHE + } else if (cachemngd && !add_to_page_cache_lru(page, + mngda, first_index + i, gfp)) { + set_page_private(page, (unsigned long)grp); + SetPagePrivate(page); +#endif + } + } + + if (bio != NULL && force_submit) { +submit_bio_retry: + __submit_bio(bio, REQ_OP_READ, 0); + bio = NULL; + } + + if (bio == NULL) { + bio = prepare_bio(sb, first_index + i, + BIO_MAX_PAGES, z_erofs_vle_read_endio); + bio->bi_private = tagptr_cast_ptr(bi_private); + + ++nr_bios; + } + + err = bio_add_page(bio, page, PAGE_SIZE, 0); + if (err < PAGE_SIZE) + goto submit_bio_retry; + + force_submit = false; + last_index = first_index + i; +#ifdef EROFS_FS_HAS_MANAGED_CACHE +skippage: +#endif + if (++i < clusterpages) + goto repeat; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (noio < clusterpages) { + lstgrp_io = grp; + } else { + z_erofs_vle_owned_workgrp_t iogrp_next = + owned_head == Z_EROFS_VLE_WORKGRP_TAIL ? 
+ Z_EROFS_VLE_WORKGRP_TAIL_CLOSED : + owned_head; + + if (lstgrp_io == NULL) + ios[1]->head = iogrp_next; + else + WRITE_ONCE(lstgrp_io->next, iogrp_next); + + if (lstgrp_noio == NULL) + ios[0]->head = grp; + else + WRITE_ONCE(lstgrp_noio->next, grp); + + lstgrp_noio = grp; + } +#endif + } while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL); + + if (bio != NULL) + __submit_bio(bio, REQ_OP_READ, 0); + +#ifndef EROFS_FS_HAS_MANAGED_CACHE + BUG_ON(!nr_bios); +#else + if (lstgrp_noio != NULL) + WRITE_ONCE(lstgrp_noio->next, Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); + + if (!force_fg && !nr_bios) { + kvfree(container_of(ios[1], + struct z_erofs_vle_unzip_io_sb, io)); + return true; + } +#endif + + z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios); + return true; +} + +static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f, + struct list_head *pagepool, + bool force_fg) +{ + struct super_block *sb = f->inode->i_sb; + struct z_erofs_vle_unzip_io io[1 + __FSIO_1]; + + if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg)) + return; + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + z_erofs_vle_unzip_all(sb, &io[0], pagepool); +#endif + if (!force_fg) + return; + + /* wait until all bios are completed */ + wait_event(io[__FSIO_1].u.wait, + !atomic_read(&io[__FSIO_1].pending_bios)); + + /* let's synchronous decompression */ + z_erofs_vle_unzip_all(sb, &io[__FSIO_1], pagepool); +} + +static int z_erofs_vle_normalaccess_readpage(struct file *file, + struct page *page) +{ + struct inode *const inode = page->mapping->host; + struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode); + int err; + LIST_HEAD(pagepool); + +#if (EROFS_FS_ZIP_CACHE_LVL >= 2) + f.cachedzone_la = page->index << PAGE_SHIFT; +#endif + err = z_erofs_do_read_page(&f, page, &pagepool); + (void)z_erofs_vle_work_iter_end(&f.builder); + + if (err) { + errln("%s, failed to read, err [%d]", __func__, err); + goto out; + } + + z_erofs_submit_and_unzip(&f, &pagepool, true); +out: + if (f.m_iter.mpage != NULL) + put_page(f.m_iter.mpage); + + /* clean up the remaining free pages */ + put_pages_list(&pagepool); + return 0; +} + +static inline int __z_erofs_vle_normalaccess_readpages( + struct file *filp, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, bool sync) +{ + struct inode *const inode = mapping->host; + + struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode); + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); + struct page *head = NULL; + LIST_HEAD(pagepool); + +#if (EROFS_FS_ZIP_CACHE_LVL >= 2) + f.cachedzone_la = lru_to_page(pages)->index << PAGE_SHIFT; +#endif + for (; nr_pages; --nr_pages) { + struct page *page = lru_to_page(pages); + + prefetchw(&page->flags); + list_del(&page->lru); + + if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { + list_add(&page->lru, &pagepool); + continue; + } + + BUG_ON(PagePrivate(page)); + set_page_private(page, (unsigned long)head); + head = page; + } + + while (head != NULL) { + struct page *page = head; + int err; + + /* traversal in reverse order */ + head = (void *)page_private(page); + + err = z_erofs_do_read_page(&f, page, &pagepool); + if (err) { + struct erofs_vnode *vi = EROFS_V(inode); + + errln("%s, readahead error at page %lu of nid %llu", + __func__, page->index, vi->nid); + } + + put_page(page); + } + + (void)z_erofs_vle_work_iter_end(&f.builder); + + z_erofs_submit_and_unzip(&f, &pagepool, sync); + + if (f.m_iter.mpage != NULL) + put_page(f.m_iter.mpage); + + /* clean up the remaining free pages */ + 
put_pages_list(&pagepool); + return 0; +} + +static int z_erofs_vle_normalaccess_readpages( + struct file *filp, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return __z_erofs_vle_normalaccess_readpages(filp, + mapping, pages, nr_pages, + nr_pages < 4 /* sync */); +} + +const struct address_space_operations z_erofs_vle_normalaccess_aops = { + .readpage = z_erofs_vle_normalaccess_readpage, + .readpages = z_erofs_vle_normalaccess_readpages, +}; + +#define __vle_cluster_advise(x, bit, bits) \ + ((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1)) + +#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \ + Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT, Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) + +enum { + Z_EROFS_VLE_CLUSTER_TYPE_PLAIN, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD, + Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD, + Z_EROFS_VLE_CLUSTER_TYPE_RESERVED, + Z_EROFS_VLE_CLUSTER_TYPE_MAX +}; + +#define vle_cluster_type(di) \ + __vle_cluster_type((di)->di_advise) + +static inline unsigned +vle_compressed_index_clusterofs(unsigned clustersize, + struct z_erofs_vle_decompressed_index *di) +{ + debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x", + __func__, di, di->di_advise, vle_cluster_type(di), + di->di_clusterofs, di->di_u.blkaddr); + + switch (vle_cluster_type(di)) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + break; + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + return di->di_clusterofs; + default: + BUG_ON(1); + } + return clustersize; +} + +static inline erofs_blk_t +vle_extent_blkaddr(struct inode *inode, pgoff_t index) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + struct erofs_vnode *vi = EROFS_V(inode); + + unsigned ofs = Z_EROFS_VLE_EXTENT_ALIGN(vi->inode_isize + + vi->xattr_isize) + sizeof(struct erofs_extent_header) + + index * sizeof(struct z_erofs_vle_decompressed_index); + + return erofs_blknr(iloc(sbi, vi->nid) + ofs); +} + +static inline unsigned int +vle_extent_blkoff(struct inode *inode, pgoff_t index) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + struct erofs_vnode *vi = EROFS_V(inode); + + unsigned ofs = Z_EROFS_VLE_EXTENT_ALIGN(vi->inode_isize + + vi->xattr_isize) + sizeof(struct erofs_extent_header) + + index * sizeof(struct z_erofs_vle_decompressed_index); + + return erofs_blkoff(iloc(sbi, vi->nid) + ofs); +} + +/* + * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode + * --- + * VLE compression mode attempts to compress a number of logical data into + * a physical cluster with a fixed size. + * VLE compression mode uses "struct z_erofs_vle_decompressed_index". 
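+ *
+ * For example (a sketch, assuming 4KiB logical clusters, i.e.
+ * clusterbits = 12): a read at file offset 70000 falls into logical
+ * cluster lcn = 70000 >> 12 = 17, and the corresponding on-disk index
+ * lives at
+ *   Z_EROFS_VLE_EXTENT_ALIGN(inode_isize + xattr_isize) +
+ *   sizeof(struct erofs_extent_header) + 17 * sizeof(*di)
+ * bytes into the inode metadata; vle_extent_blkaddr()/vle_extent_blkoff()
+ * above split that offset into a block number and an in-block offset.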
+ */
+static erofs_off_t vle_get_logical_extent_head(
+ struct inode *inode,
+ struct page **page_iter,
+ void **kaddr_iter,
+ unsigned lcn, /* logical cluster number */
+ erofs_blk_t *pcn,
+ unsigned *flags)
+{
+ /* for extent meta */
+ struct page *page = *page_iter;
+ erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+ struct z_erofs_vle_decompressed_index *di;
+ unsigned long long ofs;
+ const unsigned int clusterbits = EROFS_SB(inode->i_sb)->clusterbits;
+ const unsigned int clustersize = 1 << clusterbits;
+
+ if (page->index != blkaddr) {
+ kunmap_atomic(*kaddr_iter);
+ unlock_page(page);
+ put_page(page);
+
+ *page_iter = page = erofs_get_meta_page(inode->i_sb,
+ blkaddr, false);
+ *kaddr_iter = kmap_atomic(page);
+ }
+
+ di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+ switch (vle_cluster_type(di)) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+ BUG_ON(!di->di_u.delta[0]);
+ BUG_ON(lcn < di->di_u.delta[0]);
+
+ ofs = vle_get_logical_extent_head(inode,
+ page_iter, kaddr_iter,
+ lcn - di->di_u.delta[0], pcn, flags);
+ break;
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+ *flags ^= EROFS_MAP_ZIPPED;
+ /* fallthrough */
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ /* clustersize should be a power of two */
+ ofs = ((unsigned long long)lcn << clusterbits) +
+ (le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+ *pcn = le32_to_cpu(di->di_u.blkaddr);
+ break;
+ default:
+ BUG_ON(1);
+ }
+ return ofs;
+}
+
+int z_erofs_map_blocks_iter(struct inode *inode,
+ struct erofs_map_blocks *map,
+ struct page **mpage_ret, int flags)
+{
+ /* logical extent (start, end) offset */
+ unsigned long long ofs, end;
+ struct z_erofs_vle_decompressed_index *di;
+ erofs_blk_t e_blkaddr, pcn;
+ unsigned lcn, logical_cluster_ofs, cluster_type;
+ u32 ofs_rem;
+ struct page *mpage = *mpage_ret;
+ void *kaddr;
+ bool initial;
+ const unsigned int clusterbits = EROFS_SB(inode->i_sb)->clusterbits;
+ const unsigned int clustersize = 1 << clusterbits;
+ int err = 0;
+
+ /* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc...
*/ + initial = !map->m_llen; + + /* when trying to read beyond EOF, leave it unmapped */ + if (unlikely(map->m_la >= inode->i_size)) { + BUG_ON(!initial); + map->m_llen = map->m_la + 1 - inode->i_size; + map->m_la = inode->i_size - 1; + map->m_flags = 0; + goto out; + } + + debugln("%s, m_la %llu m_llen %llu --- start", __func__, + map->m_la, map->m_llen); + + ofs = map->m_la + map->m_llen; + + /* clustersize should be power of two */ + lcn = ofs >> clusterbits; + ofs_rem = ofs & (clustersize - 1); + + e_blkaddr = vle_extent_blkaddr(inode, lcn); + + if (mpage == NULL || mpage->index != e_blkaddr) { + if (mpage != NULL) + put_page(mpage); + + mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false); + *mpage_ret = mpage; + } else { + lock_page(mpage); + DBG_BUGON(!PageUptodate(mpage)); + } + + kaddr = kmap_atomic(mpage); + di = kaddr + vle_extent_blkoff(inode, lcn); + + debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn, + e_blkaddr, vle_extent_blkoff(inode, lcn)); + + logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di); + if (!initial) { + /* [walking mode] 'map' has been already initialized */ + map->m_llen += logical_cluster_ofs; + goto unmap_out; + } + + /* by default, compressed */ + map->m_flags |= EROFS_MAP_ZIPPED; + + end = (u64)(lcn + 1) * clustersize; + + cluster_type = vle_cluster_type(di); + + switch (cluster_type) { + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + if (ofs_rem >= logical_cluster_ofs) + map->m_flags ^= EROFS_MAP_ZIPPED; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + if (ofs_rem == logical_cluster_ofs) { + pcn = le32_to_cpu(di->di_u.blkaddr); + goto exact_hitted; + } + + if (ofs_rem > logical_cluster_ofs) { + ofs = lcn * clustersize | logical_cluster_ofs; + pcn = le32_to_cpu(di->di_u.blkaddr); + break; + } + + /* logical cluster number should be >= 1 */ + if (unlikely(!lcn)) { + errln("invalid logical cluster 0 at nid %llu", + EROFS_V(inode)->nid); + err = -EIO; + goto unmap_out; + } + end = (lcn-- * clustersize) | logical_cluster_ofs; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + /* get the correspoinding first chunk */ + ofs = vle_get_logical_extent_head(inode, mpage_ret, + &kaddr, lcn, &pcn, &map->m_flags); + mpage = *mpage_ret; + break; + default: + errln("unknown cluster type %u at offset %llu of nid %llu", + cluster_type, ofs, EROFS_V(inode)->nid); + err = -EIO; + goto unmap_out; + } + + map->m_la = ofs; +exact_hitted: + map->m_llen = end - ofs; + map->m_plen = clustersize; + map->m_pa = blknr_to_addr(pcn); + map->m_flags |= EROFS_MAP_MAPPED; +unmap_out: + kunmap_atomic(kaddr); + unlock_page(mpage); +out: + debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", + __func__, map->m_la, map->m_pa, + map->m_llen, map->m_plen, map->m_flags); + + /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */ + DBG_BUGON(err < 0); + return err; +} + diff --git a/drivers/staging/erofs/unzip_vle.h b/drivers/staging/erofs/unzip_vle.h new file mode 100644 index 000000000000..393998500865 --- /dev/null +++ b/drivers/staging/erofs/unzip_vle.h @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * linux/drivers/staging/erofs/unzip_vle.h + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. 
+ */ +#ifndef __EROFS_FS_UNZIP_VLE_H +#define __EROFS_FS_UNZIP_VLE_H + +#include "internal.h" +#include "unzip_pagevec.h" + +/* + * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) - + * used for temporary allocated pages (via erofs_allocpage), + * in order to seperate those from NULL mapping (eg. truncated pages) + */ +#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D) + +#define z_erofs_is_stagingpage(page) \ + ((page)->mapping == Z_EROFS_MAPPING_STAGING) + +static inline bool z_erofs_gather_if_stagingpage(struct list_head *page_pool, + struct page *page) +{ + if (z_erofs_is_stagingpage(page)) { + list_add(&page->lru, page_pool); + return true; + } + return false; +} + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else. + * + */ + +#define Z_EROFS_VLE_INLINE_PAGEVECS 3 + +struct z_erofs_vle_work { + /* struct z_erofs_vle_work *left, *right; */ + +#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF + struct list_head list; + + atomic_t refcount; +#endif + struct mutex lock; + + /* I: decompression offset in page */ + unsigned short pageofs; + unsigned short nr_pages; + + /* L: queued pages in pagevec[] */ + unsigned vcnt; + + union { + /* L: pagevec */ + erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS]; + struct rcu_head rcu; + }; +}; + +#define Z_EROFS_VLE_WORKGRP_FMT_PLAIN 0 +#define Z_EROFS_VLE_WORKGRP_FMT_LZ4 1 +#define Z_EROFS_VLE_WORKGRP_FMT_MASK 1 + +typedef struct z_erofs_vle_workgroup *z_erofs_vle_owned_workgrp_t; + +struct z_erofs_vle_workgroup { + struct erofs_workgroup obj; + struct z_erofs_vle_work work; + + /* next owned workgroup */ + z_erofs_vle_owned_workgrp_t next; + + /* compressed pages (including multi-usage pages) */ + struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES]; + unsigned int llen, flags; +}; + +/* let's avoid the valid 32-bit kernel addresses */ + +/* the chained workgroup has't submitted io (still open) */ +#define Z_EROFS_VLE_WORKGRP_TAIL ((void *)0x5F0ECAFE) +/* the chained workgroup has already submitted io */ +#define Z_EROFS_VLE_WORKGRP_TAIL_CLOSED ((void *)0x5F0EDEAD) + +#define Z_EROFS_VLE_WORKGRP_NIL (NULL) + +#define z_erofs_vle_workgrp_fmt(grp) \ + ((grp)->flags & Z_EROFS_VLE_WORKGRP_FMT_MASK) + +static inline void z_erofs_vle_set_workgrp_fmt( + struct z_erofs_vle_workgroup *grp, + unsigned int fmt) +{ + grp->flags = fmt | (grp->flags & ~Z_EROFS_VLE_WORKGRP_FMT_MASK); +} + +#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF +#error multiref decompression is unimplemented yet +#else + +#define z_erofs_vle_grab_primary_work(grp) (&(grp)->work) +#define z_erofs_vle_grab_work(grp, pageofs) (&(grp)->work) +#define z_erofs_vle_work_workgroup(wrk, primary) \ + ((primary) ? container_of(wrk, \ + struct z_erofs_vle_workgroup, work) : \ + ({ BUG(); (void *)NULL; })) + +#endif + +#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_vle_workgroup) + +struct z_erofs_vle_unzip_io { + atomic_t pending_bios; + z_erofs_vle_owned_workgrp_t head; + + union { + wait_queue_head_t wait; + struct work_struct work; + } u; +}; + +struct z_erofs_vle_unzip_io_sb { + struct z_erofs_vle_unzip_io io; + struct super_block *sb; +}; + +#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 +#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) +#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) + +/* + * waiters (aka. 
ongoing_packs): # to unlock the page + * sub-index: 0 - for partial page, >= 1 full page sub-index + */ +typedef atomic_t z_erofs_onlinepage_t; + +/* type punning */ +union z_erofs_onlinepage_converter { + z_erofs_onlinepage_t *o; + unsigned long *v; +}; + +static inline unsigned z_erofs_onlinepage_index(struct page *page) +{ + union z_erofs_onlinepage_converter u; + + BUG_ON(!PagePrivate(page)); + u.v = &page_private(page); + + return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; +} + +static inline void z_erofs_onlinepage_init(struct page *page) +{ + union { + z_erofs_onlinepage_t o; + unsigned long v; + /* keep from being unlocked in advance */ + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +static inline void z_erofs_onlinepage_fixup(struct page *page, + uintptr_t index, bool down) +{ + unsigned long *p, o, v, id; +repeat: + p = &page_private(page); + o = READ_ONCE(*p); + + id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; + if (id) { + if (!index) + return; + + BUG_ON(id != index); + } + + v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | + ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down); + if (cmpxchg(p, o, v) != o) + goto repeat; +} + +static inline void z_erofs_onlinepage_endio(struct page *page) +{ + union z_erofs_onlinepage_converter u; + unsigned v; + + BUG_ON(!PagePrivate(page)); + u.v = &page_private(page); + + v = atomic_dec_return(u.o); + if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + ClearPagePrivate(page); + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + + debugln("%s, page %p value %x", __func__, page, atomic_read(u.o)); +} + +#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES \ + min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) +#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES 2048 + +/* unzip_vle_lz4.c */ +extern int z_erofs_vle_plain_copy(struct page **compressed_pages, + unsigned clusterpages, struct page **pages, + unsigned nr_pages, unsigned short pageofs); + +extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages, + unsigned clusterpages, struct page **pages, + unsigned outlen, unsigned short pageofs, + void (*endio)(struct page *)); + +extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages, + unsigned clusterpages, void *vaddr, unsigned llen, + unsigned short pageofs, bool overlapped); + +#endif + diff --git a/drivers/staging/erofs/unzip_vle_lz4.c b/drivers/staging/erofs/unzip_vle_lz4.c new file mode 100644 index 000000000000..f5b665f15be5 --- /dev/null +++ b/drivers/staging/erofs/unzip_vle_lz4.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/unzip_vle_lz4.c + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. 
+ */ +#include "unzip_vle.h" + +#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS +#define EROFS_PERCPU_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES +#else +#define EROFS_PERCPU_NR_PAGES Z_EROFS_VLE_INLINE_PAGEVECS +#endif + +static struct { + char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES]; +} erofs_pcpubuf[NR_CPUS]; + +int z_erofs_vle_plain_copy(struct page **compressed_pages, + unsigned clusterpages, + struct page **pages, + unsigned nr_pages, + unsigned short pageofs) +{ + unsigned i, j; + void *src = NULL; + const unsigned righthalf = PAGE_SIZE - pageofs; + char *percpu_data; + bool mirrored[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 }; + + preempt_disable(); + percpu_data = erofs_pcpubuf[smp_processor_id()].data; + + j = 0; + for (i = 0; i < nr_pages; j = i++) { + struct page *page = pages[i]; + void *dst; + + if (page == NULL) { + if (src != NULL) { + if (!mirrored[j]) + kunmap_atomic(src); + src = NULL; + } + continue; + } + + dst = kmap_atomic(page); + + for (; j < clusterpages; ++j) { + if (compressed_pages[j] != page) + continue; + + BUG_ON(mirrored[j]); + memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE); + mirrored[j] = true; + break; + } + + if (i) { + if (src == NULL) + src = mirrored[i-1] ? + percpu_data + (i-1) * PAGE_SIZE : + kmap_atomic(compressed_pages[i-1]); + + memcpy(dst, src + righthalf, pageofs); + + if (!mirrored[i-1]) + kunmap_atomic(src); + + if (unlikely(i >= clusterpages)) { + kunmap_atomic(dst); + break; + } + } + + if (!righthalf) + src = NULL; + else { + src = mirrored[i] ? percpu_data + i * PAGE_SIZE : + kmap_atomic(compressed_pages[i]); + + memcpy(dst + pageofs, src, righthalf); + } + + kunmap_atomic(dst); + } + + if (src != NULL && !mirrored[j]) + kunmap_atomic(src); + + preempt_enable(); + return 0; +} + +extern int z_erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen); + +int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages, + unsigned clusterpages, + struct page **pages, + unsigned outlen, + unsigned short pageofs, + void (*endio)(struct page *)) +{ + void *vin, *vout; + unsigned nr_pages, i, j; + int ret; + + if (outlen + pageofs > EROFS_PERCPU_NR_PAGES * PAGE_SIZE) + return -ENOTSUPP; + + nr_pages = DIV_ROUND_UP(outlen + pageofs, PAGE_SIZE); + + if (clusterpages == 1) + vin = kmap_atomic(compressed_pages[0]); + else + vin = erofs_vmap(compressed_pages, clusterpages); + + preempt_disable(); + vout = erofs_pcpubuf[smp_processor_id()].data; + + ret = z_erofs_unzip_lz4(vin, vout + pageofs, + clusterpages * PAGE_SIZE, outlen); + + if (ret >= 0) { + outlen = ret; + ret = 0; + } + + for (i = 0; i < nr_pages; ++i) { + j = min((unsigned)PAGE_SIZE - pageofs, outlen); + + if (pages[i] != NULL) { + if (ret < 0) + SetPageError(pages[i]); + else if (clusterpages == 1 && pages[i] == compressed_pages[0]) + memcpy(vin + pageofs, vout + pageofs, j); + else { + void *dst = kmap_atomic(pages[i]); + + memcpy(dst + pageofs, vout + pageofs, j); + kunmap_atomic(dst); + } + endio(pages[i]); + } + vout += PAGE_SIZE; + outlen -= j; + pageofs = 0; + } + preempt_enable(); + + if (clusterpages == 1) + kunmap_atomic(vin); + else + erofs_vunmap(vin, clusterpages); + + return ret; +} + +int z_erofs_vle_unzip_vmap(struct page **compressed_pages, + unsigned clusterpages, + void *vout, + unsigned llen, + unsigned short pageofs, + bool overlapped) +{ + void *vin; + unsigned i; + int ret; + + if (overlapped) { + preempt_disable(); + vin = erofs_pcpubuf[smp_processor_id()].data; + + for (i = 0; i < clusterpages; ++i) { + void *t = kmap_atomic(compressed_pages[i]); + + 
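+ /*
+ * The compressed input is staged into the per-CPU buffer here
+ * because, in the overlapped case, some compressed pages double as
+ * decompressed output pages; decompressing straight from them could
+ * overwrite input bytes that have not been consumed yet.
+ */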
memcpy(vin + PAGE_SIZE *i, t, PAGE_SIZE); + kunmap_atomic(t); + } + } else if (clusterpages == 1) + vin = kmap_atomic(compressed_pages[0]); + else { + vin = erofs_vmap(compressed_pages, clusterpages); + } + + ret = z_erofs_unzip_lz4(vin, vout + pageofs, + clusterpages * PAGE_SIZE, llen); + if (ret > 0) + ret = 0; + + if (!overlapped) { + if (clusterpages == 1) + kunmap_atomic(vin); + else { + erofs_vunmap(vin, clusterpages); + } + } else + preempt_enable(); + + return ret; +} + diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c new file mode 100644 index 000000000000..595cf90af9bb --- /dev/null +++ b/drivers/staging/erofs/utils.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/utils.c + * + * Copyright (C) 2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ + +#include "internal.h" +#include <linux/pagevec.h> + +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) +{ + struct page *page; + + if (!list_empty(pool)) { + page = lru_to_page(pool); + list_del(&page->lru); + } else { + page = alloc_pages(gfp | __GFP_NOFAIL, 0); + + BUG_ON(page == NULL); + BUG_ON(page->mapping != NULL); + } + return page; +} + +/* global shrink count (for all mounted EROFS instances) */ +static atomic_long_t erofs_global_shrink_cnt; + +#ifdef CONFIG_EROFS_FS_ZIP + +/* radix_tree and the future XArray both don't use tagptr_t yet */ +struct erofs_workgroup *erofs_find_workgroup( + struct super_block *sb, pgoff_t index, bool *tag) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_workgroup *grp; + int oldcount; + +repeat: + rcu_read_lock(); + grp = radix_tree_lookup(&sbi->workstn_tree, index); + if (grp != NULL) { + *tag = radix_tree_exceptional_entry(grp); + grp = (void *)((unsigned long)grp & + ~RADIX_TREE_EXCEPTIONAL_ENTRY); + + if (erofs_workgroup_get(grp, &oldcount)) { + /* prefer to relax rcu read side */ + rcu_read_unlock(); + goto repeat; + } + + /* decrease refcount added by erofs_workgroup_put */ + if (unlikely(oldcount == 1)) + atomic_long_dec(&erofs_global_shrink_cnt); + BUG_ON(index != grp->index); + } + rcu_read_unlock(); + return grp; +} + +int erofs_register_workgroup(struct super_block *sb, + struct erofs_workgroup *grp, + bool tag) +{ + struct erofs_sb_info *sbi; + int err; + + /* grp->refcount should not < 1 */ + BUG_ON(!atomic_read(&grp->refcount)); + + err = radix_tree_preload(GFP_NOFS); + if (err) + return err; + + sbi = EROFS_SB(sb); + erofs_workstn_lock(sbi); + + if (tag) + grp = (void *)((unsigned long)grp | + 1UL << RADIX_TREE_EXCEPTIONAL_SHIFT); + + err = radix_tree_insert(&sbi->workstn_tree, + grp->index, grp); + + if (!err) { + __erofs_workgroup_get(grp); + } + + erofs_workstn_unlock(sbi); + radix_tree_preload_end(); + return err; +} + +extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); + +int erofs_workgroup_put(struct erofs_workgroup *grp) +{ + int count = atomic_dec_return(&grp->refcount); + + if (count == 1) + atomic_long_inc(&erofs_global_shrink_cnt); + else if (!count) { + atomic_long_dec(&erofs_global_shrink_cnt); + erofs_workgroup_free_rcu(grp); + } + return count; +} + +unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, + unsigned long nr_shrink, + bool cleanup) +{ + pgoff_t first_index = 0; + void *batch[PAGEVEC_SIZE]; + unsigned 
freed = 0; + + int i, found; +repeat: + erofs_workstn_lock(sbi); + + found = radix_tree_gang_lookup(&sbi->workstn_tree, + batch, first_index, PAGEVEC_SIZE); + + for (i = 0; i < found; ++i) { + int cnt; + struct erofs_workgroup *grp = (void *) + ((unsigned long)batch[i] & + ~RADIX_TREE_EXCEPTIONAL_ENTRY); + + first_index = grp->index + 1; + + cnt = atomic_read(&grp->refcount); + BUG_ON(cnt <= 0); + + if (cleanup) + BUG_ON(cnt != 1); + +#ifndef EROFS_FS_HAS_MANAGED_CACHE + else if (cnt > 1) +#else + if (!erofs_workgroup_try_to_freeze(grp, 1)) +#endif + continue; + + if (radix_tree_delete(&sbi->workstn_tree, + grp->index) != grp) { +#ifdef EROFS_FS_HAS_MANAGED_CACHE +skip: + erofs_workgroup_unfreeze(grp, 1); +#endif + continue; + } + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (erofs_try_to_free_all_cached_pages(sbi, grp)) + goto skip; + + erofs_workgroup_unfreeze(grp, 1); +#endif + /* (rarely) grabbed again when freeing */ + erofs_workgroup_put(grp); + + ++freed; + if (unlikely(!--nr_shrink)) + break; + } + erofs_workstn_unlock(sbi); + + if (i && nr_shrink) + goto repeat; + return freed; +} + +#endif + +/* protected by 'erofs_sb_list_lock' */ +static unsigned int shrinker_run_no; + +/* protects the mounted 'erofs_sb_list' */ +static DEFINE_SPINLOCK(erofs_sb_list_lock); +static LIST_HEAD(erofs_sb_list); + +void erofs_register_super(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_init(&sbi->umount_mutex); + + spin_lock(&erofs_sb_list_lock); + list_add(&sbi->list, &erofs_sb_list); + spin_unlock(&erofs_sb_list_lock); +} + +void erofs_unregister_super(struct super_block *sb) +{ + spin_lock(&erofs_sb_list_lock); + list_del(&EROFS_SB(sb)->list); + spin_unlock(&erofs_sb_list_lock); +} + +unsigned long erofs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return atomic_long_read(&erofs_global_shrink_cnt); +} + +unsigned long erofs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct erofs_sb_info *sbi; + struct list_head *p; + + unsigned long nr = sc->nr_to_scan; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&erofs_sb_list_lock); + do + run_no = ++shrinker_run_no; + while (run_no == 0); + + /* Iterate over all mounted superblocks and try to shrink them */ + p = erofs_sb_list.next; + while (p != &erofs_sb_list) { + sbi = list_entry(p, struct erofs_sb_info, list); + + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (sbi->shrinker_run_no == run_no) + break; + + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + + spin_unlock(&erofs_sb_list_lock); + sbi->shrinker_run_no = run_no; + +#ifdef CONFIG_EROFS_FS_ZIP + freed += erofs_shrink_workstation(sbi, nr, false); +#endif + + spin_lock(&erofs_sb_list_lock); + /* Get the next list element before we move this one */ + p = p->next; + + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_move_tail(&sbi->list, &erofs_sb_list); + mutex_unlock(&sbi->umount_mutex); + + if (freed >= nr) + break; + } + spin_unlock(&erofs_sb_list_lock); + return freed; +} + diff --git a/drivers/staging/erofs/xattr.c b/drivers/staging/erofs/xattr.c new file mode 100644 index 000000000000..0e9cfeccdf99 --- /dev/null +++ b/drivers/staging/erofs/xattr.c @@ -0,0 +1,577 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/drivers/staging/erofs/xattr.c + * + * Copyright (C) 2017-2018 HUAWEI, Inc. 
+ * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#include <linux/security.h> +#include "xattr.h" + +struct xattr_iter { + struct super_block *sb; + struct page *page; + void *kaddr; + + erofs_blk_t blkaddr; + unsigned ofs; +}; + +static inline void xattr_iter_end(struct xattr_iter *it, bool atomic) +{ + /* only init_inode_xattrs use non-atomic once */ + if (unlikely(!atomic)) + kunmap(it->page); + else + kunmap_atomic(it->kaddr); + unlock_page(it->page); + put_page(it->page); +} + +static void init_inode_xattrs(struct inode *inode) +{ + struct xattr_iter it; + unsigned i; + struct erofs_xattr_ibody_header *ih; + struct erofs_sb_info *sbi; + struct erofs_vnode *vi; + bool atomic_map; + + if (likely(inode_has_inited_xattr(inode))) + return; + + vi = EROFS_V(inode); + BUG_ON(!vi->xattr_isize); + + sbi = EROFS_I_SB(inode); + it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); + it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); + + it.page = erofs_get_inline_page(inode, it.blkaddr); + BUG_ON(IS_ERR(it.page)); + + /* read in shared xattr array (non-atomic, see kmalloc below) */ + it.kaddr = kmap(it.page); + atomic_map = false; + + ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); + + vi->xattr_shared_count = ih->h_shared_count; + vi->xattr_shared_xattrs = (unsigned *)kmalloc_array( + vi->xattr_shared_count, sizeof(unsigned), + GFP_KERNEL | __GFP_NOFAIL); + + /* let's skip ibody header */ + it.ofs += sizeof(struct erofs_xattr_ibody_header); + + for (i = 0; i < vi->xattr_shared_count; ++i) { + if (unlikely(it.ofs >= EROFS_BLKSIZ)) { + /* cannot be unaligned */ + BUG_ON(it.ofs != EROFS_BLKSIZ); + xattr_iter_end(&it, atomic_map); + + it.page = erofs_get_meta_page(inode->i_sb, + ++it.blkaddr, S_ISDIR(inode->i_mode)); + BUG_ON(IS_ERR(it.page)); + + it.kaddr = kmap_atomic(it.page); + atomic_map = true; + it.ofs = 0; + } + vi->xattr_shared_xattrs[i] = + le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); + it.ofs += sizeof(__le32); + } + xattr_iter_end(&it, atomic_map); + + inode_set_inited_xattr(inode); +} + +struct xattr_iter_handlers { + int (*entry)(struct xattr_iter *, struct erofs_xattr_entry *); + int (*name)(struct xattr_iter *, unsigned, char *, unsigned); + int (*alloc_buffer)(struct xattr_iter *, unsigned); + void (*value)(struct xattr_iter *, unsigned, char *, unsigned); +}; + +static void xattr_iter_fixup(struct xattr_iter *it) +{ + if (unlikely(it->ofs >= EROFS_BLKSIZ)) { + xattr_iter_end(it, true); + + it->blkaddr += erofs_blknr(it->ofs); + it->page = erofs_get_meta_page(it->sb, it->blkaddr, false); + BUG_ON(IS_ERR(it->page)); + + it->kaddr = kmap_atomic(it->page); + it->ofs = erofs_blkoff(it->ofs); + } +} + +static int inline_xattr_iter_begin(struct xattr_iter *it, + struct inode *inode) +{ + struct erofs_vnode *const vi = EROFS_V(inode); + struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); + unsigned xattr_header_sz, inline_xattr_ofs; + + xattr_header_sz = inlinexattr_header_size(inode); + if (unlikely(xattr_header_sz >= vi->xattr_isize)) { + BUG_ON(xattr_header_sz > vi->xattr_isize); + return -ENOATTR; + } + + inline_xattr_ofs = vi->inode_isize + xattr_header_sz; + + it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); + it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); + + it->page = 
erofs_get_inline_page(inode, it->blkaddr); + BUG_ON(IS_ERR(it->page)); + it->kaddr = kmap_atomic(it->page); + + return vi->xattr_isize - xattr_header_sz; +} + +static int xattr_foreach(struct xattr_iter *it, + struct xattr_iter_handlers *op, unsigned *tlimit) +{ + struct erofs_xattr_entry entry; + unsigned value_sz, processed, slice; + int err; + + /* 0. fixup blkaddr, ofs, ipage */ + xattr_iter_fixup(it); + + /* + * 1. read xattr entry to the memory, + * since we do EROFS_XATTR_ALIGN + * therefore entry should be in the page + */ + entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); + if (tlimit != NULL) { + unsigned entry_sz = EROFS_XATTR_ENTRY_SIZE(&entry); + + BUG_ON(*tlimit < entry_sz); + *tlimit -= entry_sz; + } + + it->ofs += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); + + /* handle entry */ + err = op->entry(it, &entry); + if (err) { + it->ofs += entry.e_name_len + value_sz; + goto out; + } + + /* 2. handle xattr name (ofs will finally be at the end of name) */ + processed = 0; + + while (processed < entry.e_name_len) { + if (it->ofs >= EROFS_BLKSIZ) { + BUG_ON(it->ofs > EROFS_BLKSIZ); + + xattr_iter_fixup(it); + it->ofs = 0; + } + + slice = min_t(unsigned, PAGE_SIZE - it->ofs, + entry.e_name_len - processed); + + /* handle name */ + err = op->name(it, processed, it->kaddr + it->ofs, slice); + if (err) { + it->ofs += entry.e_name_len - processed + value_sz; + goto out; + } + + it->ofs += slice; + processed += slice; + } + + /* 3. handle xattr value */ + processed = 0; + + if (op->alloc_buffer != NULL) { + err = op->alloc_buffer(it, value_sz); + if (err) { + it->ofs += value_sz; + goto out; + } + } + + while (processed < value_sz) { + if (it->ofs >= EROFS_BLKSIZ) { + BUG_ON(it->ofs > EROFS_BLKSIZ); + xattr_iter_fixup(it); + it->ofs = 0; + } + + slice = min_t(unsigned, PAGE_SIZE - it->ofs, + value_sz - processed); + op->value(it, processed, it->kaddr + it->ofs, slice); + it->ofs += slice; + processed += slice; + } + +out: + /* we assume that ofs is aligned with 4 bytes */ + it->ofs = EROFS_XATTR_ALIGN(it->ofs); + return err; +} + +struct getxattr_iter { + struct xattr_iter it; + + char *buffer; + int buffer_size, index; + struct qstr name; +}; + +static int xattr_entrymatch(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return (it->index != entry->e_name_index || + it->name.len != entry->e_name_len) ? -ENOATTR : 0; +} + +static int xattr_namematch(struct xattr_iter *_it, + unsigned processed, char *buf, unsigned len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0; +} + +static int xattr_checkbuffer(struct xattr_iter *_it, + unsigned value_sz) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + int err = it->buffer_size < value_sz ? -ERANGE : 0; + + it->buffer_size = value_sz; + return it->buffer == NULL ? 
1 : err; +} + +static void xattr_copyvalue(struct xattr_iter *_it, + unsigned processed, char *buf, unsigned len) +{ + struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); + + memcpy(it->buffer + processed, buf, len); +} + +static struct xattr_iter_handlers find_xattr_handlers = { + .entry = xattr_entrymatch, + .name = xattr_namematch, + .alloc_buffer = xattr_checkbuffer, + .value = xattr_copyvalue +}; + +static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) +{ + int ret; + unsigned remaining; + + ret = inline_xattr_iter_begin(&it->it, inode); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); + if (ret >= 0) + break; + } + xattr_iter_end(&it->it, true); + + return ret < 0 ? ret : it->buffer_size; +} + +static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) +{ + struct erofs_vnode *const vi = EROFS_V(inode); + struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); + unsigned i; + int ret = -ENOATTR; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); + + if (!i || blkaddr != it->it.blkaddr) { + if (i) + xattr_iter_end(&it->it, true); + + it->it.page = erofs_get_meta_page(inode->i_sb, + blkaddr, false); + BUG_ON(IS_ERR(it->it.page)); + it->it.kaddr = kmap_atomic(it->it.page); + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); + if (ret >= 0) + break; + } + if (vi->xattr_shared_count) + xattr_iter_end(&it->it, true); + + return ret < 0 ? ret : it->buffer_size; +} + +static bool erofs_xattr_user_list(struct dentry *dentry) +{ + return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER); +} + +static bool erofs_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +int erofs_getxattr(struct inode *inode, int index, + const char *name, + void *buffer, size_t buffer_size) +{ + int ret; + struct getxattr_iter it; + + if (unlikely(name == NULL)) + return -EINVAL; + + init_inode_xattrs(inode); + + it.index = index; + + it.name.len = strlen(name); + if (it.name.len > EROFS_NAME_LEN) + return -ERANGE; + it.name.name = name; + + it.buffer = buffer; + it.buffer_size = buffer_size; + + it.it.sb = inode->i_sb; + ret = inline_getxattr(inode, &it); + if (ret == -ENOATTR) + ret = shared_getxattr(inode, &it); + return ret; +} + +static int erofs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct erofs_vnode *const vi = EROFS_V(inode); + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + + switch (handler->flags) { + case EROFS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case EROFS_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + break; + case EROFS_XATTR_INDEX_SECURITY: + break; + default: + return -EINVAL; + } + + if (!vi->xattr_isize) + return -ENOATTR; + + return erofs_getxattr(inode, handler->flags, name, buffer, size); +} + +const struct xattr_handler erofs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = EROFS_XATTR_INDEX_USER, + .list = erofs_xattr_user_list, + .get = erofs_xattr_generic_get, +}; + +const struct xattr_handler erofs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = EROFS_XATTR_INDEX_TRUSTED, + .list = erofs_xattr_trusted_list, 
+ .get = erofs_xattr_generic_get, +}; + +#ifdef CONFIG_EROFS_FS_SECURITY +const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = EROFS_XATTR_INDEX_SECURITY, + .get = erofs_xattr_generic_get, +}; +#endif + +const struct xattr_handler *erofs_xattr_handlers[] = { + &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + &erofs_xattr_security_handler, +#endif + NULL, +}; + +struct listxattr_iter { + struct xattr_iter it; + + struct dentry *dentry; + char *buffer; + int buffer_size, buffer_ofs; +}; + +static int xattr_entrylist(struct xattr_iter *_it, + struct erofs_xattr_entry *entry) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + unsigned prefix_len; + const char *prefix; + + const struct xattr_handler *h = + erofs_xattr_handler(entry->e_name_index); + + if (h == NULL || (h->list != NULL && !h->list(it->dentry))) + return 1; + + /* Note that at least one of 'prefix' and 'name' should be non-NULL */ + prefix = h->prefix != NULL ? h->prefix : h->name; + prefix_len = strlen(prefix); + + if (it->buffer == NULL) { + it->buffer_ofs += prefix_len + entry->e_name_len + 1; + return 1; + } + + if (it->buffer_ofs + prefix_len + + entry->e_name_len + 1 > it->buffer_size) + return -ERANGE; + + memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); + it->buffer_ofs += prefix_len; + return 0; +} + +static int xattr_namelist(struct xattr_iter *_it, + unsigned processed, char *buf, unsigned len) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + memcpy(it->buffer + it->buffer_ofs, buf, len); + it->buffer_ofs += len; + return 0; +} + +static int xattr_skipvalue(struct xattr_iter *_it, + unsigned value_sz) +{ + struct listxattr_iter *it = + container_of(_it, struct listxattr_iter, it); + + it->buffer[it->buffer_ofs++] = '\0'; + return 1; +} + +static struct xattr_iter_handlers list_xattr_handlers = { + .entry = xattr_entrylist, + .name = xattr_namelist, + .alloc_buffer = xattr_skipvalue, + .value = NULL +}; + +static int inline_listxattr(struct listxattr_iter *it) +{ + int ret; + unsigned remaining; + + ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry)); + if (ret < 0) + return ret; + + remaining = ret; + while (remaining) { + ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); + if (ret < 0) + break; + } + xattr_iter_end(&it->it, true); + return ret < 0 ? ret : it->buffer_ofs; +} + +static int shared_listxattr(struct listxattr_iter *it) +{ + struct inode *const inode = d_inode(it->dentry); + struct erofs_vnode *const vi = EROFS_V(inode); + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + unsigned i; + int ret = 0; + + for (i = 0; i < vi->xattr_shared_count; ++i) { + erofs_blk_t blkaddr = + xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); + + it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); + if (!i || blkaddr != it->it.blkaddr) { + if (i) + xattr_iter_end(&it->it, true); + + it->it.page = erofs_get_meta_page(inode->i_sb, + blkaddr, false); + BUG_ON(IS_ERR(it->it.page)); + it->it.kaddr = kmap_atomic(it->it.page); + it->it.blkaddr = blkaddr; + } + + ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); + if (ret < 0) + break; + } + if (vi->xattr_shared_count) + xattr_iter_end(&it->it, true); + + return ret < 0 ? 
ret : it->buffer_ofs; +} + +ssize_t erofs_listxattr(struct dentry *dentry, + char *buffer, size_t buffer_size) +{ + int ret; + struct listxattr_iter it; + + init_inode_xattrs(d_inode(dentry)); + + it.dentry = dentry; + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + it.it.sb = dentry->d_sb; + + ret = inline_listxattr(&it); + if (ret < 0 && ret != -ENOATTR) + return ret; + return shared_listxattr(&it); +} + diff --git a/drivers/staging/erofs/xattr.h b/drivers/staging/erofs/xattr.h new file mode 100644 index 000000000000..0c7379282fc5 --- /dev/null +++ b/drivers/staging/erofs/xattr.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * linux/drivers/staging/erofs/xattr.h + * + * Copyright (C) 2017-2018 HUAWEI, Inc. + * http://www.huawei.com/ + * Created by Gao Xiang <gaoxiang25@huawei.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ +#ifndef __EROFS_XATTR_H +#define __EROFS_XATTR_H + +#include "internal.h" +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + +/* Attribute not found */ +#define ENOATTR ENODATA + +static inline unsigned inlinexattr_header_size(struct inode *inode) +{ + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * EROFS_V(inode)->xattr_shared_count; +} + +static inline erofs_blk_t +xattrblock_addr(struct erofs_sb_info *sbi, unsigned xattr_id) +{ +#ifdef CONFIG_EROFS_FS_XATTR + return sbi->xattr_blkaddr + + xattr_id * sizeof(__u32) / EROFS_BLKSIZ; +#else + return 0; +#endif +} + +static inline unsigned +xattrblock_offset(struct erofs_sb_info *sbi, unsigned xattr_id) +{ + return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; +} + +extern const struct xattr_handler erofs_xattr_user_handler; +extern const struct xattr_handler erofs_xattr_trusted_handler; +#ifdef CONFIG_EROFS_FS_SECURITY +extern const struct xattr_handler erofs_xattr_security_handler; +#endif + +static inline const struct xattr_handler *erofs_xattr_handler(unsigned index) +{ +static const struct xattr_handler *xattr_handler_map[] = { + [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &posix_acl_default_xattr_handler, +#endif + [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, +#endif +}; + return index && index < ARRAY_SIZE(xattr_handler_map) ? + xattr_handler_map[index] : NULL; +} + +#ifdef CONFIG_EROFS_FS_XATTR + +extern const struct inode_operations erofs_generic_xattr_iops; +extern const struct inode_operations erofs_dir_xattr_iops; + +int erofs_getxattr(struct inode *, int, const char *, void *, size_t); +ssize_t erofs_listxattr(struct dentry *, char *, size_t); +#else +static int __maybe_unused erofs_getxattr(struct inode *inode, int index, + const char *name, + void *buffer, size_t buffer_size) +{ + return -ENOTSUPP; +} + +static ssize_t __maybe_unused erofs_listxattr(struct dentry *dentry, + char *buffer, size_t buffer_size) +{ + return -ENOTSUPP; +} +#endif + +#endif + |