summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile2
-rw-r--r--fs/btrfs/extent_io.c35
-rw-r--r--fs/btrfs/tree-log.c16
-rw-r--r--fs/cifs/Kconfig2
-rw-r--r--fs/cifs/cifsfs.c1
-rw-r--r--fs/cifs/export.c2
-rw-r--r--fs/cifs/smbencrypt.c18
-rw-r--r--fs/coda/pioctl.c7
-rw-r--r--fs/configfs/configfs_internal.h15
-rw-r--r--fs/configfs/dir.c137
-rw-r--r--fs/configfs/file.c280
-rw-r--r--fs/crypto/Kconfig2
-rw-r--r--fs/crypto/Makefile10
-rw-r--r--fs/crypto/crypto.c45
-rw-r--r--fs/crypto/fname.c47
-rw-r--r--fs/crypto/fscrypt_private.h399
-rw-r--r--fs/crypto/hkdf.c181
-rw-r--r--fs/crypto/hooks.c6
-rw-r--r--fs/crypto/keyinfo.c611
-rw-r--r--fs/crypto/keyring.c984
-rw-r--r--fs/crypto/keysetup.c591
-rw-r--r--fs/crypto/keysetup_v1.c340
-rw-r--r--fs/crypto/policy.c434
-rw-r--r--fs/d_path.c6
-rw-r--r--fs/devpts/inode.c2
-rw-r--r--fs/erofs/Kconfig91
-rw-r--r--fs/erofs/Makefile11
-rw-r--r--fs/erofs/compress.h60
-rw-r--r--fs/erofs/data.c360
-rw-r--r--fs/erofs/decompressor.c338
-rw-r--r--fs/erofs/dir.c142
-rw-r--r--fs/erofs/erofs_fs.h316
-rw-r--r--fs/erofs/inode.c337
-rw-r--r--fs/erofs/internal.h431
-rw-r--r--fs/erofs/namei.c252
-rw-r--r--fs/erofs/super.c615
-rw-r--r--fs/erofs/tagptr.h110
-rw-r--r--fs/erofs/utils.c333
-rw-r--r--fs/erofs/xattr.c704
-rw-r--r--fs/erofs/xattr.h92
-rw-r--r--fs/erofs/zdata.c1431
-rw-r--r--fs/erofs/zdata.h193
-rw-r--r--fs/erofs/zmap.c471
-rw-r--r--fs/erofs/zpvec.h157
-rw-r--r--fs/eventpoll.c4
-rw-r--r--fs/exportfs/expfs.c2
-rw-r--r--fs/ext4/Makefile1
-rw-r--r--fs/ext4/ext4.h23
-rw-r--r--fs/ext4/file.c4
-rw-r--r--fs/ext4/inode.c58
-rw-r--r--fs/ext4/ioctl.c45
-rw-r--r--fs/ext4/readpage.c211
-rw-r--r--fs/ext4/super.c21
-rw-r--r--fs/ext4/sysfs.c6
-rw-r--r--fs/ext4/verity.c367
-rw-r--r--fs/f2fs/Makefile1
-rw-r--r--fs/f2fs/data.c75
-rw-r--r--fs/f2fs/f2fs.h20
-rw-r--r--fs/f2fs/file.c101
-rw-r--r--fs/f2fs/inode.c5
-rw-r--r--fs/f2fs/super.c5
-rw-r--r--fs/f2fs/sysfs.c11
-rw-r--r--fs/f2fs/verity.c247
-rw-r--r--fs/f2fs/xattr.h2
-rw-r--r--fs/fs-writeback.c174
-rw-r--r--fs/fs_context.c4
-rw-r--r--fs/io_uring.c531
-rw-r--r--fs/isofs/export.c2
-rw-r--r--fs/jfs/Kconfig2
-rw-r--r--fs/kernfs/dir.c9
-rw-r--r--fs/locks.c11
-rw-r--r--fs/namei.c64
-rw-r--r--fs/namespace.c21
-rw-r--r--fs/nfs/inode.c18
-rw-r--r--fs/nfs/nfstrace.h2
-rw-r--r--fs/nfsd/nfsctl.c3
-rw-r--r--fs/orangefs/file.c2
-rw-r--r--fs/orangefs/orangefs-kernel.h2
-rw-r--r--fs/proc/root.c3
-rw-r--r--fs/super.c106
-rw-r--r--fs/timerfd.c6
-rw-r--r--fs/ubifs/ioctl.c20
-rw-r--r--fs/ubifs/super.c11
-rw-r--r--fs/ufs/Kconfig2
-rw-r--r--fs/verity/Kconfig55
-rw-r--r--fs/verity/Makefile10
-rw-r--r--fs/verity/enable.c377
-rw-r--r--fs/verity/fsverity_private.h185
-rw-r--r--fs/verity/hash_algs.c280
-rw-r--r--fs/verity/init.c61
-rw-r--r--fs/verity/measure.c57
-rw-r--r--fs/verity/open.c356
-rw-r--r--fs/verity/signature.c157
-rw-r--r--fs/verity/verify.c281
-rw-r--r--fs/xfs/xfs_ioctl.c2
96 files changed, 13233 insertions, 1410 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index bfb1c6095c7a..2501e6f1f965 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -112,6 +112,8 @@ config MANDATORY_FILE_LOCKING
source "fs/crypto/Kconfig"
+source "fs/verity/Kconfig"
+
source "fs/notify/Kconfig"
source "fs/quota/Kconfig"
@@ -261,6 +263,7 @@ source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
+source "fs/erofs/Kconfig"
endif # MISC_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index d60089fd689b..14231b4cf383 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
+obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
@@ -130,3 +131,4 @@ obj-$(CONFIG_F2FS_FS) += f2fs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
+obj-$(CONFIG_EROFS_FS) += erofs/
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1ff438fd5bc2..eeb75281894e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3628,6 +3628,13 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
TASK_UNINTERRUPTIBLE);
}
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
/*
* Lock eb pages and flush the bio if we can't the locks
*
@@ -3699,8 +3706,11 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!trylock_page(p)) {
if (!flush) {
- ret = flush_write_bio(epd);
- if (ret < 0) {
+ int err;
+
+ err = flush_write_bio(epd);
+ if (err < 0) {
+ ret = err;
failed_page_nr = i;
goto err_unlock;
}
@@ -3715,16 +3725,23 @@ err_unlock:
/* Unlock already locked pages */
for (i = 0; i < failed_page_nr; i++)
unlock_page(eb->pages[i]);
+ /*
+ * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
+ * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
+ * be made and undo everything done before.
+ */
+ btrfs_tree_lock(eb);
+ spin_lock(&eb->refs_lock);
+ set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+ end_extent_buffer_writeback(eb);
+ spin_unlock(&eb->refs_lock);
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
+ fs_info->dirty_metadata_batch);
+ btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+ btrfs_tree_unlock(eb);
return ret;
}
-static void end_extent_buffer_writeback(struct extent_buffer *eb)
-{
- clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
-}
-
static void set_btree_ioerr(struct page *page)
{
struct extent_buffer *eb = (struct extent_buffer *)page->private;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6c8297bcfeb7..1bfd7e34f31e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4985,7 +4985,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
0, LLONG_MAX, ctx);
- iput(inode);
+ btrfs_add_delayed_iput(inode);
}
}
continue;
@@ -5000,7 +5000,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
if (ret) {
- iput(inode);
+ btrfs_add_delayed_iput(inode);
continue;
}
@@ -5009,7 +5009,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- iput(inode);
+ btrfs_add_delayed_iput(inode);
continue;
}
@@ -5056,7 +5056,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
}
path->slots[0]++;
}
- iput(inode);
+ btrfs_add_delayed_iput(inode);
}
return ret;
@@ -5689,7 +5689,7 @@ process_leaf:
}
if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
break;
}
@@ -5701,7 +5701,7 @@ process_leaf:
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
ret = 1;
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto next_dir_inode;
if (ctx->log_new_dentries) {
@@ -5848,7 +5848,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (!ret && ctx && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
- iput(dir_inode);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
@@ -5891,7 +5891,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
LOG_INODE_EXISTS,
0, LLONG_MAX, ctx);
- iput(inode);
+ btrfs_add_delayed_iput(inode);
if (ret)
return ret;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index b16219e5dac9..350bc3061656 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -16,7 +16,7 @@ config CIFS
select CRYPTO_GCM
select CRYPTO_ECB
select CRYPTO_AES
- select CRYPTO_DES
+ select CRYPTO_LIB_DES
select KEYS
help
This is the client VFS module for the SMB3 family of NAS protocols,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 3289b566463f..4e2f74894e9b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1601,7 +1601,6 @@ MODULE_DESCRIPTION
("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and "
"also older servers complying with the SNIA CIFS Specification)");
MODULE_VERSION(CIFS_VERSION);
-MODULE_SOFTDEP("pre: des");
MODULE_SOFTDEP("pre: ecb");
MODULE_SOFTDEP("pre: hmac");
MODULE_SOFTDEP("pre: md4");
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index ce8b7f677c58..eb0bb8ca8e63 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -24,7 +24,7 @@
*/
/*
- * See Documentation/filesystems/nfs/Exporting
+ * See Documentation/filesystems/nfs/exporting.rst
* and examples in fs/exportfs
*
* Since cifs is a network file system, an "fsid" must be included for
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 2b6d87bfdf8e..39a938443e3e 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -11,13 +11,14 @@
*/
-#include <linux/crypto.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/fips.h>
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/random.h>
+#include <crypto/des.h>
#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
#include "cifspdu.h"
@@ -58,19 +59,18 @@ static int
smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
{
unsigned char key2[8];
- struct crypto_cipher *tfm_des;
+ struct des_ctx ctx;
str_to_key(key, key2);
- tfm_des = crypto_alloc_cipher("des", 0, 0);
- if (IS_ERR(tfm_des)) {
- cifs_dbg(VFS, "could not allocate des crypto API\n");
- return PTR_ERR(tfm_des);
+ if (fips_enabled) {
+ cifs_dbg(VFS, "FIPS compliance enabled: DES not permitted\n");
+ return -ENOENT;
}
- crypto_cipher_setkey(tfm_des, key2, 8);
- crypto_cipher_encrypt_one(tfm_des, out, in);
- crypto_free_cipher(tfm_des);
+ des_expand_key(&ctx, key2, DES_KEY_SIZE);
+ des_encrypt(&ctx, out, in);
+ memzero_explicit(&ctx, sizeof(ctx));
return 0;
}
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 644d48c12ce8..3aec27e5eb82 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -63,11 +63,8 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
* Look up the pathname. Note that the pathname is in
* user memory, and namei takes care of this
*/
- if (data.follow)
- error = user_path(data.path, &path);
- else
- error = user_lpath(data.path, &path);
-
+ error = user_path_at(AT_FDCWD, data.path,
+ data.follow ? LOOKUP_FOLLOW : 0, &path);
if (error)
return error;
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index f752d83a9c44..520f1813e789 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -20,6 +20,15 @@
#include <linux/list.h>
#include <linux/spinlock.h>
+struct configfs_fragment {
+ atomic_t frag_count;
+ struct rw_semaphore frag_sem;
+ bool frag_dead;
+};
+
+void put_fragment(struct configfs_fragment *);
+struct configfs_fragment *get_fragment(struct configfs_fragment *);
+
struct configfs_dirent {
atomic_t s_count;
int s_dependent_count;
@@ -34,6 +43,7 @@ struct configfs_dirent {
#ifdef CONFIG_LOCKDEP
int s_depth;
#endif
+ struct configfs_fragment *s_frag;
};
#define CONFIGFS_ROOT 0x0001
@@ -61,8 +71,8 @@ extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct in
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
extern int configfs_create_bin_file(struct config_item *,
const struct configfs_bin_attribute *);
-extern int configfs_make_dirent(struct configfs_dirent *,
- struct dentry *, void *, umode_t, int);
+extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *,
+ void *, umode_t, int, struct configfs_fragment *);
extern int configfs_dirent_is_ready(struct configfs_dirent *);
extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
@@ -137,6 +147,7 @@ static inline void release_configfs_dirent(struct configfs_dirent * sd)
{
if (!(sd->s_type & CONFIGFS_ROOT)) {
kfree(sd->s_iattr);
+ put_fragment(sd->s_frag);
kmem_cache_free(configfs_dir_cachep, sd);
}
}
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 92112915de8e..79fc25aaa8cd 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -151,11 +151,38 @@ configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
#endif /* CONFIG_LOCKDEP */
+static struct configfs_fragment *new_fragment(void)
+{
+ struct configfs_fragment *p;
+
+ p = kmalloc(sizeof(struct configfs_fragment), GFP_KERNEL);
+ if (p) {
+ atomic_set(&p->frag_count, 1);
+ init_rwsem(&p->frag_sem);
+ p->frag_dead = false;
+ }
+ return p;
+}
+
+void put_fragment(struct configfs_fragment *frag)
+{
+ if (frag && atomic_dec_and_test(&frag->frag_count))
+ kfree(frag);
+}
+
+struct configfs_fragment *get_fragment(struct configfs_fragment *frag)
+{
+ if (likely(frag))
+ atomic_inc(&frag->frag_count);
+ return frag;
+}
+
/*
* Allocates a new configfs_dirent and links it to the parent configfs_dirent
*/
static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd,
- void *element, int type)
+ void *element, int type,
+ struct configfs_fragment *frag)
{
struct configfs_dirent * sd;
@@ -175,6 +202,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren
kmem_cache_free(configfs_dir_cachep, sd);
return ERR_PTR(-ENOENT);
}
+ sd->s_frag = get_fragment(frag);
list_add(&sd->s_sibling, &parent_sd->s_children);
spin_unlock(&configfs_dirent_lock);
@@ -209,11 +237,11 @@ static int configfs_dirent_exists(struct configfs_dirent *parent_sd,
int configfs_make_dirent(struct configfs_dirent * parent_sd,
struct dentry * dentry, void * element,
- umode_t mode, int type)
+ umode_t mode, int type, struct configfs_fragment *frag)
{
struct configfs_dirent * sd;
- sd = configfs_new_dirent(parent_sd, element, type);
+ sd = configfs_new_dirent(parent_sd, element, type, frag);
if (IS_ERR(sd))
return PTR_ERR(sd);
@@ -260,7 +288,8 @@ static void init_symlink(struct inode * inode)
* until it is validated by configfs_dir_set_ready()
*/
-static int configfs_create_dir(struct config_item *item, struct dentry *dentry)
+static int configfs_create_dir(struct config_item *item, struct dentry *dentry,
+ struct configfs_fragment *frag)
{
int error;
umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
@@ -273,7 +302,8 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry)
return error;
error = configfs_make_dirent(p->d_fsdata, dentry, item, mode,
- CONFIGFS_DIR | CONFIGFS_USET_CREATING);
+ CONFIGFS_DIR | CONFIGFS_USET_CREATING,
+ frag);
if (unlikely(error))
return error;
@@ -338,9 +368,10 @@ int configfs_create_link(struct configfs_symlink *sl,
{
int err = 0;
umode_t mode = S_IFLNK | S_IRWXUGO;
+ struct configfs_dirent *p = parent->d_fsdata;
- err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode,
- CONFIGFS_ITEM_LINK);
+ err = configfs_make_dirent(p, dentry, sl, mode,
+ CONFIGFS_ITEM_LINK, p->s_frag);
if (!err) {
err = configfs_create(dentry, mode, init_symlink);
if (err) {
@@ -599,7 +630,8 @@ static int populate_attrs(struct config_item *item)
static int configfs_attach_group(struct config_item *parent_item,
struct config_item *item,
- struct dentry *dentry);
+ struct dentry *dentry,
+ struct configfs_fragment *frag);
static void configfs_detach_group(struct config_item *item);
static void detach_groups(struct config_group *group)
@@ -647,7 +679,8 @@ static void detach_groups(struct config_group *group)
* try using vfs_mkdir. Just a thought.
*/
static int create_default_group(struct config_group *parent_group,
- struct config_group *group)
+ struct config_group *group,
+ struct configfs_fragment *frag)
{
int ret;
struct configfs_dirent *sd;
@@ -663,7 +696,7 @@ static int create_default_group(struct config_group *parent_group,
d_add(child, NULL);
ret = configfs_attach_group(&parent_group->cg_item,
- &group->cg_item, child);
+ &group->cg_item, child, frag);
if (!ret) {
sd = child->d_fsdata;
sd->s_type |= CONFIGFS_USET_DEFAULT;
@@ -677,13 +710,14 @@ static int create_default_group(struct config_group *parent_group,
return ret;
}
-static int populate_groups(struct config_group *group)
+static int populate_groups(struct config_group *group,
+ struct configfs_fragment *frag)
{
struct config_group *new_group;
int ret = 0;
list_for_each_entry(new_group, &group->default_groups, group_entry) {
- ret = create_default_group(group, new_group);
+ ret = create_default_group(group, new_group, frag);
if (ret) {
detach_groups(group);
break;
@@ -797,11 +831,12 @@ static void link_group(struct config_group *parent_group, struct config_group *g
*/
static int configfs_attach_item(struct config_item *parent_item,
struct config_item *item,
- struct dentry *dentry)
+ struct dentry *dentry,
+ struct configfs_fragment *frag)
{
int ret;
- ret = configfs_create_dir(item, dentry);
+ ret = configfs_create_dir(item, dentry, frag);
if (!ret) {
ret = populate_attrs(item);
if (ret) {
@@ -831,12 +866,13 @@ static void configfs_detach_item(struct config_item *item)
static int configfs_attach_group(struct config_item *parent_item,
struct config_item *item,
- struct dentry *dentry)
+ struct dentry *dentry,
+ struct configfs_fragment *frag)
{
int ret;
struct configfs_dirent *sd;
- ret = configfs_attach_item(parent_item, item, dentry);
+ ret = configfs_attach_item(parent_item, item, dentry, frag);
if (!ret) {
sd = dentry->d_fsdata;
sd->s_type |= CONFIGFS_USET_DIR;
@@ -852,7 +888,7 @@ static int configfs_attach_group(struct config_item *parent_item,
*/
inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
configfs_adjust_dir_dirent_depth_before_populate(sd);
- ret = populate_groups(to_config_group(item));
+ ret = populate_groups(to_config_group(item), frag);
if (ret) {
configfs_detach_item(item);
d_inode(dentry)->i_flags |= S_DEAD;
@@ -1247,6 +1283,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
struct configfs_dirent *sd;
const struct config_item_type *type;
struct module *subsys_owner = NULL, *new_item_owner = NULL;
+ struct configfs_fragment *frag;
char *name;
sd = dentry->d_parent->d_fsdata;
@@ -1265,6 +1302,12 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
goto out;
}
+ frag = new_fragment();
+ if (!frag) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/* Get a working ref for the duration of this function */
parent_item = configfs_get_config_item(dentry->d_parent);
type = parent_item->ci_type;
@@ -1367,9 +1410,9 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
spin_unlock(&configfs_dirent_lock);
if (group)
- ret = configfs_attach_group(parent_item, item, dentry);
+ ret = configfs_attach_group(parent_item, item, dentry, frag);
else
- ret = configfs_attach_item(parent_item, item, dentry);
+ ret = configfs_attach_item(parent_item, item, dentry, frag);
spin_lock(&configfs_dirent_lock);
sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
@@ -1406,6 +1449,7 @@ out_put:
* reference.
*/
config_item_put(parent_item);
+ put_fragment(frag);
out:
return ret;
@@ -1417,6 +1461,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
struct config_item *item;
struct configfs_subsystem *subsys;
struct configfs_dirent *sd;
+ struct configfs_fragment *frag;
struct module *subsys_owner = NULL, *dead_item_owner = NULL;
int ret;
@@ -1474,6 +1519,16 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
}
} while (ret == -EAGAIN);
+ frag = sd->s_frag;
+ if (down_write_killable(&frag->frag_sem)) {
+ spin_lock(&configfs_dirent_lock);
+ configfs_detach_rollback(dentry);
+ spin_unlock(&configfs_dirent_lock);
+ return -EINTR;
+ }
+ frag->frag_dead = true;
+ up_write(&frag->frag_sem);
+
/* Get a working ref for the duration of this function */
item = configfs_get_config_item(dentry);
@@ -1574,7 +1629,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
*/
err = -ENOENT;
if (configfs_dirent_is_ready(parent_sd)) {
- file->private_data = configfs_new_dirent(parent_sd, NULL, 0);
+ file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL);
if (IS_ERR(file->private_data))
err = PTR_ERR(file->private_data);
else
@@ -1732,8 +1787,13 @@ int configfs_register_group(struct config_group *parent_group,
{
struct configfs_subsystem *subsys = parent_group->cg_subsys;
struct dentry *parent;
+ struct configfs_fragment *frag;
int ret;
+ frag = new_fragment();
+ if (!frag)
+ return -ENOMEM;
+
mutex_lock(&subsys->su_mutex);
link_group(parent_group, group);
mutex_unlock(&subsys->su_mutex);
@@ -1741,7 +1801,7 @@ int configfs_register_group(struct config_group *parent_group,
parent = parent_group->cg_item.ci_dentry;
inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
- ret = create_default_group(parent_group, group);
+ ret = create_default_group(parent_group, group, frag);
if (ret)
goto err_out;
@@ -1749,12 +1809,14 @@ int configfs_register_group(struct config_group *parent_group,
configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
spin_unlock(&configfs_dirent_lock);
inode_unlock(d_inode(parent));
+ put_fragment(frag);
return 0;
err_out:
inode_unlock(d_inode(parent));
mutex_lock(&subsys->su_mutex);
unlink_group(group);
mutex_unlock(&subsys->su_mutex);
+ put_fragment(frag);
return ret;
}
EXPORT_SYMBOL(configfs_register_group);
@@ -1770,16 +1832,12 @@ void configfs_unregister_group(struct config_group *group)
struct configfs_subsystem *subsys = group->cg_subsys;
struct dentry *dentry = group->cg_item.ci_dentry;
struct dentry *parent = group->cg_item.ci_parent->ci_dentry;
+ struct configfs_dirent *sd = dentry->d_fsdata;
+ struct configfs_fragment *frag = sd->s_frag;
- mutex_lock(&subsys->su_mutex);
- if (!group->cg_item.ci_parent->ci_group) {
- /*
- * The parent has already been unlinked and detached
- * due to a rmdir.
- */
- goto unlink_group;
- }
- mutex_unlock(&subsys->su_mutex);
+ down_write(&frag->frag_sem);
+ frag->frag_dead = true;
+ up_write(&frag->frag_sem);
inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
spin_lock(&configfs_dirent_lock);
@@ -1796,7 +1854,6 @@ void configfs_unregister_group(struct config_group *group)
dput(dentry);
mutex_lock(&subsys->su_mutex);
-unlink_group:
unlink_group(group);
mutex_unlock(&subsys->su_mutex);
}
@@ -1853,10 +1910,17 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
struct dentry *dentry;
struct dentry *root;
struct configfs_dirent *sd;
+ struct configfs_fragment *frag;
+
+ frag = new_fragment();
+ if (!frag)
+ return -ENOMEM;
root = configfs_pin_fs();
- if (IS_ERR(root))
+ if (IS_ERR(root)) {
+ put_fragment(frag);
return PTR_ERR(root);
+ }
if (!group->cg_item.ci_name)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
@@ -1872,7 +1936,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
d_add(dentry, NULL);
err = configfs_attach_group(sd->s_element, &group->cg_item,
- dentry);
+ dentry, frag);
if (err) {
BUG_ON(d_inode(dentry));
d_drop(dentry);
@@ -1890,6 +1954,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
unlink_group(group);
configfs_release_fs();
}
+ put_fragment(frag);
return err;
}
@@ -1899,12 +1964,18 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
struct config_group *group = &subsys->su_group;
struct dentry *dentry = group->cg_item.ci_dentry;
struct dentry *root = dentry->d_sb->s_root;
+ struct configfs_dirent *sd = dentry->d_fsdata;
+ struct configfs_fragment *frag = sd->s_frag;
if (dentry->d_parent != root) {
pr_err("Tried to unregister non-subsystem!\n");
return;
}
+ down_write(&frag->frag_sem);
+ frag->frag_dead = true;
+ up_write(&frag->frag_sem);
+
inode_lock_nested(d_inode(root),
I_MUTEX_PARENT);
inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 61e4db4390a1..fb65b706cc0d 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -39,40 +39,44 @@ struct configfs_buffer {
bool write_in_progress;
char *bin_buffer;
int bin_buffer_size;
+ int cb_max_size;
+ struct config_item *item;
+ struct module *owner;
+ union {
+ struct configfs_attribute *attr;
+ struct configfs_bin_attribute *bin_attr;
+ };
};
+static inline struct configfs_fragment *to_frag(struct file *file)
+{
+ struct configfs_dirent *sd = file->f_path.dentry->d_fsdata;
-/**
- * fill_read_buffer - allocate and fill buffer from item.
- * @dentry: dentry pointer.
- * @buffer: data buffer for file.
- *
- * Allocate @buffer->page, if it hasn't been already, then call the
- * config_item's show() method to fill the buffer with this attribute's
- * data.
- * This is called only once, on the file's first read.
- */
-static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
+ return sd->s_frag;
+}
+
+static int fill_read_buffer(struct file *file, struct configfs_buffer *buffer)
{
- struct configfs_attribute * attr = to_attr(dentry);
- struct config_item * item = to_item(dentry->d_parent);
- int ret = 0;
- ssize_t count;
+ struct configfs_fragment *frag = to_frag(file);
+ ssize_t count = -ENOENT;
if (!buffer->page)
buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
if (!buffer->page)
return -ENOMEM;
- count = attr->show(item, buffer->page);
-
- BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
- if (count >= 0) {
- buffer->needs_read_fill = 0;
- buffer->count = count;
- } else
- ret = count;
- return ret;
+ down_read(&frag->frag_sem);
+ if (!frag->frag_dead)
+ count = buffer->attr->show(buffer->item, buffer->page);
+ up_read(&frag->frag_sem);
+
+ if (count < 0)
+ return count;
+ if (WARN_ON_ONCE(count > (ssize_t)SIMPLE_ATTR_SIZE))
+ return -EIO;
+ buffer->needs_read_fill = 0;
+ buffer->count = count;
+ return 0;
}
/**
@@ -97,12 +101,13 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
static ssize_t
configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
- struct configfs_buffer * buffer = file->private_data;
+ struct configfs_buffer *buffer = file->private_data;
ssize_t retval = 0;
mutex_lock(&buffer->mutex);
if (buffer->needs_read_fill) {
- if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
+ retval = fill_read_buffer(file, buffer);
+ if (retval)
goto out;
}
pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
@@ -138,10 +143,8 @@ static ssize_t
configfs_read_bin_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ struct configfs_fragment *frag = to_frag(file);
struct configfs_buffer *buffer = file->private_data;
- struct dentry *dentry = file->f_path.dentry;
- struct config_item *item = to_item(dentry->d_parent);
- struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
ssize_t retval = 0;
ssize_t len = min_t(size_t, count, PAGE_SIZE);
@@ -156,14 +159,19 @@ configfs_read_bin_file(struct file *file, char __user *buf,
if (buffer->needs_read_fill) {
/* perform first read with buf == NULL to get extent */
- len = bin_attr->read(item, NULL, 0);
+ down_read(&frag->frag_sem);
+ if (!frag->frag_dead)
+ len = buffer->bin_attr->read(buffer->item, NULL, 0);
+ else
+ len = -ENOENT;
+ up_read(&frag->frag_sem);
if (len <= 0) {
retval = len;
goto out;
}
/* do not exceed the maximum value */
- if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) {
+ if (buffer->cb_max_size && len > buffer->cb_max_size) {
retval = -EFBIG;
goto out;
}
@@ -176,7 +184,13 @@ configfs_read_bin_file(struct file *file, char __user *buf,
buffer->bin_buffer_size = len;
/* perform second read to fill buffer */
- len = bin_attr->read(item, buffer->bin_buffer, len);
+ down_read(&frag->frag_sem);
+ if (!frag->frag_dead)
+ len = buffer->bin_attr->read(buffer->item,
+ buffer->bin_buffer, len);
+ else
+ len = -ENOENT;
+ up_read(&frag->frag_sem);
if (len < 0) {
retval = len;
vfree(buffer->bin_buffer);
@@ -226,25 +240,17 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
return error ? -EFAULT : count;
}
-
-/**
- * flush_write_buffer - push buffer to config_item.
- * @dentry: dentry to the attribute
- * @buffer: data buffer for file.
- * @count: number of bytes
- *
- * Get the correct pointers for the config_item and the attribute we're
- * dealing with, then call the store() method for the attribute,
- * passing the buffer that we acquired in fill_write_buffer().
- */
-
static int
-flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
+flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t count)
{
- struct configfs_attribute * attr = to_attr(dentry);
- struct config_item * item = to_item(dentry->d_parent);
-
- return attr->store(item, buffer->page, count);
+ struct configfs_fragment *frag = to_frag(file);
+ int res = -ENOENT;
+
+ down_read(&frag->frag_sem);
+ if (!frag->frag_dead)
+ res = buffer->attr->store(buffer->item, buffer->page, count);
+ up_read(&frag->frag_sem);
+ return res;
}
@@ -268,13 +274,13 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size
static ssize_t
configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- struct configfs_buffer * buffer = file->private_data;
+ struct configfs_buffer *buffer = file->private_data;
ssize_t len;
mutex_lock(&buffer->mutex);
len = fill_write_buffer(buffer, buf, count);
if (len > 0)
- len = flush_write_buffer(file->f_path.dentry, buffer, len);
+ len = flush_write_buffer(file, buffer, len);
if (len > 0)
*ppos += len;
mutex_unlock(&buffer->mutex);
@@ -299,8 +305,6 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct configfs_buffer *buffer = file->private_data;
- struct dentry *dentry = file->f_path.dentry;
- struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
void *tbuf = NULL;
ssize_t len;
@@ -316,8 +320,8 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
/* buffer grows? */
if (*ppos + count > buffer->bin_buffer_size) {
- if (bin_attr->cb_max_size &&
- *ppos + count > bin_attr->cb_max_size) {
+ if (buffer->cb_max_size &&
+ *ppos + count > buffer->cb_max_size) {
len = -EFBIG;
goto out;
}
@@ -349,31 +353,51 @@ out:
return len;
}
-static int check_perm(struct inode * inode, struct file * file, int type)
+static int __configfs_open_file(struct inode *inode, struct file *file, int type)
{
- struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent);
- struct configfs_attribute * attr = to_attr(file->f_path.dentry);
- struct configfs_bin_attribute *bin_attr = NULL;
- struct configfs_buffer * buffer;
- struct configfs_item_operations * ops = NULL;
- int error = 0;
+ struct dentry *dentry = file->f_path.dentry;
+ struct configfs_fragment *frag = to_frag(file);
+ struct configfs_attribute *attr;
+ struct configfs_buffer *buffer;
+ int error;
- if (!item || !attr)
- goto Einval;
+ error = -ENOMEM;
+ buffer = kzalloc(sizeof(struct configfs_buffer), GFP_KERNEL);
+ if (!buffer)
+ goto out;
- if (type & CONFIGFS_ITEM_BIN_ATTR)
- bin_attr = to_bin_attr(file->f_path.dentry);
+ error = -ENOENT;
+ down_read(&frag->frag_sem);
+ if (unlikely(frag->frag_dead))
+ goto out_free_buffer;
- /* Grab the module reference for this attribute if we have one */
- if (!try_module_get(attr->ca_owner)) {
- error = -ENODEV;
- goto Done;
+ error = -EINVAL;
+ buffer->item = to_item(dentry->d_parent);
+ if (!buffer->item)
+ goto out_free_buffer;
+
+ attr = to_attr(dentry);
+ if (!attr)
+ goto out_put_item;
+
+ if (type & CONFIGFS_ITEM_BIN_ATTR) {
+ buffer->bin_attr = to_bin_attr(dentry);
+ buffer->cb_max_size = buffer->bin_attr->cb_max_size;
+ } else {
+ buffer->attr = attr;
}
- if (item->ci_type)
- ops = item->ci_type->ct_item_ops;
- else
- goto Eaccess;
+ buffer->owner = attr->ca_owner;
+ /* Grab the module reference for this attribute if we have one */
+ error = -ENODEV;
+ if (!try_module_get(buffer->owner))
+ goto out_put_item;
+
+ error = -EACCES;
+ if (!buffer->item->ci_type)
+ goto out_put_module;
+
+ buffer->ops = buffer->item->ci_type->ct_item_ops;
/* File needs write support.
* The inode's perms must say it's ok,
@@ -381,13 +405,11 @@ static int check_perm(struct inode * inode, struct file * file, int type)
*/
if (file->f_mode & FMODE_WRITE) {
if (!(inode->i_mode & S_IWUGO))
- goto Eaccess;
-
+ goto out_put_module;
if ((type & CONFIGFS_ITEM_ATTR) && !attr->store)
- goto Eaccess;
-
- if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write)
- goto Eaccess;
+ goto out_put_module;
+ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->write)
+ goto out_put_module;
}
/* File needs read support.
@@ -396,92 +418,72 @@ static int check_perm(struct inode * inode, struct file * file, int type)
*/
if (file->f_mode & FMODE_READ) {
if (!(inode->i_mode & S_IRUGO))
- goto Eaccess;
-
+ goto out_put_module;
if ((type & CONFIGFS_ITEM_ATTR) && !attr->show)
- goto Eaccess;
-
- if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read)
- goto Eaccess;
+ goto out_put_module;
+ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->read)
+ goto out_put_module;
}
- /* No error? Great, allocate a buffer for the file, and store it
- * it in file->private_data for easy access.
- */
- buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
- if (!buffer) {
- error = -ENOMEM;
- goto Enomem;
- }
mutex_init(&buffer->mutex);
buffer->needs_read_fill = 1;
buffer->read_in_progress = false;
buffer->write_in_progress = false;
- buffer->ops = ops;
file->private_data = buffer;
- goto Done;
+ up_read(&frag->frag_sem);
+ return 0;
- Einval:
- error = -EINVAL;
- goto Done;
- Eaccess:
- error = -EACCES;
- Enomem:
- module_put(attr->ca_owner);
- Done:
- if (error && item)
- config_item_put(item);
+out_put_module:
+ module_put(buffer->owner);
+out_put_item:
+ config_item_put(buffer->item);
+out_free_buffer:
+ up_read(&frag->frag_sem);
+ kfree(buffer);
+out:
return error;
}
static int configfs_release(struct inode *inode, struct file *filp)
{
- struct config_item * item = to_item(filp->f_path.dentry->d_parent);
- struct configfs_attribute * attr = to_attr(filp->f_path.dentry);
- struct module * owner = attr->ca_owner;
- struct configfs_buffer * buffer = filp->private_data;
-
- if (item)
- config_item_put(item);
- /* After this point, attr should not be accessed. */
- module_put(owner);
-
- if (buffer) {
- if (buffer->page)
- free_page((unsigned long)buffer->page);
- mutex_destroy(&buffer->mutex);
- kfree(buffer);
- }
+ struct configfs_buffer *buffer = filp->private_data;
+
+ module_put(buffer->owner);
+ if (buffer->page)
+ free_page((unsigned long)buffer->page);
+ mutex_destroy(&buffer->mutex);
+ kfree(buffer);
return 0;
}
static int configfs_open_file(struct inode *inode, struct file *filp)
{
- return check_perm(inode, filp, CONFIGFS_ITEM_ATTR);
+ return __configfs_open_file(inode, filp, CONFIGFS_ITEM_ATTR);
}
static int configfs_open_bin_file(struct inode *inode, struct file *filp)
{
- return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR);
+ return __configfs_open_file(inode, filp, CONFIGFS_ITEM_BIN_ATTR);
}
-static int configfs_release_bin_file(struct inode *inode, struct file *filp)
+static int configfs_release_bin_file(struct inode *inode, struct file *file)
{
- struct configfs_buffer *buffer = filp->private_data;
- struct dentry *dentry = filp->f_path.dentry;
- struct config_item *item = to_item(dentry->d_parent);
- struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
- ssize_t len = 0;
- int ret;
+ struct configfs_buffer *buffer = file->private_data;
buffer->read_in_progress = false;
if (buffer->write_in_progress) {
+ struct configfs_fragment *frag = to_frag(file);
buffer->write_in_progress = false;
- len = bin_attr->write(item, buffer->bin_buffer,
- buffer->bin_buffer_size);
-
+ down_read(&frag->frag_sem);
+ if (!frag->frag_dead) {
+ /* result of ->release() is ignored */
+ buffer->bin_attr->write(buffer->item,
+ buffer->bin_buffer,
+ buffer->bin_buffer_size);
+ }
+ up_read(&frag->frag_sem);
/* vfree on NULL is safe */
vfree(buffer->bin_buffer);
buffer->bin_buffer = NULL;
@@ -489,10 +491,8 @@ static int configfs_release_bin_file(struct inode *inode, struct file *filp)
buffer->needs_read_fill = 1;
}
- ret = configfs_release(inode, filp);
- if (len < 0)
- return len;
- return ret;
+ configfs_release(inode, file);
+ return 0;
}
@@ -527,7 +527,7 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib
inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL);
error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode,
- CONFIGFS_ITEM_ATTR);
+ CONFIGFS_ITEM_ATTR, parent_sd->s_frag);
inode_unlock(d_inode(dir));
return error;
@@ -549,7 +549,7 @@ int configfs_create_bin_file(struct config_item *item,
inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL);
error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode,
- CONFIGFS_ITEM_BIN_ATTR);
+ CONFIGFS_ITEM_BIN_ATTR, parent_sd->s_frag);
inode_unlock(dir->d_inode);
return error;
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 5fdf24877c17..ff5a1746cbae 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -7,6 +7,8 @@ config FS_ENCRYPTION
select CRYPTO_ECB
select CRYPTO_XTS
select CRYPTO_CTS
+ select CRYPTO_SHA512
+ select CRYPTO_HMAC
select KEYS
help
Enable encryption of files and directories. This
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index 4f0df5e682e4..232e2bb5a337 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -1,5 +1,13 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o
-fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o
+fscrypto-y := crypto.o \
+ fname.o \
+ hkdf.o \
+ hooks.o \
+ keyring.o \
+ keysetup.o \
+ keysetup_v1.o \
+ policy.o
+
fscrypto-$(CONFIG_BLOCK) += bio.o
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 45c3d0427fb2..32a7ad0098cc 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -141,7 +141,7 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
memset(iv, 0, ci->ci_mode->ivsize);
iv->lblk_num = cpu_to_le64(lblk_num);
- if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY)
+ if (fscrypt_is_direct_key_policy(&ci->ci_policy))
memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
if (ci->ci_essiv_tfm != NULL)
@@ -188,10 +188,8 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res) {
- fscrypt_err(inode->i_sb,
- "%scryption failed for inode %lu, block %llu: %d",
- (rw == FS_DECRYPT ? "de" : "en"),
- inode->i_ino, lblk_num, res);
+ fscrypt_err(inode, "%scryption failed for block %llu: %d",
+ (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res);
return res;
}
return 0;
@@ -453,7 +451,7 @@ fail:
return res;
}
-void fscrypt_msg(struct super_block *sb, const char *level,
+void fscrypt_msg(const struct inode *inode, const char *level,
const char *fmt, ...)
{
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -467,8 +465,9 @@ void fscrypt_msg(struct super_block *sb, const char *level,
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- if (sb)
- printk("%sfscrypt (%s): %pV\n", level, sb->s_id, &vaf);
+ if (inode)
+ printk("%sfscrypt (%s, inode %lu): %pV\n",
+ level, inode->i_sb->s_id, inode->i_ino, &vaf);
else
printk("%sfscrypt: %pV\n", level, &vaf);
va_end(args);
@@ -479,6 +478,8 @@ void fscrypt_msg(struct super_block *sb, const char *level,
*/
static int __init fscrypt_init(void)
{
+ int err = -ENOMEM;
+
/*
* Use an unbound workqueue to allow bios to be decrypted in parallel
* even when they happen to complete on the same CPU. This sacrifices
@@ -501,31 +502,19 @@ static int __init fscrypt_init(void)
if (!fscrypt_info_cachep)
goto fail_free_ctx;
+ err = fscrypt_init_keyring();
+ if (err)
+ goto fail_free_info;
+
return 0;
+fail_free_info:
+ kmem_cache_destroy(fscrypt_info_cachep);
fail_free_ctx:
kmem_cache_destroy(fscrypt_ctx_cachep);
fail_free_queue:
destroy_workqueue(fscrypt_read_workqueue);
fail:
- return -ENOMEM;
-}
-module_init(fscrypt_init)
-
-/**
- * fscrypt_exit() - Shutdown the fs encryption system
- */
-static void __exit fscrypt_exit(void)
-{
- fscrypt_destroy();
-
- if (fscrypt_read_workqueue)
- destroy_workqueue(fscrypt_read_workqueue);
- kmem_cache_destroy(fscrypt_ctx_cachep);
- kmem_cache_destroy(fscrypt_info_cachep);
-
- fscrypt_essiv_cleanup();
+ return err;
}
-module_exit(fscrypt_exit);
-
-MODULE_LICENSE("GPL");
+late_initcall(fscrypt_init)
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 00d150ff3033..3da3707c10e3 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -71,9 +71,7 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res < 0) {
- fscrypt_err(inode->i_sb,
- "Filename encryption failed for inode %lu: %d",
- inode->i_ino, res);
+ fscrypt_err(inode, "Filename encryption failed: %d", res);
return res;
}
@@ -117,9 +115,7 @@ static int fname_decrypt(struct inode *inode,
res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
skcipher_request_free(req);
if (res < 0) {
- fscrypt_err(inode->i_sb,
- "Filename decryption failed for inode %lu: %d",
- inode->i_ino, res);
+ fscrypt_err(inode, "Filename decryption failed: %d", res);
return res;
}
@@ -127,44 +123,45 @@ static int fname_decrypt(struct inode *inode,
return 0;
}
-static const char *lookup_table =
+static const char lookup_table[65] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
/**
- * digest_encode() -
+ * base64_encode() -
*
- * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
+ * Encodes the input string using characters from the set [A-Za-z0-9+,].
* The encoded string is roughly 4/3 times the size of the input string.
+ *
+ * Return: length of the encoded string
*/
-static int digest_encode(const char *src, int len, char *dst)
+static int base64_encode(const u8 *src, int len, char *dst)
{
- int i = 0, bits = 0, ac = 0;
+ int i, bits = 0, ac = 0;
char *cp = dst;
- while (i < len) {
- ac += (((unsigned char) src[i]) << bits);
+ for (i = 0; i < len; i++) {
+ ac += src[i] << bits;
bits += 8;
do {
*cp++ = lookup_table[ac & 0x3f];
ac >>= 6;
bits -= 6;
} while (bits >= 6);
- i++;
}
if (bits)
*cp++ = lookup_table[ac & 0x3f];
return cp - dst;
}
-static int digest_decode(const char *src, int len, char *dst)
+static int base64_decode(const char *src, int len, u8 *dst)
{
- int i = 0, bits = 0, ac = 0;
+ int i, bits = 0, ac = 0;
const char *p;
- char *cp = dst;
+ u8 *cp = dst;
- while (i < len) {
+ for (i = 0; i < len; i++) {
p = strchr(lookup_table, src[i]);
if (p == NULL || src[i] == 0)
return -2;
@@ -175,7 +172,6 @@ static int digest_decode(const char *src, int len, char *dst)
ac >>= 8;
bits -= 8;
}
- i++;
}
if (ac)
return -1;
@@ -185,8 +181,9 @@ static int digest_decode(const char *src, int len, char *dst)
bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
u32 max_len, u32 *encrypted_len_ret)
{
- int padding = 4 << (inode->i_crypt_info->ci_flags &
- FS_POLICY_FLAGS_PAD_MASK);
+ const struct fscrypt_info *ci = inode->i_crypt_info;
+ int padding = 4 << (fscrypt_policy_flags(&ci->ci_policy) &
+ FSCRYPT_POLICY_FLAGS_PAD_MASK);
u32 encrypted_len;
if (orig_len > max_len)
@@ -272,7 +269,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
return fname_decrypt(inode, iname, oname);
if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) {
- oname->len = digest_encode(iname->name, iname->len,
+ oname->len = base64_encode(iname->name, iname->len,
oname->name);
return 0;
}
@@ -287,7 +284,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
FSCRYPT_FNAME_DIGEST(iname->name, iname->len),
FSCRYPT_FNAME_DIGEST_SIZE);
oname->name[0] = '_';
- oname->len = 1 + digest_encode((const char *)&digested_name,
+ oname->len = 1 + base64_encode((const u8 *)&digested_name,
sizeof(digested_name), oname->name + 1);
return 0;
}
@@ -380,8 +377,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
if (fname->crypto_buf.name == NULL)
return -ENOMEM;
- ret = digest_decode(iname->name + digested, iname->len - digested,
- fname->crypto_buf.name);
+ ret = base64_decode(iname->name + digested, iname->len - digested,
+ fname->crypto_buf.name);
if (ret < 0) {
ret = -ENOENT;
goto errout;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 8978eec9d766..e84efc01512e 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -4,9 +4,8 @@
*
* Copyright (C) 2015, Google, Inc.
*
- * This contains encryption key functions.
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar.
+ * Heavily modified since then.
*/
#ifndef _FSCRYPT_PRIVATE_H
@@ -15,30 +14,133 @@
#include <linux/fscrypt.h>
#include <crypto/hash.h>
-/* Encryption parameters */
+#define CONST_STRLEN(str) (sizeof(str) - 1)
+
#define FS_KEY_DERIVATION_NONCE_SIZE 16
-/**
- * Encryption context for inode
- *
- * Protector format:
- * 1 byte: Protector format (1 = this version)
- * 1 byte: File contents encryption mode
- * 1 byte: File names encryption mode
- * 1 byte: Flags
- * 8 bytes: Master Key descriptor
- * 16 bytes: Encryption Key derivation nonce
- */
-struct fscrypt_context {
- u8 format;
+#define FSCRYPT_MIN_KEY_SIZE 16
+
+#define FSCRYPT_CONTEXT_V1 1
+#define FSCRYPT_CONTEXT_V2 2
+
+struct fscrypt_context_v1 {
+ u8 version; /* FSCRYPT_CONTEXT_V1 */
u8 contents_encryption_mode;
u8 filenames_encryption_mode;
u8 flags;
- u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
-} __packed;
+};
-#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
+struct fscrypt_context_v2 {
+ u8 version; /* FSCRYPT_CONTEXT_V2 */
+ u8 contents_encryption_mode;
+ u8 filenames_encryption_mode;
+ u8 flags;
+ u8 __reserved[4];
+ u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
+ u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+};
+
+/**
+ * fscrypt_context - the encryption context of an inode
+ *
+ * This is the on-disk equivalent of an fscrypt_policy, stored alongside each
+ * encrypted file usually in a hidden extended attribute. It contains the
+ * fields from the fscrypt_policy, in order to identify the encryption algorithm
+ * and key with which the file is encrypted. It also contains a nonce that was
+ * randomly generated by fscrypt itself; this is used as KDF input or as a tweak
+ * to cause different files to be encrypted differently.
+ */
+union fscrypt_context {
+ u8 version;
+ struct fscrypt_context_v1 v1;
+ struct fscrypt_context_v2 v2;
+};
+
+/*
+ * Return the size expected for the given fscrypt_context based on its version
+ * number, or 0 if the context version is unrecognized.
+ */
+static inline int fscrypt_context_size(const union fscrypt_context *ctx)
+{
+ switch (ctx->version) {
+ case FSCRYPT_CONTEXT_V1:
+ BUILD_BUG_ON(sizeof(ctx->v1) != 28);
+ return sizeof(ctx->v1);
+ case FSCRYPT_CONTEXT_V2:
+ BUILD_BUG_ON(sizeof(ctx->v2) != 40);
+ return sizeof(ctx->v2);
+ }
+ return 0;
+}
+
+#undef fscrypt_policy
+union fscrypt_policy {
+ u8 version;
+ struct fscrypt_policy_v1 v1;
+ struct fscrypt_policy_v2 v2;
+};
+
+/*
+ * Return the size expected for the given fscrypt_policy based on its version
+ * number, or 0 if the policy version is unrecognized.
+ */
+static inline int fscrypt_policy_size(const union fscrypt_policy *policy)
+{
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ return sizeof(policy->v1);
+ case FSCRYPT_POLICY_V2:
+ return sizeof(policy->v2);
+ }
+ return 0;
+}
+
+/* Return the contents encryption mode of a valid encryption policy */
+static inline u8
+fscrypt_policy_contents_mode(const union fscrypt_policy *policy)
+{
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ return policy->v1.contents_encryption_mode;
+ case FSCRYPT_POLICY_V2:
+ return policy->v2.contents_encryption_mode;
+ }
+ BUG();
+}
+
+/* Return the filenames encryption mode of a valid encryption policy */
+static inline u8
+fscrypt_policy_fnames_mode(const union fscrypt_policy *policy)
+{
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ return policy->v1.filenames_encryption_mode;
+ case FSCRYPT_POLICY_V2:
+ return policy->v2.filenames_encryption_mode;
+ }
+ BUG();
+}
+
+/* Return the flags (FSCRYPT_POLICY_FLAG*) of a valid encryption policy */
+static inline u8
+fscrypt_policy_flags(const union fscrypt_policy *policy)
+{
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ return policy->v1.flags;
+ case FSCRYPT_POLICY_V2:
+ return policy->v2.flags;
+ }
+ BUG();
+}
+
+static inline bool
+fscrypt_is_direct_key_policy(const union fscrypt_policy *policy)
+{
+ return fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAG_DIRECT_KEY;
+}
/**
* For encrypted symlinks, the ciphertext length is stored at the beginning
@@ -68,23 +170,37 @@ struct fscrypt_info {
struct crypto_cipher *ci_essiv_tfm;
/*
- * Encryption mode used for this inode. It corresponds to either
- * ci_data_mode or ci_filename_mode, depending on the inode type.
+ * Encryption mode used for this inode. It corresponds to either the
+ * contents or filenames encryption mode, depending on the inode type.
*/
struct fscrypt_mode *ci_mode;
+ /* Back-pointer to the inode */
+ struct inode *ci_inode;
+
+ /*
+ * The master key with which this inode was unlocked (decrypted). This
+ * will be NULL if the master key was found in a process-subscribed
+ * keyring rather than in the filesystem-level keyring.
+ */
+ struct key *ci_master_key;
+
+ /*
+ * Link in list of inodes that were unlocked with the master key.
+ * Only used when ->ci_master_key is set.
+ */
+ struct list_head ci_master_key_link;
+
/*
- * If non-NULL, then this inode uses a master key directly rather than a
- * derived key, and ci_ctfm will equal ci_master_key->mk_ctfm.
- * Otherwise, this inode uses a derived key.
+ * If non-NULL, then encryption is done using the master key directly
+ * and ci_ctfm will equal ci_direct_key->dk_ctfm.
*/
- struct fscrypt_master_key *ci_master_key;
+ struct fscrypt_direct_key *ci_direct_key;
- /* fields from the fscrypt_context */
- u8 ci_data_mode;
- u8 ci_filename_mode;
- u8 ci_flags;
- u8 ci_master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ /* The encryption policy used by this inode */
+ union fscrypt_policy ci_policy;
+
+ /* This inode's nonce, copied from the fscrypt_context */
u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE];
};
@@ -98,16 +214,16 @@ typedef enum {
static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
u32 filenames_mode)
{
- if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC &&
- filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS)
+ if (contents_mode == FSCRYPT_MODE_AES_128_CBC &&
+ filenames_mode == FSCRYPT_MODE_AES_128_CTS)
return true;
- if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS &&
- filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
+ if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
+ filenames_mode == FSCRYPT_MODE_AES_256_CTS)
return true;
- if (contents_mode == FS_ENCRYPTION_MODE_ADIANTUM &&
- filenames_mode == FS_ENCRYPTION_MODE_ADIANTUM)
+ if (contents_mode == FSCRYPT_MODE_ADIANTUM &&
+ filenames_mode == FSCRYPT_MODE_ADIANTUM)
return true;
return false;
@@ -125,12 +241,12 @@ extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
extern const struct dentry_operations fscrypt_d_ops;
extern void __printf(3, 4) __cold
-fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...);
+fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);
-#define fscrypt_warn(sb, fmt, ...) \
- fscrypt_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
-#define fscrypt_err(sb, fmt, ...) \
- fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
+#define fscrypt_warn(inode, fmt, ...) \
+ fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__)
+#define fscrypt_err(inode, fmt, ...) \
+ fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__)
#define FSCRYPT_MAX_IV_SIZE 32
@@ -155,7 +271,172 @@ extern bool fscrypt_fname_encrypted_size(const struct inode *inode,
u32 orig_len, u32 max_len,
u32 *encrypted_len_ret);
-/* keyinfo.c */
+/* hkdf.c */
+
+struct fscrypt_hkdf {
+ struct crypto_shash *hmac_tfm;
+};
+
+extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
+ unsigned int master_key_size);
+
+/*
+ * The list of contexts in which fscrypt uses HKDF. These values are used as
+ * the first byte of the HKDF application-specific info string to guarantee that
+ * info strings are never repeated between contexts. This ensures that all HKDF
+ * outputs are unique and cryptographically isolated, i.e. knowledge of one
+ * output doesn't reveal another.
+ */
+#define HKDF_CONTEXT_KEY_IDENTIFIER 1
+#define HKDF_CONTEXT_PER_FILE_KEY 2
+#define HKDF_CONTEXT_PER_MODE_KEY 3
+
+extern int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context,
+ const u8 *info, unsigned int infolen,
+ u8 *okm, unsigned int okmlen);
+
+extern void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
+
+/* keyring.c */
+
+/*
+ * fscrypt_master_key_secret - secret key material of an in-use master key
+ */
+struct fscrypt_master_key_secret {
+
+ /*
+ * For v2 policy keys: HKDF context keyed by this master key.
+ * For v1 policy keys: not set (hkdf.hmac_tfm == NULL).
+ */
+ struct fscrypt_hkdf hkdf;
+
+ /* Size of the raw key in bytes. Set even if ->raw isn't set. */
+ u32 size;
+
+ /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */
+ u8 raw[FSCRYPT_MAX_KEY_SIZE];
+
+} __randomize_layout;
+
+/*
+ * fscrypt_master_key - an in-use master key
+ *
+ * This represents a master encryption key which has been added to the
+ * filesystem and can be used to "unlock" the encrypted files which were
+ * encrypted with it.
+ */
+struct fscrypt_master_key {
+
+ /*
+ * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is
+ * executed, this is wiped and no new inodes can be unlocked with this
+ * key; however, there may still be inodes in ->mk_decrypted_inodes
+ * which could not be evicted. As long as some inodes still remain,
+ * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or
+ * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again.
+ *
+ * Locking: protected by key->sem (outer) and mk_secret_sem (inner).
+ * The reason for two locks is that key->sem also protects modifying
+ * mk_users, which ranks it above the semaphore for the keyring key
+ * type, which is in turn above page faults (via keyring_read). But
+ * sometimes filesystems call fscrypt_get_encryption_info() from within
+ * a transaction, which ranks it below page faults. So we need a
+ * separate lock which protects mk_secret but not also mk_users.
+ */
+ struct fscrypt_master_key_secret mk_secret;
+ struct rw_semaphore mk_secret_sem;
+
+ /*
+ * For v1 policy keys: an arbitrary key descriptor which was assigned by
+ * userspace (->descriptor).
+ *
+ * For v2 policy keys: a cryptographic hash of this key (->identifier).
+ */
+ struct fscrypt_key_specifier mk_spec;
+
+ /*
+ * Keyring which contains a key of type 'key_type_fscrypt_user' for each
+ * user who has added this key. Normally each key will be added by just
+ * one user, but it's possible that multiple users share a key, and in
+ * that case we need to keep track of those users so that one user can't
+ * remove the key before the others want it removed too.
+ *
+ * This is NULL for v1 policy keys; those can only be added by root.
+ *
+ * Locking: in addition to this keyrings own semaphore, this is
+ * protected by the master key's key->sem, so we can do atomic
+ * search+insert. It can also be searched without taking any locks, but
+ * in that case the returned key may have already been removed.
+ */
+ struct key *mk_users;
+
+ /*
+ * Length of ->mk_decrypted_inodes, plus one if mk_secret is present.
+ * Once this goes to 0, the master key is removed from ->s_master_keys.
+ * The 'struct fscrypt_master_key' will continue to live as long as the
+ * 'struct key' whose payload it is, but we won't let this reference
+ * count rise again.
+ */
+ refcount_t mk_refcount;
+
+ /*
+ * List of inodes that were unlocked using this key. This allows the
+ * inodes to be evicted efficiently if the key is removed.
+ */
+ struct list_head mk_decrypted_inodes;
+ spinlock_t mk_decrypted_inodes_lock;
+
+ /* Per-mode tfms for DIRECT_KEY policies, allocated on-demand */
+ struct crypto_skcipher *mk_mode_keys[__FSCRYPT_MODE_MAX + 1];
+
+} __randomize_layout;
+
+static inline bool
+is_master_key_secret_present(const struct fscrypt_master_key_secret *secret)
+{
+ /*
+ * The READ_ONCE() is only necessary for fscrypt_drop_inode() and
+ * fscrypt_key_describe(). These run in atomic context, so they can't
+ * take ->mk_secret_sem and thus 'secret' can change concurrently which
+ * would be a data race. But they only need to know whether the secret
+ * *was* present at the time of check, so READ_ONCE() suffices.
+ */
+ return READ_ONCE(secret->size) != 0;
+}
+
+static inline const char *master_key_spec_type(
+ const struct fscrypt_key_specifier *spec)
+{
+ switch (spec->type) {
+ case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+ return "descriptor";
+ case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
+ return "identifier";
+ }
+ return "[unknown]";
+}
+
+static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec)
+{
+ switch (spec->type) {
+ case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+ return FSCRYPT_KEY_DESCRIPTOR_SIZE;
+ case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
+ return FSCRYPT_KEY_IDENTIFIER_SIZE;
+ }
+ return 0;
+}
+
+extern struct key *
+fscrypt_find_master_key(struct super_block *sb,
+ const struct fscrypt_key_specifier *mk_spec);
+
+extern int fscrypt_verify_key_added(struct super_block *sb,
+ const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);
+
+extern int __init fscrypt_init_keyring(void);
+
+/* keysetup.c */
struct fscrypt_mode {
const char *friendly_name;
@@ -166,6 +447,36 @@ struct fscrypt_mode {
bool needs_essiv;
};
-extern void __exit fscrypt_essiv_cleanup(void);
+static inline bool
+fscrypt_mode_supports_direct_key(const struct fscrypt_mode *mode)
+{
+ return mode->ivsize >= offsetofend(union fscrypt_iv, nonce);
+}
+
+extern struct crypto_skcipher *
+fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
+ const struct inode *inode);
+
+extern int fscrypt_set_derived_key(struct fscrypt_info *ci,
+ const u8 *derived_key);
+
+/* keysetup_v1.c */
+
+extern void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
+
+extern int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
+ const u8 *raw_master_key);
+
+extern int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
+ struct fscrypt_info *ci);
+/* policy.c */
+
+extern bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
+ const union fscrypt_policy *policy2);
+extern bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
+ const struct inode *inode);
+extern int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
+ const union fscrypt_context *ctx_u,
+ int ctx_size);
#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
new file mode 100644
index 000000000000..f21873e1b467
--- /dev/null
+++ b/fs/crypto/hkdf.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation
+ * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010):
+ * "Cryptographic Extraction and Key Derivation: The HKDF Scheme".
+ *
+ * This is used to derive keys from the fscrypt master keys.
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+
+#include "fscrypt_private.h"
+
+/*
+ * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses
+ * SHA-512 because it is reasonably secure and efficient; and since it produces
+ * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of
+ * entropy from the master key and requires only one iteration of HKDF-Expand.
+ */
+#define HKDF_HMAC_ALG "hmac(sha512)"
+#define HKDF_HASHLEN SHA512_DIGEST_SIZE
+
+/*
+ * HKDF consists of two steps:
+ *
+ * 1. HKDF-Extract: extract a pseudorandom key of length HKDF_HASHLEN bytes from
+ * the input keying material and optional salt.
+ * 2. HKDF-Expand: expand the pseudorandom key into output keying material of
+ * any length, parameterized by an application-specific info string.
+ *
+ * HKDF-Extract can be skipped if the input is already a pseudorandom key of
+ * length HKDF_HASHLEN bytes. However, cipher modes other than AES-256-XTS take
+ * shorter keys, and we don't want to force users of those modes to provide
+ * unnecessarily long master keys. Thus fscrypt still does HKDF-Extract. No
+ * salt is used, since fscrypt master keys should already be pseudorandom and
+ * there's no way to persist a random salt per master key from kernel mode.
+ */
+
+/* HKDF-Extract (RFC 5869 section 2.2), unsalted */
+static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm,
+ unsigned int ikmlen, u8 prk[HKDF_HASHLEN])
+{
+ static const u8 default_salt[HKDF_HASHLEN];
+ SHASH_DESC_ON_STACK(desc, hmac_tfm);
+ int err;
+
+ err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN);
+ if (err)
+ return err;
+
+ desc->tfm = hmac_tfm;
+ err = crypto_shash_digest(desc, ikm, ikmlen, prk);
+ shash_desc_zero(desc);
+ return err;
+}
+
+/*
+ * Compute HKDF-Extract using the given master key as the input keying material,
+ * and prepare an HMAC transform object keyed by the resulting pseudorandom key.
+ *
+ * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many
+ * times without having to recompute HKDF-Extract each time.
+ */
+int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
+ unsigned int master_key_size)
+{
+ struct crypto_shash *hmac_tfm;
+ u8 prk[HKDF_HASHLEN];
+ int err;
+
+ hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0);
+ if (IS_ERR(hmac_tfm)) {
+ fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld",
+ PTR_ERR(hmac_tfm));
+ return PTR_ERR(hmac_tfm);
+ }
+
+ if (WARN_ON(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) {
+ err = -EINVAL;
+ goto err_free_tfm;
+ }
+
+ err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk);
+ if (err)
+ goto err_free_tfm;
+
+ err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk));
+ if (err)
+ goto err_free_tfm;
+
+ hkdf->hmac_tfm = hmac_tfm;
+ goto out;
+
+err_free_tfm:
+ crypto_free_shash(hmac_tfm);
+out:
+ memzero_explicit(prk, sizeof(prk));
+ return err;
+}
+
+/*
+ * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which
+ * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen'
+ * bytes of output keying material parameterized by the application-specific
+ * 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context'
+ * byte. This is thread-safe and may be called by multiple threads in parallel.
+ *
+ * ('context' isn't part of the HKDF specification; it's just a prefix fscrypt
+ * adds to its application-specific info strings to guarantee that it doesn't
+ * accidentally repeat an info string when using HKDF for different purposes.)
+ */
+int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context,
+ const u8 *info, unsigned int infolen,
+ u8 *okm, unsigned int okmlen)
+{
+ SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm);
+ u8 prefix[9];
+ unsigned int i;
+ int err;
+ const u8 *prev = NULL;
+ u8 counter = 1;
+ u8 tmp[HKDF_HASHLEN];
+
+ if (WARN_ON(okmlen > 255 * HKDF_HASHLEN))
+ return -EINVAL;
+
+ desc->tfm = hkdf->hmac_tfm;
+
+ memcpy(prefix, "fscrypt\0", 8);
+ prefix[8] = context;
+
+ for (i = 0; i < okmlen; i += HKDF_HASHLEN) {
+
+ err = crypto_shash_init(desc);
+ if (err)
+ goto out;
+
+ if (prev) {
+ err = crypto_shash_update(desc, prev, HKDF_HASHLEN);
+ if (err)
+ goto out;
+ }
+
+ err = crypto_shash_update(desc, prefix, sizeof(prefix));
+ if (err)
+ goto out;
+
+ err = crypto_shash_update(desc, info, infolen);
+ if (err)
+ goto out;
+
+ BUILD_BUG_ON(sizeof(counter) != 1);
+ if (okmlen - i < HKDF_HASHLEN) {
+ err = crypto_shash_finup(desc, &counter, 1, tmp);
+ if (err)
+ goto out;
+ memcpy(&okm[i], tmp, okmlen - i);
+ memzero_explicit(tmp, sizeof(tmp));
+ } else {
+ err = crypto_shash_finup(desc, &counter, 1, &okm[i]);
+ if (err)
+ goto out;
+ }
+ counter++;
+ prev = &okm[i];
+ }
+ err = 0;
+out:
+ if (unlikely(err))
+ memzero_explicit(okm, okmlen); /* so caller doesn't need to */
+ shash_desc_zero(desc);
+ return err;
+}
+
+void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf)
+{
+ crypto_free_shash(hkdf->hmac_tfm);
+}
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index c1d6715d88e9..bb3b7fcfdd48 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -39,9 +39,9 @@ int fscrypt_file_open(struct inode *inode, struct file *filp)
dir = dget_parent(file_dentry(filp));
if (IS_ENCRYPTED(d_inode(dir)) &&
!fscrypt_has_permitted_context(d_inode(dir), inode)) {
- fscrypt_warn(inode->i_sb,
- "inconsistent encryption contexts: %lu/%lu",
- d_inode(dir)->i_ino, inode->i_ino);
+ fscrypt_warn(inode,
+ "Inconsistent encryption context (parent directory: %lu)",
+ d_inode(dir)->i_ino);
err = -EPERM;
}
dput(dir);
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
deleted file mode 100644
index 207ebed918c1..000000000000
--- a/fs/crypto/keyinfo.c
+++ /dev/null
@@ -1,611 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * key management facility for FS encryption support.
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption key functions.
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
- */
-
-#include <keys/user-type.h>
-#include <linux/hashtable.h>
-#include <linux/scatterlist.h>
-#include <crypto/aes.h>
-#include <crypto/algapi.h>
-#include <crypto/sha.h>
-#include <crypto/skcipher.h>
-#include "fscrypt_private.h"
-
-static struct crypto_shash *essiv_hash_tfm;
-
-/* Table of keys referenced by FS_POLICY_FLAG_DIRECT_KEY policies */
-static DEFINE_HASHTABLE(fscrypt_master_keys, 6); /* 6 bits = 64 buckets */
-static DEFINE_SPINLOCK(fscrypt_master_keys_lock);
-
-/*
- * Key derivation function. This generates the derived key by encrypting the
- * master key with AES-128-ECB using the inode's nonce as the AES key.
- *
- * The master key must be at least as long as the derived key. If the master
- * key is longer, then only the first 'derived_keysize' bytes are used.
- */
-static int derive_key_aes(const u8 *master_key,
- const struct fscrypt_context *ctx,
- u8 *derived_key, unsigned int derived_keysize)
-{
- int res = 0;
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
- struct scatterlist src_sg, dst_sg;
- struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
-
- if (IS_ERR(tfm)) {
- res = PTR_ERR(tfm);
- tfm = NULL;
- goto out;
- }
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- res = -ENOMEM;
- goto out;
- }
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
- res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce));
- if (res < 0)
- goto out;
-
- sg_init_one(&src_sg, master_key, derived_keysize);
- sg_init_one(&dst_sg, derived_key, derived_keysize);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
- NULL);
- res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
-out:
- skcipher_request_free(req);
- crypto_free_skcipher(tfm);
- return res;
-}
-
-/*
- * Search the current task's subscribed keyrings for a "logon" key with
- * description prefix:descriptor, and if found acquire a read lock on it and
- * return a pointer to its validated payload in *payload_ret.
- */
-static struct key *
-find_and_lock_process_key(const char *prefix,
- const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE],
- unsigned int min_keysize,
- const struct fscrypt_key **payload_ret)
-{
- char *description;
- struct key *key;
- const struct user_key_payload *ukp;
- const struct fscrypt_key *payload;
-
- description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
- FS_KEY_DESCRIPTOR_SIZE, descriptor);
- if (!description)
- return ERR_PTR(-ENOMEM);
-
- key = request_key(&key_type_logon, description, NULL);
- kfree(description);
- if (IS_ERR(key))
- return key;
-
- down_read(&key->sem);
- ukp = user_key_payload_locked(key);
-
- if (!ukp) /* was the key revoked before we acquired its semaphore? */
- goto invalid;
-
- payload = (const struct fscrypt_key *)ukp->data;
-
- if (ukp->datalen != sizeof(struct fscrypt_key) ||
- payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) {
- fscrypt_warn(NULL,
- "key with description '%s' has invalid payload",
- key->description);
- goto invalid;
- }
-
- if (payload->size < min_keysize) {
- fscrypt_warn(NULL,
- "key with description '%s' is too short (got %u bytes, need %u+ bytes)",
- key->description, payload->size, min_keysize);
- goto invalid;
- }
-
- *payload_ret = payload;
- return key;
-
-invalid:
- up_read(&key->sem);
- key_put(key);
- return ERR_PTR(-ENOKEY);
-}
-
-static struct fscrypt_mode available_modes[] = {
- [FS_ENCRYPTION_MODE_AES_256_XTS] = {
- .friendly_name = "AES-256-XTS",
- .cipher_str = "xts(aes)",
- .keysize = 64,
- .ivsize = 16,
- },
- [FS_ENCRYPTION_MODE_AES_256_CTS] = {
- .friendly_name = "AES-256-CTS-CBC",
- .cipher_str = "cts(cbc(aes))",
- .keysize = 32,
- .ivsize = 16,
- },
- [FS_ENCRYPTION_MODE_AES_128_CBC] = {
- .friendly_name = "AES-128-CBC",
- .cipher_str = "cbc(aes)",
- .keysize = 16,
- .ivsize = 16,
- .needs_essiv = true,
- },
- [FS_ENCRYPTION_MODE_AES_128_CTS] = {
- .friendly_name = "AES-128-CTS-CBC",
- .cipher_str = "cts(cbc(aes))",
- .keysize = 16,
- .ivsize = 16,
- },
- [FS_ENCRYPTION_MODE_ADIANTUM] = {
- .friendly_name = "Adiantum",
- .cipher_str = "adiantum(xchacha12,aes)",
- .keysize = 32,
- .ivsize = 32,
- },
-};
-
-static struct fscrypt_mode *
-select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode)
-{
- if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) {
- fscrypt_warn(inode->i_sb,
- "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)",
- inode->i_ino, ci->ci_data_mode,
- ci->ci_filename_mode);
- return ERR_PTR(-EINVAL);
- }
-
- if (S_ISREG(inode->i_mode))
- return &available_modes[ci->ci_data_mode];
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- return &available_modes[ci->ci_filename_mode];
-
- WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
- inode->i_ino, (inode->i_mode & S_IFMT));
- return ERR_PTR(-EINVAL);
-}
-
-/* Find the master key, then derive the inode's actual encryption key */
-static int find_and_derive_key(const struct inode *inode,
- const struct fscrypt_context *ctx,
- u8 *derived_key, const struct fscrypt_mode *mode)
-{
- struct key *key;
- const struct fscrypt_key *payload;
- int err;
-
- key = find_and_lock_process_key(FS_KEY_DESC_PREFIX,
- ctx->master_key_descriptor,
- mode->keysize, &payload);
- if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
- key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix,
- ctx->master_key_descriptor,
- mode->keysize, &payload);
- }
- if (IS_ERR(key))
- return PTR_ERR(key);
-
- if (ctx->flags & FS_POLICY_FLAG_DIRECT_KEY) {
- if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) {
- fscrypt_warn(inode->i_sb,
- "direct key mode not allowed with %s",
- mode->friendly_name);
- err = -EINVAL;
- } else if (ctx->contents_encryption_mode !=
- ctx->filenames_encryption_mode) {
- fscrypt_warn(inode->i_sb,
- "direct key mode not allowed with different contents and filenames modes");
- err = -EINVAL;
- } else {
- memcpy(derived_key, payload->raw, mode->keysize);
- err = 0;
- }
- } else {
- err = derive_key_aes(payload->raw, ctx, derived_key,
- mode->keysize);
- }
- up_read(&key->sem);
- key_put(key);
- return err;
-}
-
-/* Allocate and key a symmetric cipher object for the given encryption mode */
-static struct crypto_skcipher *
-allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
- const struct inode *inode)
-{
- struct crypto_skcipher *tfm;
- int err;
-
- tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
- if (IS_ERR(tfm)) {
- fscrypt_warn(inode->i_sb,
- "error allocating '%s' transform for inode %lu: %ld",
- mode->cipher_str, inode->i_ino, PTR_ERR(tfm));
- return tfm;
- }
- if (unlikely(!mode->logged_impl_name)) {
- /*
- * fscrypt performance can vary greatly depending on which
- * crypto algorithm implementation is used. Help people debug
- * performance problems by logging the ->cra_driver_name the
- * first time a mode is used. Note that multiple threads can
- * race here, but it doesn't really matter.
- */
- mode->logged_impl_name = true;
- pr_info("fscrypt: %s using implementation \"%s\"\n",
- mode->friendly_name,
- crypto_skcipher_alg(tfm)->base.cra_driver_name);
- }
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
- if (err)
- goto err_free_tfm;
-
- return tfm;
-
-err_free_tfm:
- crypto_free_skcipher(tfm);
- return ERR_PTR(err);
-}
-
-/* Master key referenced by FS_POLICY_FLAG_DIRECT_KEY policy */
-struct fscrypt_master_key {
- struct hlist_node mk_node;
- refcount_t mk_refcount;
- const struct fscrypt_mode *mk_mode;
- struct crypto_skcipher *mk_ctfm;
- u8 mk_descriptor[FS_KEY_DESCRIPTOR_SIZE];
- u8 mk_raw[FS_MAX_KEY_SIZE];
-};
-
-static void free_master_key(struct fscrypt_master_key *mk)
-{
- if (mk) {
- crypto_free_skcipher(mk->mk_ctfm);
- kzfree(mk);
- }
-}
-
-static void put_master_key(struct fscrypt_master_key *mk)
-{
- if (!refcount_dec_and_lock(&mk->mk_refcount, &fscrypt_master_keys_lock))
- return;
- hash_del(&mk->mk_node);
- spin_unlock(&fscrypt_master_keys_lock);
-
- free_master_key(mk);
-}
-
-/*
- * Find/insert the given master key into the fscrypt_master_keys table. If
- * found, it is returned with elevated refcount, and 'to_insert' is freed if
- * non-NULL. If not found, 'to_insert' is inserted and returned if it's
- * non-NULL; otherwise NULL is returned.
- */
-static struct fscrypt_master_key *
-find_or_insert_master_key(struct fscrypt_master_key *to_insert,
- const u8 *raw_key, const struct fscrypt_mode *mode,
- const struct fscrypt_info *ci)
-{
- unsigned long hash_key;
- struct fscrypt_master_key *mk;
-
- /*
- * Careful: to avoid potentially leaking secret key bytes via timing
- * information, we must key the hash table by descriptor rather than by
- * raw key, and use crypto_memneq() when comparing raw keys.
- */
-
- BUILD_BUG_ON(sizeof(hash_key) > FS_KEY_DESCRIPTOR_SIZE);
- memcpy(&hash_key, ci->ci_master_key_descriptor, sizeof(hash_key));
-
- spin_lock(&fscrypt_master_keys_lock);
- hash_for_each_possible(fscrypt_master_keys, mk, mk_node, hash_key) {
- if (memcmp(ci->ci_master_key_descriptor, mk->mk_descriptor,
- FS_KEY_DESCRIPTOR_SIZE) != 0)
- continue;
- if (mode != mk->mk_mode)
- continue;
- if (crypto_memneq(raw_key, mk->mk_raw, mode->keysize))
- continue;
- /* using existing tfm with same (descriptor, mode, raw_key) */
- refcount_inc(&mk->mk_refcount);
- spin_unlock(&fscrypt_master_keys_lock);
- free_master_key(to_insert);
- return mk;
- }
- if (to_insert)
- hash_add(fscrypt_master_keys, &to_insert->mk_node, hash_key);
- spin_unlock(&fscrypt_master_keys_lock);
- return to_insert;
-}
-
-/* Prepare to encrypt directly using the master key in the given mode */
-static struct fscrypt_master_key *
-fscrypt_get_master_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode,
- const u8 *raw_key, const struct inode *inode)
-{
- struct fscrypt_master_key *mk;
- int err;
-
- /* Is there already a tfm for this key? */
- mk = find_or_insert_master_key(NULL, raw_key, mode, ci);
- if (mk)
- return mk;
-
- /* Nope, allocate one. */
- mk = kzalloc(sizeof(*mk), GFP_NOFS);
- if (!mk)
- return ERR_PTR(-ENOMEM);
- refcount_set(&mk->mk_refcount, 1);
- mk->mk_mode = mode;
- mk->mk_ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
- if (IS_ERR(mk->mk_ctfm)) {
- err = PTR_ERR(mk->mk_ctfm);
- mk->mk_ctfm = NULL;
- goto err_free_mk;
- }
- memcpy(mk->mk_descriptor, ci->ci_master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE);
- memcpy(mk->mk_raw, raw_key, mode->keysize);
-
- return find_or_insert_master_key(mk, raw_key, mode, ci);
-
-err_free_mk:
- free_master_key(mk);
- return ERR_PTR(err);
-}
-
-static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
-{
- struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm);
-
- /* init hash transform on demand */
- if (unlikely(!tfm)) {
- struct crypto_shash *prev_tfm;
-
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- fscrypt_warn(NULL,
- "error allocating SHA-256 transform: %ld",
- PTR_ERR(tfm));
- return PTR_ERR(tfm);
- }
- prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm);
- if (prev_tfm) {
- crypto_free_shash(tfm);
- tfm = prev_tfm;
- }
- }
-
- {
- SHASH_DESC_ON_STACK(desc, tfm);
- desc->tfm = tfm;
-
- return crypto_shash_digest(desc, key, keysize, salt);
- }
-}
-
-static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
- int keysize)
-{
- int err;
- struct crypto_cipher *essiv_tfm;
- u8 salt[SHA256_DIGEST_SIZE];
-
- essiv_tfm = crypto_alloc_cipher("aes", 0, 0);
- if (IS_ERR(essiv_tfm))
- return PTR_ERR(essiv_tfm);
-
- ci->ci_essiv_tfm = essiv_tfm;
-
- err = derive_essiv_salt(raw_key, keysize, salt);
- if (err)
- goto out;
-
- /*
- * Using SHA256 to derive the salt/key will result in AES-256 being
- * used for IV generation. File contents encryption will still use the
- * configured keysize (AES-128) nevertheless.
- */
- err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt));
- if (err)
- goto out;
-
-out:
- memzero_explicit(salt, sizeof(salt));
- return err;
-}
-
-void __exit fscrypt_essiv_cleanup(void)
-{
- crypto_free_shash(essiv_hash_tfm);
-}
-
-/*
- * Given the encryption mode and key (normally the derived key, but for
- * FS_POLICY_FLAG_DIRECT_KEY mode it's the master key), set up the inode's
- * symmetric cipher transform object(s).
- */
-static int setup_crypto_transform(struct fscrypt_info *ci,
- struct fscrypt_mode *mode,
- const u8 *raw_key, const struct inode *inode)
-{
- struct fscrypt_master_key *mk;
- struct crypto_skcipher *ctfm;
- int err;
-
- if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY) {
- mk = fscrypt_get_master_key(ci, mode, raw_key, inode);
- if (IS_ERR(mk))
- return PTR_ERR(mk);
- ctfm = mk->mk_ctfm;
- } else {
- mk = NULL;
- ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
- if (IS_ERR(ctfm))
- return PTR_ERR(ctfm);
- }
- ci->ci_master_key = mk;
- ci->ci_ctfm = ctfm;
-
- if (mode->needs_essiv) {
- /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
- WARN_ON(mode->ivsize != AES_BLOCK_SIZE);
- WARN_ON(ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY);
-
- err = init_essiv_generator(ci, raw_key, mode->keysize);
- if (err) {
- fscrypt_warn(inode->i_sb,
- "error initializing ESSIV generator for inode %lu: %d",
- inode->i_ino, err);
- return err;
- }
- }
- return 0;
-}
-
-static void put_crypt_info(struct fscrypt_info *ci)
-{
- if (!ci)
- return;
-
- if (ci->ci_master_key) {
- put_master_key(ci->ci_master_key);
- } else {
- crypto_free_skcipher(ci->ci_ctfm);
- crypto_free_cipher(ci->ci_essiv_tfm);
- }
- kmem_cache_free(fscrypt_info_cachep, ci);
-}
-
-int fscrypt_get_encryption_info(struct inode *inode)
-{
- struct fscrypt_info *crypt_info;
- struct fscrypt_context ctx;
- struct fscrypt_mode *mode;
- u8 *raw_key = NULL;
- int res;
-
- if (fscrypt_has_encryption_key(inode))
- return 0;
-
- res = fscrypt_initialize(inode->i_sb->s_cop->flags);
- if (res)
- return res;
-
- res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
- if (res < 0) {
- if (!fscrypt_dummy_context_enabled(inode) ||
- IS_ENCRYPTED(inode))
- return res;
- /* Fake up a context for an unencrypted directory */
- memset(&ctx, 0, sizeof(ctx));
- ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
- ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
- ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
- memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
- } else if (res != sizeof(ctx)) {
- return -EINVAL;
- }
-
- if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
- return -EINVAL;
-
- if (ctx.flags & ~FS_POLICY_FLAGS_VALID)
- return -EINVAL;
-
- crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS);
- if (!crypt_info)
- return -ENOMEM;
-
- crypt_info->ci_flags = ctx.flags;
- crypt_info->ci_data_mode = ctx.contents_encryption_mode;
- crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
- memcpy(crypt_info->ci_master_key_descriptor, ctx.master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE);
- memcpy(crypt_info->ci_nonce, ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
-
- mode = select_encryption_mode(crypt_info, inode);
- if (IS_ERR(mode)) {
- res = PTR_ERR(mode);
- goto out;
- }
- WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
- crypt_info->ci_mode = mode;
-
- /*
- * This cannot be a stack buffer because it may be passed to the
- * scatterlist crypto API as part of key derivation.
- */
- res = -ENOMEM;
- raw_key = kmalloc(mode->keysize, GFP_NOFS);
- if (!raw_key)
- goto out;
-
- res = find_and_derive_key(inode, &ctx, raw_key, mode);
- if (res)
- goto out;
-
- res = setup_crypto_transform(crypt_info, mode, raw_key, inode);
- if (res)
- goto out;
-
- if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL)
- crypt_info = NULL;
-out:
- if (res == -ENOKEY)
- res = 0;
- put_crypt_info(crypt_info);
- kzfree(raw_key);
- return res;
-}
-EXPORT_SYMBOL(fscrypt_get_encryption_info);
-
-/**
- * fscrypt_put_encryption_info - free most of an inode's fscrypt data
- *
- * Free the inode's fscrypt_info. Filesystems must call this when the inode is
- * being evicted. An RCU grace period need not have elapsed yet.
- */
-void fscrypt_put_encryption_info(struct inode *inode)
-{
- put_crypt_info(inode->i_crypt_info);
- inode->i_crypt_info = NULL;
-}
-EXPORT_SYMBOL(fscrypt_put_encryption_info);
-
-/**
- * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay
- *
- * Free the inode's cached decrypted symlink target, if any. Filesystems must
- * call this after an RCU grace period, just before they free the inode.
- */
-void fscrypt_free_inode(struct inode *inode)
-{
- if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) {
- kfree(inode->i_link);
- inode->i_link = NULL;
- }
-}
-EXPORT_SYMBOL(fscrypt_free_inode);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
new file mode 100644
index 000000000000..c34fa7c61b43
--- /dev/null
+++ b/fs/crypto/keyring.c
@@ -0,0 +1,984 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Filesystem-level keyring for fscrypt
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * This file implements management of fscrypt master keys in the
+ * filesystem-level keyring, including the ioctls:
+ *
+ * - FS_IOC_ADD_ENCRYPTION_KEY
+ * - FS_IOC_REMOVE_ENCRYPTION_KEY
+ * - FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS
+ * - FS_IOC_GET_ENCRYPTION_KEY_STATUS
+ *
+ * See the "User API" section of Documentation/filesystems/fscrypt.rst for more
+ * information about these ioctls.
+ */
+
+#include <crypto/skcipher.h>
+#include <linux/key-type.h>
+#include <linux/seq_file.h>
+
+#include "fscrypt_private.h"
+
+static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret)
+{
+ fscrypt_destroy_hkdf(&secret->hkdf);
+ memzero_explicit(secret, sizeof(*secret));
+}
+
+static void move_master_key_secret(struct fscrypt_master_key_secret *dst,
+ struct fscrypt_master_key_secret *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+ memzero_explicit(src, sizeof(*src));
+}
+
+static void free_master_key(struct fscrypt_master_key *mk)
+{
+ size_t i;
+
+ wipe_master_key_secret(&mk->mk_secret);
+
+ for (i = 0; i < ARRAY_SIZE(mk->mk_mode_keys); i++)
+ crypto_free_skcipher(mk->mk_mode_keys[i]);
+
+ key_put(mk->mk_users);
+ kzfree(mk);
+}
+
+static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
+{
+ if (spec->__reserved)
+ return false;
+ return master_key_spec_len(spec) != 0;
+}
+
+static int fscrypt_key_instantiate(struct key *key,
+ struct key_preparsed_payload *prep)
+{
+ key->payload.data[0] = (struct fscrypt_master_key *)prep->data;
+ return 0;
+}
+
+static void fscrypt_key_destroy(struct key *key)
+{
+ free_master_key(key->payload.data[0]);
+}
+
+static void fscrypt_key_describe(const struct key *key, struct seq_file *m)
+{
+ seq_puts(m, key->description);
+
+ if (key_is_positive(key)) {
+ const struct fscrypt_master_key *mk = key->payload.data[0];
+
+ if (!is_master_key_secret_present(&mk->mk_secret))
+ seq_puts(m, ": secret removed");
+ }
+}
+
+/*
+ * Type of key in ->s_master_keys. Each key of this type represents a master
+ * key which has been added to the filesystem. Its payload is a
+ * 'struct fscrypt_master_key'. The "." prefix in the key type name prevents
+ * users from adding keys of this type via the keyrings syscalls rather than via
+ * the intended method of FS_IOC_ADD_ENCRYPTION_KEY.
+ */
+static struct key_type key_type_fscrypt = {
+ .name = "._fscrypt",
+ .instantiate = fscrypt_key_instantiate,
+ .destroy = fscrypt_key_destroy,
+ .describe = fscrypt_key_describe,
+};
+
+static int fscrypt_user_key_instantiate(struct key *key,
+ struct key_preparsed_payload *prep)
+{
+ /*
+ * We just charge FSCRYPT_MAX_KEY_SIZE bytes to the user's key quota for
+ * each key, regardless of the exact key size. The amount of memory
+ * actually used is greater than the size of the raw key anyway.
+ */
+ return key_payload_reserve(key, FSCRYPT_MAX_KEY_SIZE);
+}
+
+static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m)
+{
+ seq_puts(m, key->description);
+}
+
+/*
+ * Type of key in ->mk_users. Each key of this type represents a particular
+ * user who has added a particular master key.
+ *
+ * Note that the name of this key type really should be something like
+ * ".fscrypt-user" instead of simply ".fscrypt". But the shorter name is chosen
+ * mainly for simplicity of presentation in /proc/keys when read by a non-root
+ * user. And it is expected to be rare that a key is actually added by multiple
+ * users, since users should keep their encryption keys confidential.
+ */
+static struct key_type key_type_fscrypt_user = {
+ .name = ".fscrypt",
+ .instantiate = fscrypt_user_key_instantiate,
+ .describe = fscrypt_user_key_describe,
+};
+
+/* Search ->s_master_keys or ->mk_users */
+static struct key *search_fscrypt_keyring(struct key *keyring,
+ struct key_type *type,
+ const char *description)
+{
+ /*
+ * We need to mark the keyring reference as "possessed" so that we
+ * acquire permission to search it, via the KEY_POS_SEARCH permission.
+ */
+ key_ref_t keyref = make_key_ref(keyring, true /* possessed */);
+
+ keyref = keyring_search(keyref, type, description, false);
+ if (IS_ERR(keyref)) {
+ if (PTR_ERR(keyref) == -EAGAIN || /* not found */
+ PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */
+ keyref = ERR_PTR(-ENOKEY);
+ return ERR_CAST(keyref);
+ }
+ return key_ref_to_ptr(keyref);
+}
+
+#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \
+ (CONST_STRLEN("fscrypt-") + FIELD_SIZEOF(struct super_block, s_id))
+
+#define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1)
+
+#define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \
+ (CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \
+ CONST_STRLEN("-users") + 1)
+
+#define FSCRYPT_MK_USER_DESCRIPTION_SIZE \
+ (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1)
+
+static void format_fs_keyring_description(
+ char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE],
+ const struct super_block *sb)
+{
+ sprintf(description, "fscrypt-%s", sb->s_id);
+}
+
+static void format_mk_description(
+ char description[FSCRYPT_MK_DESCRIPTION_SIZE],
+ const struct fscrypt_key_specifier *mk_spec)
+{
+ sprintf(description, "%*phN",
+ master_key_spec_len(mk_spec), (u8 *)&mk_spec->u);
+}
+
+static void format_mk_users_keyring_description(
+ char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE],
+ const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
+{
+ sprintf(description, "fscrypt-%*phN-users",
+ FSCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier);
+}
+
+static void format_mk_user_description(
+ char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE],
+ const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
+{
+
+ sprintf(description, "%*phN.uid.%u", FSCRYPT_KEY_IDENTIFIER_SIZE,
+ mk_identifier, __kuid_val(current_fsuid()));
+}
+
+/* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */
+static int allocate_filesystem_keyring(struct super_block *sb)
+{
+ char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE];
+ struct key *keyring;
+
+ if (sb->s_master_keys)
+ return 0;
+
+ format_fs_keyring_description(description, sb);
+ keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ current_cred(), KEY_POS_SEARCH |
+ KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+ if (IS_ERR(keyring))
+ return PTR_ERR(keyring);
+
+ /* Pairs with READ_ONCE() in fscrypt_find_master_key() */
+ smp_store_release(&sb->s_master_keys, keyring);
+ return 0;
+}
+
+void fscrypt_sb_free(struct super_block *sb)
+{
+ key_put(sb->s_master_keys);
+ sb->s_master_keys = NULL;
+}
+
+/*
+ * Find the specified master key in ->s_master_keys.
+ * Returns ERR_PTR(-ENOKEY) if not found.
+ */
+struct key *fscrypt_find_master_key(struct super_block *sb,
+ const struct fscrypt_key_specifier *mk_spec)
+{
+ struct key *keyring;
+ char description[FSCRYPT_MK_DESCRIPTION_SIZE];
+
+ /* pairs with smp_store_release() in allocate_filesystem_keyring() */
+ keyring = READ_ONCE(sb->s_master_keys);
+ if (keyring == NULL)
+ return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */
+
+ format_mk_description(description, mk_spec);
+ return search_fscrypt_keyring(keyring, &key_type_fscrypt, description);
+}
+
+static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk)
+{
+ char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE];
+ struct key *keyring;
+
+ format_mk_users_keyring_description(description,
+ mk->mk_spec.u.identifier);
+ keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ current_cred(), KEY_POS_SEARCH |
+ KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+ if (IS_ERR(keyring))
+ return PTR_ERR(keyring);
+
+ mk->mk_users = keyring;
+ return 0;
+}
+
+/*
+ * Find the current user's "key" in the master key's ->mk_users.
+ * Returns ERR_PTR(-ENOKEY) if not found.
+ */
+static struct key *find_master_key_user(struct fscrypt_master_key *mk)
+{
+ char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE];
+
+ format_mk_user_description(description, mk->mk_spec.u.identifier);
+ return search_fscrypt_keyring(mk->mk_users, &key_type_fscrypt_user,
+ description);
+}
+
+/*
+ * Give the current user a "key" in ->mk_users. This charges the user's quota
+ * and marks the master key as added by the current user, so that it cannot be
+ * removed by another user with the key. Either the master key's key->sem must
+ * be held for write, or the master key must be still undergoing initialization.
+ */
+static int add_master_key_user(struct fscrypt_master_key *mk)
+{
+ char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE];
+ struct key *mk_user;
+ int err;
+
+ format_mk_user_description(description, mk->mk_spec.u.identifier);
+ mk_user = key_alloc(&key_type_fscrypt_user, description,
+ current_fsuid(), current_gid(), current_cred(),
+ KEY_POS_SEARCH | KEY_USR_VIEW, 0, NULL);
+ if (IS_ERR(mk_user))
+ return PTR_ERR(mk_user);
+
+ err = key_instantiate_and_link(mk_user, NULL, 0, mk->mk_users, NULL);
+ key_put(mk_user);
+ return err;
+}
+
+/*
+ * Remove the current user's "key" from ->mk_users.
+ * The master key's key->sem must be held for write.
+ *
+ * Returns 0 if removed, -ENOKEY if not found, or another -errno code.
+ */
+static int remove_master_key_user(struct fscrypt_master_key *mk)
+{
+ struct key *mk_user;
+ int err;
+
+ mk_user = find_master_key_user(mk);
+ if (IS_ERR(mk_user))
+ return PTR_ERR(mk_user);
+ err = key_unlink(mk->mk_users, mk_user);
+ key_put(mk_user);
+ return err;
+}
+
+/*
+ * Allocate a new fscrypt_master_key which contains the given secret, set it as
+ * the payload of a new 'struct key' of type fscrypt, and link the 'struct key'
+ * into the given keyring. Synchronized by fscrypt_add_key_mutex.
+ */
+static int add_new_master_key(struct fscrypt_master_key_secret *secret,
+ const struct fscrypt_key_specifier *mk_spec,
+ struct key *keyring)
+{
+ struct fscrypt_master_key *mk;
+ char description[FSCRYPT_MK_DESCRIPTION_SIZE];
+ struct key *key;
+ int err;
+
+ mk = kzalloc(sizeof(*mk), GFP_KERNEL);
+ if (!mk)
+ return -ENOMEM;
+
+ mk->mk_spec = *mk_spec;
+
+ move_master_key_secret(&mk->mk_secret, secret);
+ init_rwsem(&mk->mk_secret_sem);
+
+ refcount_set(&mk->mk_refcount, 1); /* secret is present */
+ INIT_LIST_HEAD(&mk->mk_decrypted_inodes);
+ spin_lock_init(&mk->mk_decrypted_inodes_lock);
+
+ if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) {
+ err = allocate_master_key_users_keyring(mk);
+ if (err)
+ goto out_free_mk;
+ err = add_master_key_user(mk);
+ if (err)
+ goto out_free_mk;
+ }
+
+ /*
+ * Note that we don't charge this key to anyone's quota, since when
+ * ->mk_users is in use those keys are charged instead, and otherwise
+ * (when ->mk_users isn't in use) only root can add these keys.
+ */
+ format_mk_description(description, mk_spec);
+ key = key_alloc(&key_type_fscrypt, description,
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
+ KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto out_free_mk;
+ }
+ err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL);
+ key_put(key);
+ if (err)
+ goto out_free_mk;
+
+ return 0;
+
+out_free_mk:
+ free_master_key(mk);
+ return err;
+}
+
+#define KEY_DEAD 1
+
+static int add_existing_master_key(struct fscrypt_master_key *mk,
+ struct fscrypt_master_key_secret *secret)
+{
+ struct key *mk_user;
+ bool rekey;
+ int err;
+
+ /*
+ * If the current user is already in ->mk_users, then there's nothing to
+ * do. (Not applicable for v1 policy keys, which have NULL ->mk_users.)
+ */
+ if (mk->mk_users) {
+ mk_user = find_master_key_user(mk);
+ if (mk_user != ERR_PTR(-ENOKEY)) {
+ if (IS_ERR(mk_user))
+ return PTR_ERR(mk_user);
+ key_put(mk_user);
+ return 0;
+ }
+ }
+
+ /* If we'll be re-adding ->mk_secret, try to take the reference. */
+ rekey = !is_master_key_secret_present(&mk->mk_secret);
+ if (rekey && !refcount_inc_not_zero(&mk->mk_refcount))
+ return KEY_DEAD;
+
+ /* Add the current user to ->mk_users, if applicable. */
+ if (mk->mk_users) {
+ err = add_master_key_user(mk);
+ if (err) {
+ if (rekey && refcount_dec_and_test(&mk->mk_refcount))
+ return KEY_DEAD;
+ return err;
+ }
+ }
+
+ /* Re-add the secret if needed. */
+ if (rekey) {
+ down_write(&mk->mk_secret_sem);
+ move_master_key_secret(&mk->mk_secret, secret);
+ up_write(&mk->mk_secret_sem);
+ }
+ return 0;
+}
+
+static int add_master_key(struct super_block *sb,
+ struct fscrypt_master_key_secret *secret,
+ const struct fscrypt_key_specifier *mk_spec)
+{
+ static DEFINE_MUTEX(fscrypt_add_key_mutex);
+ struct key *key;
+ int err;
+
+ mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */
+retry:
+ key = fscrypt_find_master_key(sb, mk_spec);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ if (err != -ENOKEY)
+ goto out_unlock;
+ /* Didn't find the key in ->s_master_keys. Add it. */
+ err = allocate_filesystem_keyring(sb);
+ if (err)
+ goto out_unlock;
+ err = add_new_master_key(secret, mk_spec, sb->s_master_keys);
+ } else {
+ /*
+ * Found the key in ->s_master_keys. Re-add the secret if
+ * needed, and add the user to ->mk_users if needed.
+ */
+ down_write(&key->sem);
+ err = add_existing_master_key(key->payload.data[0], secret);
+ up_write(&key->sem);
+ if (err == KEY_DEAD) {
+ /* Key being removed or needs to be removed */
+ key_invalidate(key);
+ key_put(key);
+ goto retry;
+ }
+ key_put(key);
+ }
+out_unlock:
+ mutex_unlock(&fscrypt_add_key_mutex);
+ return err;
+}
+
+/*
+ * Add a master encryption key to the filesystem, causing all files which were
+ * encrypted with it to appear "unlocked" (decrypted) when accessed.
+ *
+ * When adding a key for use by v1 encryption policies, this ioctl is
+ * privileged, and userspace must provide the 'key_descriptor'.
+ *
+ * When adding a key for use by v2+ encryption policies, this ioctl is
+ * unprivileged. This is needed, in general, to allow non-root users to use
+ * encryption without encountering the visibility problems of process-subscribed
+ * keyrings and the inability to properly remove keys. This works by having
+ * each key identified by its cryptographically secure hash --- the
+ * 'key_identifier'. The cryptographic hash ensures that a malicious user
+ * cannot add the wrong key for a given identifier. Furthermore, each added key
+ * is charged to the appropriate user's quota for the keyrings service, which
+ * prevents a malicious user from adding too many keys. Finally, we forbid a
+ * user from removing a key while other users have added it too, which prevents
+ * a user who knows another user's key from causing a denial-of-service by
+ * removing it at an inopportune time. (We tolerate that a user who knows a key
+ * can prevent other users from removing it.)
+ *
+ * For more details, see the "FS_IOC_ADD_ENCRYPTION_KEY" section of
+ * Documentation/filesystems/fscrypt.rst.
+ */
+int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg)
+{
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct fscrypt_add_key_arg __user *uarg = _uarg;
+ struct fscrypt_add_key_arg arg;
+ struct fscrypt_master_key_secret secret;
+ int err;
+
+ if (copy_from_user(&arg, uarg, sizeof(arg)))
+ return -EFAULT;
+
+ if (!valid_key_spec(&arg.key_spec))
+ return -EINVAL;
+
+ if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE ||
+ arg.raw_size > FSCRYPT_MAX_KEY_SIZE)
+ return -EINVAL;
+
+ if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved)))
+ return -EINVAL;
+
+ memset(&secret, 0, sizeof(secret));
+ secret.size = arg.raw_size;
+ err = -EFAULT;
+ if (copy_from_user(secret.raw, uarg->raw, secret.size))
+ goto out_wipe_secret;
+
+ switch (arg.key_spec.type) {
+ case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+ /*
+ * Only root can add keys that are identified by an arbitrary
+ * descriptor rather than by a cryptographic hash --- since
+ * otherwise a malicious user could add the wrong key.
+ */
+ err = -EACCES;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out_wipe_secret;
+ break;
+ case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
+ err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size);
+ if (err)
+ goto out_wipe_secret;
+
+ /*
+ * Now that the HKDF context is initialized, the raw key is no
+ * longer needed.
+ */
+ memzero_explicit(secret.raw, secret.size);
+
+ /* Calculate the key identifier and return it to userspace. */
+ err = fscrypt_hkdf_expand(&secret.hkdf,
+ HKDF_CONTEXT_KEY_IDENTIFIER,
+ NULL, 0, arg.key_spec.u.identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE);
+ if (err)
+ goto out_wipe_secret;
+ err = -EFAULT;
+ if (copy_to_user(uarg->key_spec.u.identifier,
+ arg.key_spec.u.identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE))
+ goto out_wipe_secret;
+ break;
+ default:
+ WARN_ON(1);
+ err = -EINVAL;
+ goto out_wipe_secret;
+ }
+
+ err = add_master_key(sb, &secret, &arg.key_spec);
+out_wipe_secret:
+ wipe_master_key_secret(&secret);
+ return err;
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key);
+
+/*
+ * Verify that the current user has added a master key with the given identifier
+ * (returns -ENOKEY if not). This is needed to prevent a user from encrypting
+ * their files using some other user's key which they don't actually know.
+ * Cryptographically this isn't much of a problem, but the semantics of this
+ * would be a bit weird, so it's best to just forbid it.
+ *
+ * The system administrator (CAP_FOWNER) can override this, which should be
+ * enough for any use cases where encryption policies are being set using keys
+ * that were chosen ahead of time but aren't available at the moment.
+ *
+ * Note that the key may have already removed by the time this returns, but
+ * that's okay; we just care whether the key was there at some point.
+ *
+ * Return: 0 if the key is added, -ENOKEY if it isn't, or another -errno code
+ */
+int fscrypt_verify_key_added(struct super_block *sb,
+ const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
+{
+ struct fscrypt_key_specifier mk_spec;
+ struct key *key, *mk_user;
+ struct fscrypt_master_key *mk;
+ int err;
+
+ mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+ memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE);
+
+ key = fscrypt_find_master_key(sb, &mk_spec);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto out;
+ }
+ mk = key->payload.data[0];
+ mk_user = find_master_key_user(mk);
+ if (IS_ERR(mk_user)) {
+ err = PTR_ERR(mk_user);
+ } else {
+ key_put(mk_user);
+ err = 0;
+ }
+ key_put(key);
+out:
+ if (err == -ENOKEY && capable(CAP_FOWNER))
+ err = 0;
+ return err;
+}
+
+/*
+ * Try to evict the inode's dentries from the dentry cache. If the inode is a
+ * directory, then it can have at most one dentry; however, that dentry may be
+ * pinned by child dentries, so first try to evict the children too.
+ */
+static void shrink_dcache_inode(struct inode *inode)
+{
+ struct dentry *dentry;
+
+ if (S_ISDIR(inode->i_mode)) {
+ dentry = d_find_any_alias(inode);
+ if (dentry) {
+ shrink_dcache_parent(dentry);
+ dput(dentry);
+ }
+ }
+ d_prune_aliases(inode);
+}
+
+static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk)
+{
+ struct fscrypt_info *ci;
+ struct inode *inode;
+ struct inode *toput_inode = NULL;
+
+ spin_lock(&mk->mk_decrypted_inodes_lock);
+
+ list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) {
+ inode = ci->ci_inode;
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&mk->mk_decrypted_inodes_lock);
+
+ shrink_dcache_inode(inode);
+ iput(toput_inode);
+ toput_inode = inode;
+
+ spin_lock(&mk->mk_decrypted_inodes_lock);
+ }
+
+ spin_unlock(&mk->mk_decrypted_inodes_lock);
+ iput(toput_inode);
+}
+
+static int check_for_busy_inodes(struct super_block *sb,
+ struct fscrypt_master_key *mk)
+{
+ struct list_head *pos;
+ size_t busy_count = 0;
+ unsigned long ino;
+ struct dentry *dentry;
+ char _path[256];
+ char *path = NULL;
+
+ spin_lock(&mk->mk_decrypted_inodes_lock);
+
+ list_for_each(pos, &mk->mk_decrypted_inodes)
+ busy_count++;
+
+ if (busy_count == 0) {
+ spin_unlock(&mk->mk_decrypted_inodes_lock);
+ return 0;
+ }
+
+ {
+ /* select an example file to show for debugging purposes */
+ struct inode *inode =
+ list_first_entry(&mk->mk_decrypted_inodes,
+ struct fscrypt_info,
+ ci_master_key_link)->ci_inode;
+ ino = inode->i_ino;
+ dentry = d_find_alias(inode);
+ }
+ spin_unlock(&mk->mk_decrypted_inodes_lock);
+
+ if (dentry) {
+ path = dentry_path(dentry, _path, sizeof(_path));
+ dput(dentry);
+ }
+ if (IS_ERR_OR_NULL(path))
+ path = "(unknown)";
+
+ fscrypt_warn(NULL,
+ "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu (%s)",
+ sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec),
+ master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u,
+ ino, path);
+ return -EBUSY;
+}
+
+static int try_to_lock_encrypted_files(struct super_block *sb,
+ struct fscrypt_master_key *mk)
+{
+ int err1;
+ int err2;
+
+ /*
+ * An inode can't be evicted while it is dirty or has dirty pages.
+ * Thus, we first have to clean the inodes in ->mk_decrypted_inodes.
+ *
+ * Just do it the easy way: call sync_filesystem(). It's overkill, but
+ * it works, and it's more important to minimize the amount of caches we
+ * drop than the amount of data we sync. Also, unprivileged users can
+ * already call sync_filesystem() via sys_syncfs() or sys_sync().
+ */
+ down_read(&sb->s_umount);
+ err1 = sync_filesystem(sb);
+ up_read(&sb->s_umount);
+ /* If a sync error occurs, still try to evict as much as possible. */
+
+ /*
+ * Inodes are pinned by their dentries, so we have to evict their
+ * dentries. shrink_dcache_sb() would suffice, but would be overkill
+ * and inappropriate for use by unprivileged users. So instead go
+ * through the inodes' alias lists and try to evict each dentry.
+ */
+ evict_dentries_for_decrypted_inodes(mk);
+
+ /*
+ * evict_dentries_for_decrypted_inodes() already iput() each inode in
+ * the list; any inodes for which that dropped the last reference will
+ * have been evicted due to fscrypt_drop_inode() detecting the key
+ * removal and telling the VFS to evict the inode. So to finish, we
+ * just need to check whether any inodes couldn't be evicted.
+ */
+ err2 = check_for_busy_inodes(sb, mk);
+
+ return err1 ?: err2;
+}
+
+/*
+ * Try to remove an fscrypt master encryption key.
+ *
+ * FS_IOC_REMOVE_ENCRYPTION_KEY (all_users=false) removes the current user's
+ * claim to the key, then removes the key itself if no other users have claims.
+ * FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the
+ * key itself.
+ *
+ * To "remove the key itself", first we wipe the actual master key secret, so
+ * that no more inodes can be unlocked with it. Then we try to evict all cached
+ * inodes that had been unlocked with the key.
+ *
+ * If all inodes were evicted, then we unlink the fscrypt_master_key from the
+ * keyring. Otherwise it remains in the keyring in the "incompletely removed"
+ * state (without the actual secret key) where it tracks the list of remaining
+ * inodes. Userspace can execute the ioctl again later to retry eviction, or
+ * alternatively can re-add the secret key again.
+ *
+ * For more details, see the "Removing keys" section of
+ * Documentation/filesystems/fscrypt.rst.
+ */
+static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
+{
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct fscrypt_remove_key_arg __user *uarg = _uarg;
+ struct fscrypt_remove_key_arg arg;
+ struct key *key;
+ struct fscrypt_master_key *mk;
+ u32 status_flags = 0;
+ int err;
+ bool dead;
+
+ if (copy_from_user(&arg, uarg, sizeof(arg)))
+ return -EFAULT;
+
+ if (!valid_key_spec(&arg.key_spec))
+ return -EINVAL;
+
+ if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved)))
+ return -EINVAL;
+
+ /*
+ * Only root can add and remove keys that are identified by an arbitrary
+ * descriptor rather than by a cryptographic hash.
+ */
+ if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
+ !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ /* Find the key being removed. */
+ key = fscrypt_find_master_key(sb, &arg.key_spec);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+ mk = key->payload.data[0];
+
+ down_write(&key->sem);
+
+ /* If relevant, remove current user's (or all users) claim to the key */
+ if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) {
+ if (all_users)
+ err = keyring_clear(mk->mk_users);
+ else
+ err = remove_master_key_user(mk);
+ if (err) {
+ up_write(&key->sem);
+ goto out_put_key;
+ }
+ if (mk->mk_users->keys.nr_leaves_on_tree != 0) {
+ /*
+ * Other users have still added the key too. We removed
+ * the current user's claim to the key, but we still
+ * can't remove the key itself.
+ */
+ status_flags |=
+ FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS;
+ err = 0;
+ up_write(&key->sem);
+ goto out_put_key;
+ }
+ }
+
+ /* No user claims remaining. Go ahead and wipe the secret. */
+ dead = false;
+ if (is_master_key_secret_present(&mk->mk_secret)) {
+ down_write(&mk->mk_secret_sem);
+ wipe_master_key_secret(&mk->mk_secret);
+ dead = refcount_dec_and_test(&mk->mk_refcount);
+ up_write(&mk->mk_secret_sem);
+ }
+ up_write(&key->sem);
+ if (dead) {
+ /*
+ * No inodes reference the key, and we wiped the secret, so the
+ * key object is free to be removed from the keyring.
+ */
+ key_invalidate(key);
+ err = 0;
+ } else {
+ /* Some inodes still reference this key; try to evict them. */
+ err = try_to_lock_encrypted_files(sb, mk);
+ if (err == -EBUSY) {
+ status_flags |=
+ FSCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY;
+ err = 0;
+ }
+ }
+ /*
+ * We return 0 if we successfully did something: removed a claim to the
+ * key, wiped the secret, or tried locking the files again. Users need
+ * to check the informational status flags if they care whether the key
+ * has been fully removed including all files locked.
+ */
+out_put_key:
+ key_put(key);
+ if (err == 0)
+ err = put_user(status_flags, &uarg->removal_status_flags);
+ return err;
+}
+
+int fscrypt_ioctl_remove_key(struct file *filp, void __user *uarg)
+{
+ return do_remove_key(filp, uarg, false);
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key);
+
+int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *uarg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ return do_remove_key(filp, uarg, true);
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users);
+
+/*
+ * Retrieve the status of an fscrypt master encryption key.
+ *
+ * We set ->status to indicate whether the key is absent, present, or
+ * incompletely removed. "Incompletely removed" means that the master key
+ * secret has been removed, but some files which had been unlocked with it are
+ * still in use. This field allows applications to easily determine the state
+ * of an encrypted directory without using a hack such as trying to open a
+ * regular file in it (which can confuse the "incompletely removed" state with
+ * absent or present).
+ *
+ * In addition, for v2 policy keys we allow applications to determine, via
+ * ->status_flags and ->user_count, whether the key has been added by the
+ * current user, by other users, or by both. Most applications should not need
+ * this, since ordinarily only one user should know a given key. However, if a
+ * secret key is shared by multiple users, applications may wish to add an
+ * already-present key to prevent other users from removing it. This ioctl can
+ * be used to check whether that really is the case before the work is done to
+ * add the key --- which might e.g. require prompting the user for a passphrase.
+ *
+ * For more details, see the "FS_IOC_GET_ENCRYPTION_KEY_STATUS" section of
+ * Documentation/filesystems/fscrypt.rst.
+ */
+int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
+{
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct fscrypt_get_key_status_arg arg;
+ struct key *key;
+ struct fscrypt_master_key *mk;
+ int err;
+
+ if (copy_from_user(&arg, uarg, sizeof(arg)))
+ return -EFAULT;
+
+ if (!valid_key_spec(&arg.key_spec))
+ return -EINVAL;
+
+ if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved)))
+ return -EINVAL;
+
+ arg.status_flags = 0;
+ arg.user_count = 0;
+ memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved));
+
+ key = fscrypt_find_master_key(sb, &arg.key_spec);
+ if (IS_ERR(key)) {
+ if (key != ERR_PTR(-ENOKEY))
+ return PTR_ERR(key);
+ arg.status = FSCRYPT_KEY_STATUS_ABSENT;
+ err = 0;
+ goto out;
+ }
+ mk = key->payload.data[0];
+ down_read(&key->sem);
+
+ if (!is_master_key_secret_present(&mk->mk_secret)) {
+ arg.status = FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED;
+ err = 0;
+ goto out_release_key;
+ }
+
+ arg.status = FSCRYPT_KEY_STATUS_PRESENT;
+ if (mk->mk_users) {
+ struct key *mk_user;
+
+ arg.user_count = mk->mk_users->keys.nr_leaves_on_tree;
+ mk_user = find_master_key_user(mk);
+ if (!IS_ERR(mk_user)) {
+ arg.status_flags |=
+ FSCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF;
+ key_put(mk_user);
+ } else if (mk_user != ERR_PTR(-ENOKEY)) {
+ err = PTR_ERR(mk_user);
+ goto out_release_key;
+ }
+ }
+ err = 0;
+out_release_key:
+ up_read(&key->sem);
+ key_put(key);
+out:
+ if (!err && copy_to_user(uarg, &arg, sizeof(arg)))
+ err = -EFAULT;
+ return err;
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_key_status);
+
+int __init fscrypt_init_keyring(void)
+{
+ int err;
+
+ err = register_key_type(&key_type_fscrypt);
+ if (err)
+ return err;
+
+ err = register_key_type(&key_type_fscrypt_user);
+ if (err)
+ goto err_unregister_fscrypt;
+
+ return 0;
+
+err_unregister_fscrypt:
+ unregister_key_type(&key_type_fscrypt);
+ return err;
+}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
new file mode 100644
index 000000000000..d71c2d6dd162
--- /dev/null
+++ b/fs/crypto/keysetup.c
@@ -0,0 +1,591 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Key setup facility for FS encryption support.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar.
+ * Heavily modified since then.
+ */
+
+#include <crypto/aes.h>
+#include <crypto/sha.h>
+#include <crypto/skcipher.h>
+#include <linux/key.h>
+
+#include "fscrypt_private.h"
+
+static struct crypto_shash *essiv_hash_tfm;
+
+static struct fscrypt_mode available_modes[] = {
+ [FSCRYPT_MODE_AES_256_XTS] = {
+ .friendly_name = "AES-256-XTS",
+ .cipher_str = "xts(aes)",
+ .keysize = 64,
+ .ivsize = 16,
+ },
+ [FSCRYPT_MODE_AES_256_CTS] = {
+ .friendly_name = "AES-256-CTS-CBC",
+ .cipher_str = "cts(cbc(aes))",
+ .keysize = 32,
+ .ivsize = 16,
+ },
+ [FSCRYPT_MODE_AES_128_CBC] = {
+ .friendly_name = "AES-128-CBC",
+ .cipher_str = "cbc(aes)",
+ .keysize = 16,
+ .ivsize = 16,
+ .needs_essiv = true,
+ },
+ [FSCRYPT_MODE_AES_128_CTS] = {
+ .friendly_name = "AES-128-CTS-CBC",
+ .cipher_str = "cts(cbc(aes))",
+ .keysize = 16,
+ .ivsize = 16,
+ },
+ [FSCRYPT_MODE_ADIANTUM] = {
+ .friendly_name = "Adiantum",
+ .cipher_str = "adiantum(xchacha12,aes)",
+ .keysize = 32,
+ .ivsize = 32,
+ },
+};
+
+static struct fscrypt_mode *
+select_encryption_mode(const union fscrypt_policy *policy,
+ const struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ return &available_modes[fscrypt_policy_contents_mode(policy)];
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ return &available_modes[fscrypt_policy_fnames_mode(policy)];
+
+ WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
+ inode->i_ino, (inode->i_mode & S_IFMT));
+ return ERR_PTR(-EINVAL);
+}
+
+/* Create a symmetric cipher object for the given encryption mode and key */
+struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode,
+ const u8 *raw_key,
+ const struct inode *inode)
+{
+ struct crypto_skcipher *tfm;
+ int err;
+
+ tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
+ if (IS_ERR(tfm)) {
+ if (PTR_ERR(tfm) == -ENOENT) {
+ fscrypt_warn(inode,
+ "Missing crypto API support for %s (API name: \"%s\")",
+ mode->friendly_name, mode->cipher_str);
+ return ERR_PTR(-ENOPKG);
+ }
+ fscrypt_err(inode, "Error allocating '%s' transform: %ld",
+ mode->cipher_str, PTR_ERR(tfm));
+ return tfm;
+ }
+ if (unlikely(!mode->logged_impl_name)) {
+ /*
+ * fscrypt performance can vary greatly depending on which
+ * crypto algorithm implementation is used. Help people debug
+ * performance problems by logging the ->cra_driver_name the
+ * first time a mode is used. Note that multiple threads can
+ * race here, but it doesn't really matter.
+ */
+ mode->logged_impl_name = true;
+ pr_info("fscrypt: %s using implementation \"%s\"\n",
+ mode->friendly_name,
+ crypto_skcipher_alg(tfm)->base.cra_driver_name);
+ }
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
+ err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
+ if (err)
+ goto err_free_tfm;
+
+ return tfm;
+
+err_free_tfm:
+ crypto_free_skcipher(tfm);
+ return ERR_PTR(err);
+}
+
+static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
+{
+ struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm);
+
+ /* init hash transform on demand */
+ if (unlikely(!tfm)) {
+ struct crypto_shash *prev_tfm;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ if (PTR_ERR(tfm) == -ENOENT) {
+ fscrypt_warn(NULL,
+ "Missing crypto API support for SHA-256");
+ return -ENOPKG;
+ }
+ fscrypt_err(NULL,
+ "Error allocating SHA-256 transform: %ld",
+ PTR_ERR(tfm));
+ return PTR_ERR(tfm);
+ }
+ prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm);
+ if (prev_tfm) {
+ crypto_free_shash(tfm);
+ tfm = prev_tfm;
+ }
+ }
+
+ {
+ SHASH_DESC_ON_STACK(desc, tfm);
+ desc->tfm = tfm;
+
+ return crypto_shash_digest(desc, key, keysize, salt);
+ }
+}
+
+static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
+ int keysize)
+{
+ int err;
+ struct crypto_cipher *essiv_tfm;
+ u8 salt[SHA256_DIGEST_SIZE];
+
+ if (WARN_ON(ci->ci_mode->ivsize != AES_BLOCK_SIZE))
+ return -EINVAL;
+
+ essiv_tfm = crypto_alloc_cipher("aes", 0, 0);
+ if (IS_ERR(essiv_tfm))
+ return PTR_ERR(essiv_tfm);
+
+ ci->ci_essiv_tfm = essiv_tfm;
+
+ err = derive_essiv_salt(raw_key, keysize, salt);
+ if (err)
+ goto out;
+
+ /*
+ * Using SHA256 to derive the salt/key will result in AES-256 being
+ * used for IV generation. File contents encryption will still use the
+ * configured keysize (AES-128) nevertheless.
+ */
+ err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt));
+ if (err)
+ goto out;
+
+out:
+ memzero_explicit(salt, sizeof(salt));
+ return err;
+}
+
+/* Given the per-file key, set up the file's crypto transform object(s) */
+int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key)
+{
+ struct fscrypt_mode *mode = ci->ci_mode;
+ struct crypto_skcipher *ctfm;
+ int err;
+
+ ctfm = fscrypt_allocate_skcipher(mode, derived_key, ci->ci_inode);
+ if (IS_ERR(ctfm))
+ return PTR_ERR(ctfm);
+
+ ci->ci_ctfm = ctfm;
+
+ if (mode->needs_essiv) {
+ err = init_essiv_generator(ci, derived_key, mode->keysize);
+ if (err) {
+ fscrypt_warn(ci->ci_inode,
+ "Error initializing ESSIV generator: %d",
+ err);
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int setup_per_mode_key(struct fscrypt_info *ci,
+ struct fscrypt_master_key *mk)
+{
+ struct fscrypt_mode *mode = ci->ci_mode;
+ u8 mode_num = mode - available_modes;
+ struct crypto_skcipher *tfm, *prev_tfm;
+ u8 mode_key[FSCRYPT_MAX_KEY_SIZE];
+ int err;
+
+ if (WARN_ON(mode_num >= ARRAY_SIZE(mk->mk_mode_keys)))
+ return -EINVAL;
+
+ /* pairs with cmpxchg() below */
+ tfm = READ_ONCE(mk->mk_mode_keys[mode_num]);
+ if (likely(tfm != NULL))
+ goto done;
+
+ BUILD_BUG_ON(sizeof(mode_num) != 1);
+ err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
+ HKDF_CONTEXT_PER_MODE_KEY,
+ &mode_num, sizeof(mode_num),
+ mode_key, mode->keysize);
+ if (err)
+ return err;
+ tfm = fscrypt_allocate_skcipher(mode, mode_key, ci->ci_inode);
+ memzero_explicit(mode_key, mode->keysize);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ /* pairs with READ_ONCE() above */
+ prev_tfm = cmpxchg(&mk->mk_mode_keys[mode_num], NULL, tfm);
+ if (prev_tfm != NULL) {
+ crypto_free_skcipher(tfm);
+ tfm = prev_tfm;
+ }
+done:
+ ci->ci_ctfm = tfm;
+ return 0;
+}
+
+static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
+ struct fscrypt_master_key *mk)
+{
+ u8 derived_key[FSCRYPT_MAX_KEY_SIZE];
+ int err;
+
+ if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
+ /*
+ * DIRECT_KEY: instead of deriving per-file keys, the per-file
+ * nonce will be included in all the IVs. But unlike v1
+ * policies, for v2 policies in this case we don't encrypt with
+ * the master key directly but rather derive a per-mode key.
+ * This ensures that the master key is consistently used only
+ * for HKDF, avoiding key reuse issues.
+ */
+ if (!fscrypt_mode_supports_direct_key(ci->ci_mode)) {
+ fscrypt_warn(ci->ci_inode,
+ "Direct key flag not allowed with %s",
+ ci->ci_mode->friendly_name);
+ return -EINVAL;
+ }
+ return setup_per_mode_key(ci, mk);
+ }
+
+ err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
+ HKDF_CONTEXT_PER_FILE_KEY,
+ ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE,
+ derived_key, ci->ci_mode->keysize);
+ if (err)
+ return err;
+
+ err = fscrypt_set_derived_key(ci, derived_key);
+ memzero_explicit(derived_key, ci->ci_mode->keysize);
+ return err;
+}
+
+/*
+ * Find the master key, then set up the inode's actual encryption key.
+ *
+ * If the master key is found in the filesystem-level keyring, then the
+ * corresponding 'struct key' is returned in *master_key_ret with
+ * ->mk_secret_sem read-locked. This is needed to ensure that only one task
+ * links the fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race
+ * to create an fscrypt_info for the same inode), and to synchronize the master
+ * key being removed with a new inode starting to use it.
+ */
+static int setup_file_encryption_key(struct fscrypt_info *ci,
+ struct key **master_key_ret)
+{
+ struct key *key;
+ struct fscrypt_master_key *mk = NULL;
+ struct fscrypt_key_specifier mk_spec;
+ int err;
+
+ switch (ci->ci_policy.version) {
+ case FSCRYPT_POLICY_V1:
+ mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
+ memcpy(mk_spec.u.descriptor,
+ ci->ci_policy.v1.master_key_descriptor,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
+ break;
+ case FSCRYPT_POLICY_V2:
+ mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+ memcpy(mk_spec.u.identifier,
+ ci->ci_policy.v2.master_key_identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE);
+ break;
+ default:
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
+ if (IS_ERR(key)) {
+ if (key != ERR_PTR(-ENOKEY) ||
+ ci->ci_policy.version != FSCRYPT_POLICY_V1)
+ return PTR_ERR(key);
+
+ /*
+ * As a legacy fallback for v1 policies, search for the key in
+ * the current task's subscribed keyrings too. Don't move this
+ * to before the search of ->s_master_keys, since users
+ * shouldn't be able to override filesystem-level keys.
+ */
+ return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci);
+ }
+
+ mk = key->payload.data[0];
+ down_read(&mk->mk_secret_sem);
+
+ /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */
+ if (!is_master_key_secret_present(&mk->mk_secret)) {
+ err = -ENOKEY;
+ goto out_release_key;
+ }
+
+ /*
+ * Require that the master key be at least as long as the derived key.
+ * Otherwise, the derived key cannot possibly contain as much entropy as
+ * that required by the encryption mode it will be used for. For v1
+ * policies it's also required for the KDF to work at all.
+ */
+ if (mk->mk_secret.size < ci->ci_mode->keysize) {
+ fscrypt_warn(NULL,
+ "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
+ master_key_spec_type(&mk_spec),
+ master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u,
+ mk->mk_secret.size, ci->ci_mode->keysize);
+ err = -ENOKEY;
+ goto out_release_key;
+ }
+
+ switch (ci->ci_policy.version) {
+ case FSCRYPT_POLICY_V1:
+ err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw);
+ break;
+ case FSCRYPT_POLICY_V2:
+ err = fscrypt_setup_v2_file_key(ci, mk);
+ break;
+ default:
+ WARN_ON(1);
+ err = -EINVAL;
+ break;
+ }
+ if (err)
+ goto out_release_key;
+
+ *master_key_ret = key;
+ return 0;
+
+out_release_key:
+ up_read(&mk->mk_secret_sem);
+ key_put(key);
+ return err;
+}
+
+static void put_crypt_info(struct fscrypt_info *ci)
+{
+ struct key *key;
+
+ if (!ci)
+ return;
+
+ if (ci->ci_direct_key) {
+ fscrypt_put_direct_key(ci->ci_direct_key);
+ } else if ((ci->ci_ctfm != NULL || ci->ci_essiv_tfm != NULL) &&
+ !fscrypt_is_direct_key_policy(&ci->ci_policy)) {
+ crypto_free_skcipher(ci->ci_ctfm);
+ crypto_free_cipher(ci->ci_essiv_tfm);
+ }
+
+ key = ci->ci_master_key;
+ if (key) {
+ struct fscrypt_master_key *mk = key->payload.data[0];
+
+ /*
+ * Remove this inode from the list of inodes that were unlocked
+ * with the master key.
+ *
+ * In addition, if we're removing the last inode from a key that
+ * already had its secret removed, invalidate the key so that it
+ * gets removed from ->s_master_keys.
+ */
+ spin_lock(&mk->mk_decrypted_inodes_lock);
+ list_del(&ci->ci_master_key_link);
+ spin_unlock(&mk->mk_decrypted_inodes_lock);
+ if (refcount_dec_and_test(&mk->mk_refcount))
+ key_invalidate(key);
+ key_put(key);
+ }
+ kmem_cache_free(fscrypt_info_cachep, ci);
+}
+
+int fscrypt_get_encryption_info(struct inode *inode)
+{
+ struct fscrypt_info *crypt_info;
+ union fscrypt_context ctx;
+ struct fscrypt_mode *mode;
+ struct key *master_key = NULL;
+ int res;
+
+ if (fscrypt_has_encryption_key(inode))
+ return 0;
+
+ res = fscrypt_initialize(inode->i_sb->s_cop->flags);
+ if (res)
+ return res;
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res < 0) {
+ if (!fscrypt_dummy_context_enabled(inode) ||
+ IS_ENCRYPTED(inode)) {
+ fscrypt_warn(inode,
+ "Error %d getting encryption context",
+ res);
+ return res;
+ }
+ /* Fake up a context for an unencrypted directory */
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.version = FSCRYPT_CONTEXT_V1;
+ ctx.v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ ctx.v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memset(ctx.v1.master_key_descriptor, 0x42,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
+ res = sizeof(ctx.v1);
+ }
+
+ crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS);
+ if (!crypt_info)
+ return -ENOMEM;
+
+ crypt_info->ci_inode = inode;
+
+ res = fscrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res);
+ if (res) {
+ fscrypt_warn(inode,
+ "Unrecognized or corrupt encryption context");
+ goto out;
+ }
+
+ switch (ctx.version) {
+ case FSCRYPT_CONTEXT_V1:
+ memcpy(crypt_info->ci_nonce, ctx.v1.nonce,
+ FS_KEY_DERIVATION_NONCE_SIZE);
+ break;
+ case FSCRYPT_CONTEXT_V2:
+ memcpy(crypt_info->ci_nonce, ctx.v2.nonce,
+ FS_KEY_DERIVATION_NONCE_SIZE);
+ break;
+ default:
+ WARN_ON(1);
+ res = -EINVAL;
+ goto out;
+ }
+
+ if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) {
+ res = -EINVAL;
+ goto out;
+ }
+
+ mode = select_encryption_mode(&crypt_info->ci_policy, inode);
+ if (IS_ERR(mode)) {
+ res = PTR_ERR(mode);
+ goto out;
+ }
+ WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
+ crypt_info->ci_mode = mode;
+
+ res = setup_file_encryption_key(crypt_info, &master_key);
+ if (res)
+ goto out;
+
+ if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
+ if (master_key) {
+ struct fscrypt_master_key *mk =
+ master_key->payload.data[0];
+
+ refcount_inc(&mk->mk_refcount);
+ crypt_info->ci_master_key = key_get(master_key);
+ spin_lock(&mk->mk_decrypted_inodes_lock);
+ list_add(&crypt_info->ci_master_key_link,
+ &mk->mk_decrypted_inodes);
+ spin_unlock(&mk->mk_decrypted_inodes_lock);
+ }
+ crypt_info = NULL;
+ }
+ res = 0;
+out:
+ if (master_key) {
+ struct fscrypt_master_key *mk = master_key->payload.data[0];
+
+ up_read(&mk->mk_secret_sem);
+ key_put(master_key);
+ }
+ if (res == -ENOKEY)
+ res = 0;
+ put_crypt_info(crypt_info);
+ return res;
+}
+EXPORT_SYMBOL(fscrypt_get_encryption_info);
+
+/**
+ * fscrypt_put_encryption_info - free most of an inode's fscrypt data
+ *
+ * Free the inode's fscrypt_info. Filesystems must call this when the inode is
+ * being evicted. An RCU grace period need not have elapsed yet.
+ */
+void fscrypt_put_encryption_info(struct inode *inode)
+{
+ put_crypt_info(inode->i_crypt_info);
+ inode->i_crypt_info = NULL;
+}
+EXPORT_SYMBOL(fscrypt_put_encryption_info);
+
+/**
+ * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay
+ *
+ * Free the inode's cached decrypted symlink target, if any. Filesystems must
+ * call this after an RCU grace period, just before they free the inode.
+ */
+void fscrypt_free_inode(struct inode *inode)
+{
+ if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) {
+ kfree(inode->i_link);
+ inode->i_link = NULL;
+ }
+}
+EXPORT_SYMBOL(fscrypt_free_inode);
+
+/**
+ * fscrypt_drop_inode - check whether the inode's master key has been removed
+ *
+ * Filesystems supporting fscrypt must call this from their ->drop_inode()
+ * method so that encrypted inodes are evicted as soon as they're no longer in
+ * use and their master key has been removed.
+ *
+ * Return: 1 if fscrypt wants the inode to be evicted now, otherwise 0
+ */
+int fscrypt_drop_inode(struct inode *inode)
+{
+ const struct fscrypt_info *ci = READ_ONCE(inode->i_crypt_info);
+ const struct fscrypt_master_key *mk;
+
+ /*
+ * If ci is NULL, then the inode doesn't have an encryption key set up
+ * so it's irrelevant. If ci_master_key is NULL, then the master key
+ * was provided via the legacy mechanism of the process-subscribed
+ * keyrings, so we don't know whether it's been removed or not.
+ */
+ if (!ci || !ci->ci_master_key)
+ return 0;
+ mk = ci->ci_master_key->payload.data[0];
+
+ /*
+ * Note: since we aren't holding ->mk_secret_sem, the result here can
+ * immediately become outdated. But there's no correctness problem with
+ * unnecessarily evicting. Nor is there a correctness problem with not
+ * evicting while iput() is racing with the key being removed, since
+ * then the thread removing the key will either evict the inode itself
+ * or will correctly detect that it wasn't evicted due to the race.
+ */
+ return !is_master_key_secret_present(&mk->mk_secret);
+}
+EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
new file mode 100644
index 000000000000..ad1a36c370c3
--- /dev/null
+++ b/fs/crypto/keysetup_v1.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Key setup for v1 encryption policies
+ *
+ * Copyright 2015, 2019 Google LLC
+ */
+
+/*
+ * This file implements compatibility functions for the original encryption
+ * policy version ("v1"), including:
+ *
+ * - Deriving per-file keys using the AES-128-ECB based KDF
+ * (rather than the new method of using HKDF-SHA512)
+ *
+ * - Retrieving fscrypt master keys from process-subscribed keyrings
+ * (rather than the new method of using a filesystem-level keyring)
+ *
+ * - Handling policies with the DIRECT_KEY flag set using a master key table
+ * (rather than the new method of implementing DIRECT_KEY with per-mode keys
+ * managed alongside the master keys in the filesystem-level keyring)
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/skcipher.h>
+#include <keys/user-type.h>
+#include <linux/hashtable.h>
+#include <linux/scatterlist.h>
+
+#include "fscrypt_private.h"
+
+/* Table of keys referenced by DIRECT_KEY policies */
+static DEFINE_HASHTABLE(fscrypt_direct_keys, 6); /* 6 bits = 64 buckets */
+static DEFINE_SPINLOCK(fscrypt_direct_keys_lock);
+
+/*
+ * v1 key derivation function. This generates the derived key by encrypting the
+ * master key with AES-128-ECB using the nonce as the AES key. This provides a
+ * unique derived key with sufficient entropy for each inode. However, it's
+ * nonstandard, non-extensible, doesn't evenly distribute the entropy from the
+ * master key, and is trivially reversible: an attacker who compromises a
+ * derived key can "decrypt" it to get back to the master key, then derive any
+ * other key. For all new code, use HKDF instead.
+ *
+ * The master key must be at least as long as the derived key. If the master
+ * key is longer, then only the first 'derived_keysize' bytes are used.
+ */
+static int derive_key_aes(const u8 *master_key,
+ const u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE],
+ u8 *derived_key, unsigned int derived_keysize)
+{
+ int res = 0;
+ struct skcipher_request *req = NULL;
+ DECLARE_CRYPTO_WAIT(wait);
+ struct scatterlist src_sg, dst_sg;
+ struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
+
+ if (IS_ERR(tfm)) {
+ res = PTR_ERR(tfm);
+ tfm = NULL;
+ goto out;
+ }
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ res = -ENOMEM;
+ goto out;
+ }
+ skcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ crypto_req_done, &wait);
+ res = crypto_skcipher_setkey(tfm, nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ if (res < 0)
+ goto out;
+
+ sg_init_one(&src_sg, master_key, derived_keysize);
+ sg_init_one(&dst_sg, derived_key, derived_keysize);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
+ NULL);
+ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+out:
+ skcipher_request_free(req);
+ crypto_free_skcipher(tfm);
+ return res;
+}
+
+/*
+ * Search the current task's subscribed keyrings for a "logon" key with
+ * description prefix:descriptor, and if found acquire a read lock on it and
+ * return a pointer to its validated payload in *payload_ret.
+ */
+static struct key *
+find_and_lock_process_key(const char *prefix,
+ const u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE],
+ unsigned int min_keysize,
+ const struct fscrypt_key **payload_ret)
+{
+ char *description;
+ struct key *key;
+ const struct user_key_payload *ukp;
+ const struct fscrypt_key *payload;
+
+ description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor);
+ if (!description)
+ return ERR_PTR(-ENOMEM);
+
+ key = request_key(&key_type_logon, description, NULL);
+ kfree(description);
+ if (IS_ERR(key))
+ return key;
+
+ down_read(&key->sem);
+ ukp = user_key_payload_locked(key);
+
+ if (!ukp) /* was the key revoked before we acquired its semaphore? */
+ goto invalid;
+
+ payload = (const struct fscrypt_key *)ukp->data;
+
+ if (ukp->datalen != sizeof(struct fscrypt_key) ||
+ payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) {
+ fscrypt_warn(NULL,
+ "key with description '%s' has invalid payload",
+ key->description);
+ goto invalid;
+ }
+
+ if (payload->size < min_keysize) {
+ fscrypt_warn(NULL,
+ "key with description '%s' is too short (got %u bytes, need %u+ bytes)",
+ key->description, payload->size, min_keysize);
+ goto invalid;
+ }
+
+ *payload_ret = payload;
+ return key;
+
+invalid:
+ up_read(&key->sem);
+ key_put(key);
+ return ERR_PTR(-ENOKEY);
+}
+
+/* Master key referenced by DIRECT_KEY policy */
+struct fscrypt_direct_key {
+ struct hlist_node dk_node;
+ refcount_t dk_refcount;
+ const struct fscrypt_mode *dk_mode;
+ struct crypto_skcipher *dk_ctfm;
+ u8 dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
+ u8 dk_raw[FSCRYPT_MAX_KEY_SIZE];
+};
+
+static void free_direct_key(struct fscrypt_direct_key *dk)
+{
+ if (dk) {
+ crypto_free_skcipher(dk->dk_ctfm);
+ kzfree(dk);
+ }
+}
+
+void fscrypt_put_direct_key(struct fscrypt_direct_key *dk)
+{
+ if (!refcount_dec_and_lock(&dk->dk_refcount, &fscrypt_direct_keys_lock))
+ return;
+ hash_del(&dk->dk_node);
+ spin_unlock(&fscrypt_direct_keys_lock);
+
+ free_direct_key(dk);
+}
+
+/*
+ * Find/insert the given key into the fscrypt_direct_keys table. If found, it
+ * is returned with elevated refcount, and 'to_insert' is freed if non-NULL. If
+ * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise
+ * NULL is returned.
+ */
+static struct fscrypt_direct_key *
+find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
+ const u8 *raw_key, const struct fscrypt_info *ci)
+{
+ unsigned long hash_key;
+ struct fscrypt_direct_key *dk;
+
+ /*
+ * Careful: to avoid potentially leaking secret key bytes via timing
+ * information, we must key the hash table by descriptor rather than by
+ * raw key, and use crypto_memneq() when comparing raw keys.
+ */
+
+ BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE);
+ memcpy(&hash_key, ci->ci_policy.v1.master_key_descriptor,
+ sizeof(hash_key));
+
+ spin_lock(&fscrypt_direct_keys_lock);
+ hash_for_each_possible(fscrypt_direct_keys, dk, dk_node, hash_key) {
+ if (memcmp(ci->ci_policy.v1.master_key_descriptor,
+ dk->dk_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0)
+ continue;
+ if (ci->ci_mode != dk->dk_mode)
+ continue;
+ if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize))
+ continue;
+ /* using existing tfm with same (descriptor, mode, raw_key) */
+ refcount_inc(&dk->dk_refcount);
+ spin_unlock(&fscrypt_direct_keys_lock);
+ free_direct_key(to_insert);
+ return dk;
+ }
+ if (to_insert)
+ hash_add(fscrypt_direct_keys, &to_insert->dk_node, hash_key);
+ spin_unlock(&fscrypt_direct_keys_lock);
+ return to_insert;
+}
+
+/* Prepare to encrypt directly using the master key in the given mode */
+static struct fscrypt_direct_key *
+fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
+{
+ struct fscrypt_direct_key *dk;
+ int err;
+
+ /* Is there already a tfm for this key? */
+ dk = find_or_insert_direct_key(NULL, raw_key, ci);
+ if (dk)
+ return dk;
+
+ /* Nope, allocate one. */
+ dk = kzalloc(sizeof(*dk), GFP_NOFS);
+ if (!dk)
+ return ERR_PTR(-ENOMEM);
+ refcount_set(&dk->dk_refcount, 1);
+ dk->dk_mode = ci->ci_mode;
+ dk->dk_ctfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key,
+ ci->ci_inode);
+ if (IS_ERR(dk->dk_ctfm)) {
+ err = PTR_ERR(dk->dk_ctfm);
+ dk->dk_ctfm = NULL;
+ goto err_free_dk;
+ }
+ memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
+ memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize);
+
+ return find_or_insert_direct_key(dk, raw_key, ci);
+
+err_free_dk:
+ free_direct_key(dk);
+ return ERR_PTR(err);
+}
+
+/* v1 policy, DIRECT_KEY: use the master key directly */
+static int setup_v1_file_key_direct(struct fscrypt_info *ci,
+ const u8 *raw_master_key)
+{
+ const struct fscrypt_mode *mode = ci->ci_mode;
+ struct fscrypt_direct_key *dk;
+
+ if (!fscrypt_mode_supports_direct_key(mode)) {
+ fscrypt_warn(ci->ci_inode,
+ "Direct key mode not allowed with %s",
+ mode->friendly_name);
+ return -EINVAL;
+ }
+
+ if (ci->ci_policy.v1.contents_encryption_mode !=
+ ci->ci_policy.v1.filenames_encryption_mode) {
+ fscrypt_warn(ci->ci_inode,
+ "Direct key mode not allowed with different contents and filenames modes");
+ return -EINVAL;
+ }
+
+ /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
+ if (WARN_ON(mode->needs_essiv))
+ return -EINVAL;
+
+ dk = fscrypt_get_direct_key(ci, raw_master_key);
+ if (IS_ERR(dk))
+ return PTR_ERR(dk);
+ ci->ci_direct_key = dk;
+ ci->ci_ctfm = dk->dk_ctfm;
+ return 0;
+}
+
+/* v1 policy, !DIRECT_KEY: derive the file's encryption key */
+static int setup_v1_file_key_derived(struct fscrypt_info *ci,
+ const u8 *raw_master_key)
+{
+ u8 *derived_key;
+ int err;
+
+ /*
+ * This cannot be a stack buffer because it will be passed to the
+ * scatterlist crypto API during derive_key_aes().
+ */
+ derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS);
+ if (!derived_key)
+ return -ENOMEM;
+
+ err = derive_key_aes(raw_master_key, ci->ci_nonce,
+ derived_key, ci->ci_mode->keysize);
+ if (err)
+ goto out;
+
+ err = fscrypt_set_derived_key(ci, derived_key);
+out:
+ kzfree(derived_key);
+ return err;
+}
+
+int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
+{
+ if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
+ return setup_v1_file_key_direct(ci, raw_master_key);
+ else
+ return setup_v1_file_key_derived(ci, raw_master_key);
+}
+
+int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
+{
+ struct key *key;
+ const struct fscrypt_key *payload;
+ int err;
+
+ key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
+ ci->ci_policy.v1.master_key_descriptor,
+ ci->ci_mode->keysize, &payload);
+ if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) {
+ key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix,
+ ci->ci_policy.v1.master_key_descriptor,
+ ci->ci_mode->keysize, &payload);
+ }
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ err = fscrypt_setup_v1_file_key(ci, payload->raw);
+ up_read(&key->sem);
+ key_put(key);
+ return err;
+}
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 4941fe8471ce..4072ba644595 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -5,8 +5,9 @@
* Copyright (C) 2015, Google, Inc.
* Copyright (C) 2015, Motorola Mobility.
*
- * Written by Michael Halcrow, 2015.
+ * Originally written by Michael Halcrow, 2015.
* Modified by Jaegeuk Kim, 2015.
+ * Modified by Eric Biggers, 2019 for v2 policy support.
*/
#include <linux/random.h>
@@ -14,70 +15,303 @@
#include <linux/mount.h>
#include "fscrypt_private.h"
-/*
- * check whether an encryption policy is consistent with an encryption context
+/**
+ * fscrypt_policies_equal - check whether two encryption policies are the same
+ *
+ * Return: %true if equal, else %false
+ */
+bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
+ const union fscrypt_policy *policy2)
+{
+ if (policy1->version != policy2->version)
+ return false;
+
+ return !memcmp(policy1, policy2, fscrypt_policy_size(policy1));
+}
+
+/**
+ * fscrypt_supported_policy - check whether an encryption policy is supported
+ *
+ * Given an encryption policy, check whether all its encryption modes and other
+ * settings are supported by this kernel. (But we don't currently don't check
+ * for crypto API support here, so attempting to use an algorithm not configured
+ * into the crypto API will still fail later.)
+ *
+ * Return: %true if supported, else %false
+ */
+bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
+ const struct inode *inode)
+{
+ switch (policy_u->version) {
+ case FSCRYPT_POLICY_V1: {
+ const struct fscrypt_policy_v1 *policy = &policy_u->v1;
+
+ if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+ policy->filenames_encryption_mode)) {
+ fscrypt_warn(inode,
+ "Unsupported encryption modes (contents %d, filenames %d)",
+ policy->contents_encryption_mode,
+ policy->filenames_encryption_mode);
+ return false;
+ }
+
+ if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
+ fscrypt_warn(inode,
+ "Unsupported encryption flags (0x%02x)",
+ policy->flags);
+ return false;
+ }
+
+ return true;
+ }
+ case FSCRYPT_POLICY_V2: {
+ const struct fscrypt_policy_v2 *policy = &policy_u->v2;
+
+ if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+ policy->filenames_encryption_mode)) {
+ fscrypt_warn(inode,
+ "Unsupported encryption modes (contents %d, filenames %d)",
+ policy->contents_encryption_mode,
+ policy->filenames_encryption_mode);
+ return false;
+ }
+
+ if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
+ fscrypt_warn(inode,
+ "Unsupported encryption flags (0x%02x)",
+ policy->flags);
+ return false;
+ }
+
+ if (memchr_inv(policy->__reserved, 0,
+ sizeof(policy->__reserved))) {
+ fscrypt_warn(inode,
+ "Reserved bits set in encryption policy");
+ return false;
+ }
+
+ return true;
+ }
+ }
+ return false;
+}
+
+/**
+ * fscrypt_new_context_from_policy - create a new fscrypt_context from a policy
+ *
+ * Create an fscrypt_context for an inode that is being assigned the given
+ * encryption policy. A new nonce is randomly generated.
+ *
+ * Return: the size of the new context in bytes.
*/
-static bool is_encryption_context_consistent_with_policy(
- const struct fscrypt_context *ctx,
- const struct fscrypt_policy *policy)
+static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
+ const union fscrypt_policy *policy_u)
{
- return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE) == 0 &&
- (ctx->flags == policy->flags) &&
- (ctx->contents_encryption_mode ==
- policy->contents_encryption_mode) &&
- (ctx->filenames_encryption_mode ==
- policy->filenames_encryption_mode);
+ memset(ctx_u, 0, sizeof(*ctx_u));
+
+ switch (policy_u->version) {
+ case FSCRYPT_POLICY_V1: {
+ const struct fscrypt_policy_v1 *policy = &policy_u->v1;
+ struct fscrypt_context_v1 *ctx = &ctx_u->v1;
+
+ ctx->version = FSCRYPT_CONTEXT_V1;
+ ctx->contents_encryption_mode =
+ policy->contents_encryption_mode;
+ ctx->filenames_encryption_mode =
+ policy->filenames_encryption_mode;
+ ctx->flags = policy->flags;
+ memcpy(ctx->master_key_descriptor,
+ policy->master_key_descriptor,
+ sizeof(ctx->master_key_descriptor));
+ get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+ return sizeof(*ctx);
+ }
+ case FSCRYPT_POLICY_V2: {
+ const struct fscrypt_policy_v2 *policy = &policy_u->v2;
+ struct fscrypt_context_v2 *ctx = &ctx_u->v2;
+
+ ctx->version = FSCRYPT_CONTEXT_V2;
+ ctx->contents_encryption_mode =
+ policy->contents_encryption_mode;
+ ctx->filenames_encryption_mode =
+ policy->filenames_encryption_mode;
+ ctx->flags = policy->flags;
+ memcpy(ctx->master_key_identifier,
+ policy->master_key_identifier,
+ sizeof(ctx->master_key_identifier));
+ get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+ return sizeof(*ctx);
+ }
+ }
+ BUG();
}
-static int create_encryption_context_from_policy(struct inode *inode,
- const struct fscrypt_policy *policy)
+/**
+ * fscrypt_policy_from_context - convert an fscrypt_context to an fscrypt_policy
+ *
+ * Given an fscrypt_context, build the corresponding fscrypt_policy.
+ *
+ * Return: 0 on success, or -EINVAL if the fscrypt_context has an unrecognized
+ * version number or size.
+ *
+ * This does *not* validate the settings within the policy itself, e.g. the
+ * modes, flags, and reserved bits. Use fscrypt_supported_policy() for that.
+ */
+int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
+ const union fscrypt_context *ctx_u,
+ int ctx_size)
{
- struct fscrypt_context ctx;
+ memset(policy_u, 0, sizeof(*policy_u));
+
+ if (ctx_size <= 0 || ctx_size != fscrypt_context_size(ctx_u))
+ return -EINVAL;
+
+ switch (ctx_u->version) {
+ case FSCRYPT_CONTEXT_V1: {
+ const struct fscrypt_context_v1 *ctx = &ctx_u->v1;
+ struct fscrypt_policy_v1 *policy = &policy_u->v1;
+
+ policy->version = FSCRYPT_POLICY_V1;
+ policy->contents_encryption_mode =
+ ctx->contents_encryption_mode;
+ policy->filenames_encryption_mode =
+ ctx->filenames_encryption_mode;
+ policy->flags = ctx->flags;
+ memcpy(policy->master_key_descriptor,
+ ctx->master_key_descriptor,
+ sizeof(policy->master_key_descriptor));
+ return 0;
+ }
+ case FSCRYPT_CONTEXT_V2: {
+ const struct fscrypt_context_v2 *ctx = &ctx_u->v2;
+ struct fscrypt_policy_v2 *policy = &policy_u->v2;
+
+ policy->version = FSCRYPT_POLICY_V2;
+ policy->contents_encryption_mode =
+ ctx->contents_encryption_mode;
+ policy->filenames_encryption_mode =
+ ctx->filenames_encryption_mode;
+ policy->flags = ctx->flags;
+ memcpy(policy->__reserved, ctx->__reserved,
+ sizeof(policy->__reserved));
+ memcpy(policy->master_key_identifier,
+ ctx->master_key_identifier,
+ sizeof(policy->master_key_identifier));
+ return 0;
+ }
+ }
+ /* unreachable */
+ return -EINVAL;
+}
+
+/* Retrieve an inode's encryption policy */
+static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
+{
+ const struct fscrypt_info *ci;
+ union fscrypt_context ctx;
+ int ret;
+
+ ci = READ_ONCE(inode->i_crypt_info);
+ if (ci) {
+ /* key available, use the cached policy */
+ *policy = ci->ci_policy;
+ return 0;
+ }
+
+ if (!IS_ENCRYPTED(inode))
+ return -ENODATA;
- ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
- memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE);
+ ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (ret < 0)
+ return (ret == -ERANGE) ? -EINVAL : ret;
- if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
- policy->filenames_encryption_mode))
+ return fscrypt_policy_from_context(policy, &ctx, ret);
+}
+
+static int set_encryption_policy(struct inode *inode,
+ const union fscrypt_policy *policy)
+{
+ union fscrypt_context ctx;
+ int ctxsize;
+ int err;
+
+ if (!fscrypt_supported_policy(policy, inode))
return -EINVAL;
- if (policy->flags & ~FS_POLICY_FLAGS_VALID)
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ /*
+ * The original encryption policy version provided no way of
+ * verifying that the correct master key was supplied, which was
+ * insecure in scenarios where multiple users have access to the
+ * same encrypted files (even just read-only access). The new
+ * encryption policy version fixes this and also implies use of
+ * an improved key derivation function and allows non-root users
+ * to securely remove keys. So as long as compatibility with
+ * old kernels isn't required, it is recommended to use the new
+ * policy version for all new encrypted directories.
+ */
+ pr_warn_once("%s (pid %d) is setting deprecated v1 encryption policy; recommend upgrading to v2.\n",
+ current->comm, current->pid);
+ break;
+ case FSCRYPT_POLICY_V2:
+ err = fscrypt_verify_key_added(inode->i_sb,
+ policy->v2.master_key_identifier);
+ if (err)
+ return err;
+ break;
+ default:
+ WARN_ON(1);
return -EINVAL;
+ }
- ctx.contents_encryption_mode = policy->contents_encryption_mode;
- ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
- ctx.flags = policy->flags;
- BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE);
- get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ ctxsize = fscrypt_new_context_from_policy(&ctx, policy);
- return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
+ return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL);
}
int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
{
- struct fscrypt_policy policy;
+ union fscrypt_policy policy;
+ union fscrypt_policy existing_policy;
struct inode *inode = file_inode(filp);
+ u8 version;
+ int size;
int ret;
- struct fscrypt_context ctx;
- if (copy_from_user(&policy, arg, sizeof(policy)))
+ if (get_user(policy.version, (const u8 __user *)arg))
return -EFAULT;
+ size = fscrypt_policy_size(&policy);
+ if (size <= 0)
+ return -EINVAL;
+
+ /*
+ * We should just copy the remaining 'size - 1' bytes here, but a
+ * bizarre bug in gcc 7 and earlier (fixed by gcc r255731) causes gcc to
+ * think that size can be 0 here (despite the check above!) *and* that
+ * it's a compile-time constant. Thus it would think copy_from_user()
+ * is passed compile-time constant ULONG_MAX, causing the compile-time
+ * buffer overflow check to fail, breaking the build. This only occurred
+ * when building an i386 kernel with -Os and branch profiling enabled.
+ *
+ * Work around it by just copying the first byte again...
+ */
+ version = policy.version;
+ if (copy_from_user(&policy, arg, size))
+ return -EFAULT;
+ policy.version = version;
+
if (!inode_owner_or_capable(inode))
return -EACCES;
- if (policy.version != 0)
- return -EINVAL;
-
ret = mnt_want_write_file(filp);
if (ret)
return ret;
inode_lock(inode);
- ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ ret = fscrypt_get_policy(inode, &existing_policy);
if (ret == -ENODATA) {
if (!S_ISDIR(inode->i_mode))
ret = -ENOTDIR;
@@ -86,14 +320,10 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
else if (!inode->i_sb->s_cop->empty_dir(inode))
ret = -ENOTEMPTY;
else
- ret = create_encryption_context_from_policy(inode,
- &policy);
- } else if (ret == sizeof(ctx) &&
- is_encryption_context_consistent_with_policy(&ctx,
- &policy)) {
- /* The file already uses the same encryption policy. */
- ret = 0;
- } else if (ret >= 0 || ret == -ERANGE) {
+ ret = set_encryption_policy(inode, &policy);
+ } else if (ret == -EINVAL ||
+ (ret == 0 && !fscrypt_policies_equal(&policy,
+ &existing_policy))) {
/* The file already uses a different encryption policy. */
ret = -EEXIST;
}
@@ -105,37 +335,57 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
}
EXPORT_SYMBOL(fscrypt_ioctl_set_policy);
+/* Original ioctl version; can only get the original policy version */
int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
{
- struct inode *inode = file_inode(filp);
- struct fscrypt_context ctx;
- struct fscrypt_policy policy;
- int res;
+ union fscrypt_policy policy;
+ int err;
- if (!IS_ENCRYPTED(inode))
- return -ENODATA;
+ err = fscrypt_get_policy(file_inode(filp), &policy);
+ if (err)
+ return err;
- res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
- if (res < 0 && res != -ERANGE)
- return res;
- if (res != sizeof(ctx))
- return -EINVAL;
- if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
+ if (policy.version != FSCRYPT_POLICY_V1)
return -EINVAL;
- policy.version = 0;
- policy.contents_encryption_mode = ctx.contents_encryption_mode;
- policy.filenames_encryption_mode = ctx.filenames_encryption_mode;
- policy.flags = ctx.flags;
- memcpy(policy.master_key_descriptor, ctx.master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE);
-
- if (copy_to_user(arg, &policy, sizeof(policy)))
+ if (copy_to_user(arg, &policy, sizeof(policy.v1)))
return -EFAULT;
return 0;
}
EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
+/* Extended ioctl version; can get policies of any version */
+int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg)
+{
+ struct fscrypt_get_policy_ex_arg arg;
+ union fscrypt_policy *policy = (union fscrypt_policy *)&arg.policy;
+ size_t policy_size;
+ int err;
+
+ /* arg is policy_size, then policy */
+ BUILD_BUG_ON(offsetof(typeof(arg), policy_size) != 0);
+ BUILD_BUG_ON(offsetofend(typeof(arg), policy_size) !=
+ offsetof(typeof(arg), policy));
+ BUILD_BUG_ON(sizeof(arg.policy) != sizeof(*policy));
+
+ err = fscrypt_get_policy(file_inode(filp), policy);
+ if (err)
+ return err;
+ policy_size = fscrypt_policy_size(policy);
+
+ if (copy_from_user(&arg, uarg, sizeof(arg.policy_size)))
+ return -EFAULT;
+
+ if (policy_size > arg.policy_size)
+ return -EOVERFLOW;
+ arg.policy_size = policy_size;
+
+ if (copy_to_user(uarg, &arg, sizeof(arg.policy_size) + policy_size))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_policy_ex);
+
/**
* fscrypt_has_permitted_context() - is a file's encryption policy permitted
* within its directory?
@@ -157,10 +407,8 @@ EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
*/
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
- const struct fscrypt_operations *cops = parent->i_sb->s_cop;
- const struct fscrypt_info *parent_ci, *child_ci;
- struct fscrypt_context parent_ctx, child_ctx;
- int res;
+ union fscrypt_policy parent_policy, child_policy;
+ int err;
/* No restrictions on file types which are never encrypted */
if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
@@ -190,41 +438,22 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
* In any case, if an unexpected error occurs, fall back to "forbidden".
*/
- res = fscrypt_get_encryption_info(parent);
- if (res)
+ err = fscrypt_get_encryption_info(parent);
+ if (err)
return 0;
- res = fscrypt_get_encryption_info(child);
- if (res)
+ err = fscrypt_get_encryption_info(child);
+ if (err)
return 0;
- parent_ci = READ_ONCE(parent->i_crypt_info);
- child_ci = READ_ONCE(child->i_crypt_info);
-
- if (parent_ci && child_ci) {
- return memcmp(parent_ci->ci_master_key_descriptor,
- child_ci->ci_master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE) == 0 &&
- (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
- (parent_ci->ci_filename_mode ==
- child_ci->ci_filename_mode) &&
- (parent_ci->ci_flags == child_ci->ci_flags);
- }
- res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx));
- if (res != sizeof(parent_ctx))
+ err = fscrypt_get_policy(parent, &parent_policy);
+ if (err)
return 0;
- res = cops->get_context(child, &child_ctx, sizeof(child_ctx));
- if (res != sizeof(child_ctx))
+ err = fscrypt_get_policy(child, &child_policy);
+ if (err)
return 0;
- return memcmp(parent_ctx.master_key_descriptor,
- child_ctx.master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE) == 0 &&
- (parent_ctx.contents_encryption_mode ==
- child_ctx.contents_encryption_mode) &&
- (parent_ctx.filenames_encryption_mode ==
- child_ctx.filenames_encryption_mode) &&
- (parent_ctx.flags == child_ctx.flags);
+ return fscrypt_policies_equal(&parent_policy, &child_policy);
}
EXPORT_SYMBOL(fscrypt_has_permitted_context);
@@ -240,7 +469,8 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context);
int fscrypt_inherit_context(struct inode *parent, struct inode *child,
void *fs_data, bool preload)
{
- struct fscrypt_context ctx;
+ union fscrypt_context ctx;
+ int ctxsize;
struct fscrypt_info *ci;
int res;
@@ -252,16 +482,10 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
if (ci == NULL)
return -ENOKEY;
- ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
- ctx.contents_encryption_mode = ci->ci_data_mode;
- ctx.filenames_encryption_mode = ci->ci_filename_mode;
- ctx.flags = ci->ci_flags;
- memcpy(ctx.master_key_descriptor, ci->ci_master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE);
- get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ ctxsize = fscrypt_new_context_from_policy(&ctx, &ci->ci_policy);
+
BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
- res = parent->i_sb->s_cop->set_context(child, &ctx,
- sizeof(ctx), fs_data);
+ res = parent->i_sb->s_cop->set_context(child, &ctx, ctxsize, fs_data);
if (res)
return res;
return preload ? fscrypt_get_encryption_info(child): 0;
diff --git a/fs/d_path.c b/fs/d_path.c
index a7d0a96b35ce..0f1fc1743302 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -116,8 +116,10 @@ restart:
vfsmnt = &mnt->mnt;
continue;
}
- if (!error)
- error = is_mounted(vfsmnt) ? 1 : 2;
+ if (is_mounted(vfsmnt) && !is_anon_ns(mnt->mnt_ns))
+ error = 1; // absolute root
+ else
+ error = 2; // detached or not attached yet
break;
}
parent = dentry->d_parent;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index beeadca23b05..42e5a766d33c 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -622,7 +622,7 @@ void devpts_pty_kill(struct dentry *dentry)
dentry->d_fsdata = NULL;
drop_nlink(dentry->d_inode);
fsnotify_unlink(d_inode(dentry->d_parent), dentry);
- d_delete(dentry);
+ d_drop(dentry);
dput(dentry); /* d_alloc_name() in devpts_pty_new() */
}
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
new file mode 100644
index 000000000000..9d634d3a1845
--- /dev/null
+++ b/fs/erofs/Kconfig
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config EROFS_FS
+ tristate "EROFS filesystem support"
+ depends on BLOCK
+ help
+ EROFS (Enhanced Read-Only File System) is a lightweight
+ read-only file system with modern designs (eg. page-sized
+ blocks, inline xattrs/data, etc.) for scenarios which need
+ high-performance read-only requirements, e.g. Android OS
+ for mobile phones and LIVECDs.
+
+ It also provides fixed-sized output compression support,
+ which improves storage density, keeps relatively higher
+ compression ratios, which is more useful to achieve high
+ performance for embedded devices with limited memory.
+
+ If unsure, say N.
+
+config EROFS_FS_DEBUG
+ bool "EROFS debugging feature"
+ depends on EROFS_FS
+ help
+ Print debugging messages and enable more BUG_ONs which check
+ filesystem consistency and find potential issues aggressively,
+ which can be used for Android eng build, for example.
+
+ For daily use, say N.
+
+config EROFS_FS_XATTR
+ bool "EROFS extended attributes"
+ depends on EROFS_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+config EROFS_FS_POSIX_ACL
+ bool "EROFS Access Control Lists"
+ depends on EROFS_FS_XATTR
+ select FS_POSIX_ACL
+ default y
+ help
+ Posix Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N.
+
+config EROFS_FS_SECURITY
+ bool "EROFS Security Labels"
+ depends on EROFS_FS_XATTR
+ default y
+ help
+ Security labels provide an access control facility to support Linux
+ Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
+ Linux. This option enables an extended attribute handler for file
+ security labels in the erofs filesystem, so that it requires enabling
+ the extended attribute support in advance.
+
+ If you are not using a security module, say N.
+
+config EROFS_FS_ZIP
+ bool "EROFS Data Compression Support"
+ depends on EROFS_FS
+ select LZ4_DECOMPRESS
+ default y
+ help
+ Enable fixed-sized output compression for EROFS.
+
+ If you don't want to enable compression feature, say N.
+
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+ int "EROFS Cluster Pages Hard Limit"
+ depends on EROFS_FS_ZIP
+ range 1 256
+ default "1"
+ help
+ Indicates maximum # of pages of a compressed
+ physical cluster.
+
+ For example, if files in a image were compressed
+ into 8k-unit, hard limit should not be configured
+ less than 2. Otherwise, the image will be refused
+ to mount on this kernel.
+
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
new file mode 100644
index 000000000000..46f2aa4ba46c
--- /dev/null
+++ b/fs/erofs/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+EROFS_VERSION = "1.0"
+
+ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\"
+
+obj-$(CONFIG_EROFS_FS) += erofs.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
+erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
new file mode 100644
index 000000000000..07d279fd5d67
--- /dev/null
+++ b/fs/erofs/compress.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_COMPRESS_H
+#define __EROFS_FS_COMPRESS_H
+
+#include "internal.h"
+
+enum {
+ Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+ Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
+
+struct z_erofs_decompress_req {
+ struct super_block *sb;
+ struct page **in, **out;
+
+ unsigned short pageofs_out;
+ unsigned int inputsize, outputsize;
+
+ /* indicate the algorithm will be used for decompression */
+ unsigned int alg;
+ bool inplace_io, partial_decoding;
+};
+
+/*
+ * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) -
+ * used to mark temporary allocated pages from other
+ * file/cached pages and NULL mapping pages.
+ */
+#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D)
+
+/* check if a page is marked as staging */
+static inline bool z_erofs_page_is_staging(struct page *page)
+{
+ return page->mapping == Z_EROFS_MAPPING_STAGING;
+}
+
+static inline bool z_erofs_put_stagingpage(struct list_head *pagepool,
+ struct page *page)
+{
+ if (!z_erofs_page_is_staging(page))
+ return false;
+
+ /* staging pages should not be used by others at the same time */
+ if (page_ref_count(page) > 1)
+ put_page(page);
+ else
+ list_add(&page->lru, pagepool);
+ return true;
+}
+
+int z_erofs_decompress(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool);
+
+#endif
+
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
new file mode 100644
index 000000000000..8a9fcbd0e8ac
--- /dev/null
+++ b/fs/erofs/data.c
@@ -0,0 +1,360 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "internal.h"
+#include <linux/prefetch.h>
+
+#include <trace/events/erofs.h>
+
+static void erofs_readendio(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ blk_status_t err = bio->bi_status;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+
+ /* page is already locked */
+ DBG_BUGON(PageUptodate(page));
+
+ if (err)
+ SetPageError(page);
+ else
+ SetPageUptodate(page);
+
+ unlock_page(page);
+ /* page could be reclaimed now */
+ }
+ bio_put(bio);
+}
+
+struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr)
+{
+ struct inode *const bd_inode = sb->s_bdev->bd_inode;
+ struct address_space *const mapping = bd_inode->i_mapping;
+
+ return read_cache_page_gfp(mapping, blkaddr,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
+}
+
+static int erofs_map_blocks_flatmode(struct inode *inode,
+ struct erofs_map_blocks *map,
+ int flags)
+{
+ int err = 0;
+ erofs_blk_t nblocks, lastblk;
+ u64 offset = map->m_la;
+ struct erofs_inode *vi = EROFS_I(inode);
+ bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
+
+ trace_erofs_map_blocks_flatmode_enter(inode, map, flags);
+
+ nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+ lastblk = nblocks - tailendpacking;
+
+ if (offset >= inode->i_size) {
+ /* leave out-of-bound access unmapped */
+ map->m_flags = 0;
+ map->m_plen = 0;
+ goto out;
+ }
+
+ /* there is no hole in flatmode */
+ map->m_flags = EROFS_MAP_MAPPED;
+
+ if (offset < blknr_to_addr(lastblk)) {
+ map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la;
+ map->m_plen = blknr_to_addr(lastblk) - offset;
+ } else if (tailendpacking) {
+ /* 2 - inode inline B: inode, [xattrs], inline last blk... */
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+
+ map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize +
+ vi->xattr_isize + erofs_blkoff(map->m_la);
+ map->m_plen = inode->i_size - offset;
+
+ /* inline data should be located in one meta block */
+ if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) {
+ erofs_err(inode->i_sb,
+ "inline data cross block boundary @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ err = -EFSCORRUPTED;
+ goto err_out;
+ }
+
+ map->m_flags |= EROFS_MAP_META;
+ } else {
+ erofs_err(inode->i_sb,
+ "internal error @ nid: %llu (size %llu), m_la 0x%llx",
+ vi->nid, inode->i_size, map->m_la);
+ DBG_BUGON(1);
+ err = -EIO;
+ goto err_out;
+ }
+
+out:
+ map->m_llen = map->m_plen;
+
+err_out:
+ trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
+ return err;
+}
+
+int erofs_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
+{
+ if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
+ int err = z_erofs_map_blocks_iter(inode, map, flags);
+
+ if (map->mpage) {
+ put_page(map->mpage);
+ map->mpage = NULL;
+ }
+ return err;
+ }
+ return erofs_map_blocks_flatmode(inode, map, flags);
+}
+
+static inline struct bio *erofs_read_raw_page(struct bio *bio,
+ struct address_space *mapping,
+ struct page *page,
+ erofs_off_t *last_block,
+ unsigned int nblocks,
+ bool ra)
+{
+ struct inode *const inode = mapping->host;
+ struct super_block *const sb = inode->i_sb;
+ erofs_off_t current_block = (erofs_off_t)page->index;
+ int err;
+
+ DBG_BUGON(!nblocks);
+
+ if (PageUptodate(page)) {
+ err = 0;
+ goto has_updated;
+ }
+
+ /* note that for readpage case, bio also equals to NULL */
+ if (bio &&
+ /* not continuous */
+ *last_block + 1 != current_block) {
+submit_bio_retry:
+ submit_bio(bio);
+ bio = NULL;
+ }
+
+ if (!bio) {
+ struct erofs_map_blocks map = {
+ .m_la = blknr_to_addr(current_block),
+ };
+ erofs_blk_t blknr;
+ unsigned int blkoff;
+
+ err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (err)
+ goto err_out;
+
+ /* zero out the holed page */
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ zero_user_segment(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+
+ /* imply err = 0, see erofs_map_blocks */
+ goto has_updated;
+ }
+
+ /* for RAW access mode, m_plen must be equal to m_llen */
+ DBG_BUGON(map.m_plen != map.m_llen);
+
+ blknr = erofs_blknr(map.m_pa);
+ blkoff = erofs_blkoff(map.m_pa);
+
+ /* deal with inline page */
+ if (map.m_flags & EROFS_MAP_META) {
+ void *vsrc, *vto;
+ struct page *ipage;
+
+ DBG_BUGON(map.m_plen > PAGE_SIZE);
+
+ ipage = erofs_get_meta_page(inode->i_sb, blknr);
+
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto err_out;
+ }
+
+ vsrc = kmap_atomic(ipage);
+ vto = kmap_atomic(page);
+ memcpy(vto, vsrc + blkoff, map.m_plen);
+ memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
+ kunmap_atomic(vto);
+ kunmap_atomic(vsrc);
+ flush_dcache_page(page);
+
+ SetPageUptodate(page);
+ /* TODO: could we unlock the page earlier? */
+ unlock_page(ipage);
+ put_page(ipage);
+
+ /* imply err = 0, see erofs_map_blocks */
+ goto has_updated;
+ }
+
+ /* pa must be block-aligned for raw reading */
+ DBG_BUGON(erofs_blkoff(map.m_pa));
+
+ /* max # of continuous pages */
+ if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
+ nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
+ if (nblocks > BIO_MAX_PAGES)
+ nblocks = BIO_MAX_PAGES;
+
+ bio = bio_alloc(GFP_NOIO, nblocks);
+
+ bio->bi_end_io = erofs_readendio;
+ bio_set_dev(bio, sb->s_bdev);
+ bio->bi_iter.bi_sector = (sector_t)blknr <<
+ LOG_SECTORS_PER_BLOCK;
+ bio->bi_opf = REQ_OP_READ;
+ }
+
+ err = bio_add_page(bio, page, PAGE_SIZE, 0);
+ /* out of the extent or bio is full */
+ if (err < PAGE_SIZE)
+ goto submit_bio_retry;
+
+ *last_block = current_block;
+
+ /* shift in advance in case of it followed by too many gaps */
+ if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) {
+ /* err should reassign to 0 after submitting */
+ err = 0;
+ goto submit_bio_out;
+ }
+
+ return bio;
+
+err_out:
+ /* for sync reading, set page error immediately */
+ if (!ra) {
+ SetPageError(page);
+ ClearPageUptodate(page);
+ }
+has_updated:
+ unlock_page(page);
+
+ /* if updated manually, continuous pages has a gap */
+ if (bio)
+submit_bio_out:
+ submit_bio(bio);
+ return err ? ERR_PTR(err) : NULL;
+}
+
+/*
+ * since we dont have write or truncate flows, so no inode
+ * locking needs to be held at the moment.
+ */
+static int erofs_raw_access_readpage(struct file *file, struct page *page)
+{
+ erofs_off_t last_block;
+ struct bio *bio;
+
+ trace_erofs_readpage(page, true);
+
+ bio = erofs_read_raw_page(NULL, page->mapping,
+ page, &last_block, 1, false);
+
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ DBG_BUGON(bio); /* since we have only one bio -- must be NULL */
+ return 0;
+}
+
+static int erofs_raw_access_readpages(struct file *filp,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned int nr_pages)
+{
+ erofs_off_t last_block;
+ struct bio *bio = NULL;
+ gfp_t gfp = readahead_gfp_mask(mapping);
+ struct page *page = list_last_entry(pages, struct page, lru);
+
+ trace_erofs_readpages(mapping->host, page, nr_pages, true);
+
+ for (; nr_pages; --nr_pages) {
+ page = list_entry(pages->prev, struct page, lru);
+
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+
+ if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+ bio = erofs_read_raw_page(bio, mapping, page,
+ &last_block, nr_pages, true);
+
+ /* all the page errors are ignored when readahead */
+ if (IS_ERR(bio)) {
+ pr_err("%s, readahead error at page %lu of nid %llu\n",
+ __func__, page->index,
+ EROFS_I(mapping->host)->nid);
+
+ bio = NULL;
+ }
+ }
+
+ /* pages could still be locked */
+ put_page(page);
+ }
+ DBG_BUGON(!list_empty(pages));
+
+ /* the rare case (end in gaps) */
+ if (bio)
+ submit_bio(bio);
+ return 0;
+}
+
+static int erofs_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ struct erofs_map_blocks map = {
+ .m_la = iblock << 9,
+ };
+ int err;
+
+ err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (err)
+ return err;
+
+ if (map.m_flags & EROFS_MAP_MAPPED)
+ bh->b_blocknr = erofs_blknr(map.m_pa);
+
+ return err;
+}
+
+static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
+{
+ struct inode *inode = mapping->host;
+
+ if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) {
+ erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE;
+
+ if (block >> LOG_SECTORS_PER_BLOCK >= blks)
+ return 0;
+ }
+
+ return generic_block_bmap(mapping, block, erofs_get_block);
+}
+
+/* for uncompressed (aligned) files and raw access for other files */
+const struct address_space_operations erofs_raw_access_aops = {
+ .readpage = erofs_raw_access_readpage,
+ .readpages = erofs_raw_access_readpages,
+ .bmap = erofs_bmap,
+};
+
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
new file mode 100644
index 000000000000..19f89f9fb10c
--- /dev/null
+++ b/fs/erofs/decompressor.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "compress.h"
+#include <linux/module.h>
+#include <linux/lz4.h>
+
+#ifndef LZ4_DISTANCE_MAX /* history window size */
+#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */
+#endif
+
+#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
+#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN
+#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
+#endif
+
+struct z_erofs_decompressor {
+ /*
+ * if destpages have sparsed pages, fill them with bounce pages.
+ * it also check whether destpages indicate continuous physical memory.
+ */
+ int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool);
+ int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
+ char *name;
+};
+
+static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool)
+{
+ const unsigned int nr =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
+ unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
+ BITS_PER_LONG)] = { 0 };
+ void *kaddr = NULL;
+ unsigned int i, j, top;
+
+ top = 0;
+ for (i = j = 0; i < nr; ++i, ++j) {
+ struct page *const page = rq->out[i];
+ struct page *victim;
+
+ if (j >= LZ4_MAX_DISTANCE_PAGES)
+ j = 0;
+
+ /* 'valid' bounced can only be tested after a complete round */
+ if (test_bit(j, bounced)) {
+ DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES);
+ DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES);
+ availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES];
+ }
+
+ if (page) {
+ __clear_bit(j, bounced);
+ if (kaddr) {
+ if (kaddr + PAGE_SIZE == page_address(page))
+ kaddr += PAGE_SIZE;
+ else
+ kaddr = NULL;
+ } else if (!i) {
+ kaddr = page_address(page);
+ }
+ continue;
+ }
+ kaddr = NULL;
+ __set_bit(j, bounced);
+
+ if (top) {
+ victim = availables[--top];
+ get_page(victim);
+ } else {
+ victim = erofs_allocpage(pagepool, GFP_KERNEL, false);
+ if (!victim)
+ return -ENOMEM;
+ victim->mapping = Z_EROFS_MAPPING_STAGING;
+ }
+ rq->out[i] = victim;
+ }
+ return kaddr ? 1 : 0;
+}
+
+static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
+ u8 *src, unsigned int pageofs_in)
+{
+ /*
+ * if in-place decompression is ongoing, those decompressed
+ * pages should be copied in order to avoid being overlapped.
+ */
+ struct page **in = rq->in;
+ u8 *const tmp = erofs_get_pcpubuf(0);
+ u8 *tmpp = tmp;
+ unsigned int inlen = rq->inputsize - pageofs_in;
+ unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
+
+ while (tmpp < tmp + inlen) {
+ if (!src)
+ src = kmap_atomic(*in);
+ memcpy(tmpp, src + pageofs_in, count);
+ kunmap_atomic(src);
+ src = NULL;
+ tmpp += count;
+ pageofs_in = 0;
+ count = PAGE_SIZE;
+ ++in;
+ }
+ return tmp;
+}
+
+static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+{
+ unsigned int inputmargin, inlen;
+ u8 *src;
+ bool copied, support_0padding;
+ int ret;
+
+ if (rq->inputsize > PAGE_SIZE)
+ return -EOPNOTSUPP;
+
+ src = kmap_atomic(*rq->in);
+ inputmargin = 0;
+ support_0padding = false;
+
+ /* decompression inplace is only safe when 0padding is enabled */
+ if (EROFS_SB(rq->sb)->feature_incompat &
+ EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) {
+ support_0padding = true;
+
+ while (!src[inputmargin & ~PAGE_MASK])
+ if (!(++inputmargin & ~PAGE_MASK))
+ break;
+
+ if (inputmargin >= rq->inputsize) {
+ kunmap_atomic(src);
+ return -EIO;
+ }
+ }
+
+ copied = false;
+ inlen = rq->inputsize - inputmargin;
+ if (rq->inplace_io) {
+ const uint oend = (rq->pageofs_out +
+ rq->outputsize) & ~PAGE_MASK;
+ const uint nr = PAGE_ALIGN(rq->pageofs_out +
+ rq->outputsize) >> PAGE_SHIFT;
+
+ if (rq->partial_decoding || !support_0padding ||
+ rq->out[nr - 1] != rq->in[0] ||
+ rq->inputsize - oend <
+ LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) {
+ src = generic_copy_inplace_data(rq, src, inputmargin);
+ inputmargin = 0;
+ copied = true;
+ }
+ }
+
+ ret = LZ4_decompress_safe_partial(src + inputmargin, out,
+ inlen, rq->outputsize,
+ rq->outputsize);
+ if (ret < 0) {
+ erofs_err(rq->sb, "failed to decompress, in[%u, %u] out[%u]",
+ inlen, inputmargin, rq->outputsize);
+ WARN_ON(1);
+ print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
+ 16, 1, src + inputmargin, inlen, true);
+ print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
+ 16, 1, out, rq->outputsize, true);
+ ret = -EIO;
+ }
+
+ if (copied)
+ erofs_put_pcpubuf(src);
+ else
+ kunmap_atomic(src);
+ return ret;
+}
+
+static struct z_erofs_decompressor decompressors[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = {
+ .name = "shifted"
+ },
+ [Z_EROFS_COMPRESSION_LZ4] = {
+ .prepare_destpages = z_erofs_lz4_prepare_destpages,
+ .decompress = z_erofs_lz4_decompress,
+ .name = "lz4"
+ },
+};
+
+static void copy_from_pcpubuf(struct page **out, const char *dst,
+ unsigned short pageofs_out,
+ unsigned int outputsize)
+{
+ const char *end = dst + outputsize;
+ const unsigned int righthalf = PAGE_SIZE - pageofs_out;
+ const char *cur = dst - pageofs_out;
+
+ while (cur < end) {
+ struct page *const page = *out++;
+
+ if (page) {
+ char *buf = kmap_atomic(page);
+
+ if (cur >= dst) {
+ memcpy(buf, cur, min_t(uint, PAGE_SIZE,
+ end - cur));
+ } else {
+ memcpy(buf + pageofs_out, cur + pageofs_out,
+ min_t(uint, righthalf, end - cur));
+ }
+ kunmap_atomic(buf);
+ }
+ cur += PAGE_SIZE;
+ }
+}
+
+static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const struct z_erofs_decompressor *alg = decompressors + rq->alg;
+ unsigned int dst_maptype;
+ void *dst;
+ int ret, i;
+
+ if (nrpages_out == 1 && !rq->inplace_io) {
+ DBG_BUGON(!*rq->out);
+ dst = kmap_atomic(*rq->out);
+ dst_maptype = 0;
+ goto dstmap_out;
+ }
+
+ /*
+ * For the case of small output size (especially much less
+ * than PAGE_SIZE), memcpy the decompressed data rather than
+ * compressed data is preferred.
+ */
+ if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
+ dst = erofs_get_pcpubuf(0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+
+ rq->inplace_io = false;
+ ret = alg->decompress(rq, dst);
+ if (!ret)
+ copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
+ rq->outputsize);
+
+ erofs_put_pcpubuf(dst);
+ return ret;
+ }
+
+ ret = alg->prepare_destpages(rq, pagepool);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ dst = page_address(*rq->out);
+ dst_maptype = 1;
+ goto dstmap_out;
+ }
+
+ i = 0;
+ while (1) {
+ dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL);
+
+ /* retry two more times (totally 3 times) */
+ if (dst || ++i >= 3)
+ break;
+ vm_unmap_aliases();
+ }
+
+ if (!dst)
+ return -ENOMEM;
+
+ dst_maptype = 2;
+
+dstmap_out:
+ ret = alg->decompress(rq, dst + rq->pageofs_out);
+
+ if (!dst_maptype)
+ kunmap_atomic(dst);
+ else if (dst_maptype == 2)
+ vm_unmap_ram(dst, nrpages_out);
+ return ret;
+}
+
+static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out;
+ unsigned char *src, *dst;
+
+ if (nrpages_out > 2) {
+ DBG_BUGON(1);
+ return -EIO;
+ }
+
+ if (rq->out[0] == *rq->in) {
+ DBG_BUGON(nrpages_out != 1);
+ return 0;
+ }
+
+ src = kmap_atomic(*rq->in);
+ if (!rq->out[0]) {
+ dst = NULL;
+ } else {
+ dst = kmap_atomic(rq->out[0]);
+ memcpy(dst + rq->pageofs_out, src, righthalf);
+ }
+
+ if (rq->out[1] == *rq->in) {
+ memmove(src, src + righthalf, rq->pageofs_out);
+ } else if (nrpages_out == 2) {
+ if (dst)
+ kunmap_atomic(dst);
+ DBG_BUGON(!rq->out[1]);
+ dst = kmap_atomic(rq->out[1]);
+ memcpy(dst, src + righthalf, rq->pageofs_out);
+ }
+ if (dst)
+ kunmap_atomic(dst);
+ kunmap_atomic(src);
+ return 0;
+}
+
+int z_erofs_decompress(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool)
+{
+ if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
+ return z_erofs_shifted_transform(rq, pagepool);
+ return z_erofs_decompress_generic(rq, pagepool);
+}
+
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
new file mode 100644
index 000000000000..d28c623dfef9
--- /dev/null
+++ b/fs/erofs/dir.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "internal.h"
+
+static void debug_one_dentry(unsigned char d_type, const char *de_name,
+ unsigned int de_namelen)
+{
+#ifdef CONFIG_EROFS_FS_DEBUG
+ /* since the on-disk name could not have the trailing '\0' */
+ unsigned char dbg_namebuf[EROFS_NAME_LEN + 1];
+
+ memcpy(dbg_namebuf, de_name, de_namelen);
+ dbg_namebuf[de_namelen] = '\0';
+
+ erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf,
+ de_namelen, d_type);
+#endif
+}
+
+static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
+ void *dentry_blk, unsigned int *ofs,
+ unsigned int nameoff, unsigned int maxsize)
+{
+ struct erofs_dirent *de = dentry_blk + *ofs;
+ const struct erofs_dirent *end = dentry_blk + nameoff;
+
+ while (de < end) {
+ const char *de_name;
+ unsigned int de_namelen;
+ unsigned char d_type;
+
+ d_type = fs_ftype_to_dtype(de->file_type);
+
+ nameoff = le16_to_cpu(de->nameoff);
+ de_name = (char *)dentry_blk + nameoff;
+
+ /* the last dirent in the block? */
+ if (de + 1 >= end)
+ de_namelen = strnlen(de_name, maxsize - nameoff);
+ else
+ de_namelen = le16_to_cpu(de[1].nameoff) - nameoff;
+
+ /* a corrupted entry is found */
+ if (nameoff + de_namelen > maxsize ||
+ de_namelen > EROFS_NAME_LEN) {
+ erofs_err(dir->i_sb, "bogus dirent @ nid %llu",
+ EROFS_I(dir)->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+
+ debug_one_dentry(d_type, de_name, de_namelen);
+ if (!dir_emit(ctx, de_name, de_namelen,
+ le64_to_cpu(de->nid), d_type))
+ /* stopped by some reason */
+ return 1;
+ ++de;
+ *ofs += sizeof(struct erofs_dirent);
+ }
+ *ofs = maxsize;
+ return 0;
+}
+
+static int erofs_readdir(struct file *f, struct dir_context *ctx)
+{
+ struct inode *dir = file_inode(f);
+ struct address_space *mapping = dir->i_mapping;
+ const size_t dirsize = i_size_read(dir);
+ unsigned int i = ctx->pos / EROFS_BLKSIZ;
+ unsigned int ofs = ctx->pos % EROFS_BLKSIZ;
+ int err = 0;
+ bool initial = true;
+
+ while (ctx->pos < dirsize) {
+ struct page *dentry_page;
+ struct erofs_dirent *de;
+ unsigned int nameoff, maxsize;
+
+ dentry_page = read_mapping_page(mapping, i, NULL);
+ if (dentry_page == ERR_PTR(-ENOMEM)) {
+ err = -ENOMEM;
+ break;
+ } else if (IS_ERR(dentry_page)) {
+ erofs_err(dir->i_sb,
+ "fail to readdir of logical block %u of nid %llu",
+ i, EROFS_I(dir)->nid);
+ err = -EFSCORRUPTED;
+ break;
+ }
+
+ de = (struct erofs_dirent *)kmap(dentry_page);
+
+ nameoff = le16_to_cpu(de->nameoff);
+
+ if (nameoff < sizeof(struct erofs_dirent) ||
+ nameoff >= PAGE_SIZE) {
+ erofs_err(dir->i_sb,
+ "invalid de[0].nameoff %u @ nid %llu",
+ nameoff, EROFS_I(dir)->nid);
+ err = -EFSCORRUPTED;
+ goto skip_this;
+ }
+
+ maxsize = min_t(unsigned int,
+ dirsize - ctx->pos + ofs, PAGE_SIZE);
+
+ /* search dirents at the arbitrary position */
+ if (initial) {
+ initial = false;
+
+ ofs = roundup(ofs, sizeof(struct erofs_dirent));
+ if (ofs >= nameoff)
+ goto skip_this;
+ }
+
+ err = erofs_fill_dentries(dir, ctx, de, &ofs,
+ nameoff, maxsize);
+skip_this:
+ kunmap(dentry_page);
+
+ put_page(dentry_page);
+
+ ctx->pos = blknr_to_addr(i) + ofs;
+
+ if (err)
+ break;
+ ++i;
+ ofs = 0;
+ }
+ return err < 0 ? err : 0;
+}
+
+const struct file_operations erofs_dir_fops = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = erofs_readdir,
+};
+
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
new file mode 100644
index 000000000000..b1ee5654750d
--- /dev/null
+++ b/fs/erofs/erofs_fs.h
@@ -0,0 +1,316 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */
+/*
+ * EROFS (Enhanced ROM File System) on-disk format definition
+ *
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_H
+#define __EROFS_FS_H
+
+#define EROFS_SUPER_OFFSET 1024
+
+/*
+ * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
+ * be incompatible with this kernel version.
+ */
+#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001
+#define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING
+
+/* 128-byte erofs on-disk super block */
+struct erofs_super_block {
+ __le32 magic; /* file system magic number */
+ __le32 checksum; /* crc32c(super_block) */
+ __le32 feature_compat;
+ __u8 blkszbits; /* support block_size == PAGE_SIZE only */
+ __u8 reserved;
+
+ __le16 root_nid; /* nid of root directory */
+ __le64 inos; /* total valid ino # (== f_files - f_favail) */
+
+ __le64 build_time; /* inode v1 time derivation */
+ __le32 build_time_nsec; /* inode v1 time derivation in nano scale */
+ __le32 blocks; /* used for statfs */
+ __le32 meta_blkaddr; /* start block address of metadata area */
+ __le32 xattr_blkaddr; /* start block address of shared xattr area */
+ __u8 uuid[16]; /* 128-bit uuid for volume */
+ __u8 volume_name[16]; /* volume name */
+ __le32 feature_incompat;
+
+ __u8 reserved2[44];
+};
+
+/*
+ * erofs inode datalayout (i_format in on-disk inode):
+ * 0 - inode plain without inline data A:
+ * inode, [xattrs], ... | ... | no-holed data
+ * 1 - inode VLE compression B (legacy):
+ * inode, [xattrs], extents ... | ...
+ * 2 - inode plain with inline data C:
+ * inode, [xattrs], last_inline_data, ... | ... | no-holed data
+ * 3 - inode compression D:
+ * inode, [xattrs], map_header, extents ... | ...
+ * 4~7 - reserved
+ */
+enum {
+ EROFS_INODE_FLAT_PLAIN = 0,
+ EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1,
+ EROFS_INODE_FLAT_INLINE = 2,
+ EROFS_INODE_FLAT_COMPRESSION = 3,
+ EROFS_INODE_DATALAYOUT_MAX
+};
+
+static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
+{
+ return datamode == EROFS_INODE_FLAT_COMPRESSION ||
+ datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY;
+}
+
+/* bit definitions of inode i_advise */
+#define EROFS_I_VERSION_BITS 1
+#define EROFS_I_DATALAYOUT_BITS 3
+
+#define EROFS_I_VERSION_BIT 0
+#define EROFS_I_DATALAYOUT_BIT 1
+
+/* 32-byte reduced form of an ondisk inode */
+struct erofs_inode_compact {
+ __le16 i_format; /* inode format hints */
+
+/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
+ __le16 i_xattr_icount;
+ __le16 i_mode;
+ __le16 i_nlink;
+ __le32 i_size;
+ __le32 i_reserved;
+ union {
+ /* file total compressed blocks for data mapping 1 */
+ __le32 compressed_blocks;
+ __le32 raw_blkaddr;
+
+ /* for device files, used to indicate old/new device # */
+ __le32 rdev;
+ } i_u;
+ __le32 i_ino; /* only used for 32-bit stat compatibility */
+ __le16 i_uid;
+ __le16 i_gid;
+ __le32 i_reserved2;
+};
+
+/* 32 bytes on-disk inode */
+#define EROFS_INODE_LAYOUT_COMPACT 0
+/* 64 bytes on-disk inode */
+#define EROFS_INODE_LAYOUT_EXTENDED 1
+
+/* 64-byte complete form of an ondisk inode */
+struct erofs_inode_extended {
+ __le16 i_format; /* inode format hints */
+
+/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
+ __le16 i_xattr_icount;
+ __le16 i_mode;
+ __le16 i_reserved;
+ __le64 i_size;
+ union {
+ /* file total compressed blocks for data mapping 1 */
+ __le32 compressed_blocks;
+ __le32 raw_blkaddr;
+
+ /* for device files, used to indicate old/new device # */
+ __le32 rdev;
+ } i_u;
+
+ /* only used for 32-bit stat compatibility */
+ __le32 i_ino;
+
+ __le32 i_uid;
+ __le32 i_gid;
+ __le64 i_ctime;
+ __le32 i_ctime_nsec;
+ __le32 i_nlink;
+ __u8 i_reserved2[16];
+};
+
+#define EROFS_MAX_SHARED_XATTRS (128)
+/* h_shared_count between 129 ... 255 are special # */
+#define EROFS_SHARED_XATTR_EXTENT (255)
+
+/*
+ * inline xattrs (n == i_xattr_icount):
+ * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes
+ * 12 bytes / \
+ * / \
+ * /-----------------------\
+ * | erofs_xattr_entries+ |
+ * +-----------------------+
+ * inline xattrs must starts in erofs_xattr_ibody_header,
+ * for read-only fs, no need to introduce h_refcount
+ */
+struct erofs_xattr_ibody_header {
+ __le32 h_reserved;
+ __u8 h_shared_count;
+ __u8 h_reserved2[7];
+ __le32 h_shared_xattrs[0]; /* shared xattr id array */
+};
+
+/* Name indexes */
+#define EROFS_XATTR_INDEX_USER 1
+#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS 2
+#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
+#define EROFS_XATTR_INDEX_TRUSTED 4
+#define EROFS_XATTR_INDEX_LUSTRE 5
+#define EROFS_XATTR_INDEX_SECURITY 6
+
+/* xattr entry (for both inline & shared xattrs) */
+struct erofs_xattr_entry {
+ __u8 e_name_len; /* length of name */
+ __u8 e_name_index; /* attribute name index */
+ __le16 e_value_size; /* size of attribute value */
+ /* followed by e_name and e_value */
+ char e_name[0]; /* attribute name */
+};
+
+static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
+{
+ if (!i_xattr_icount)
+ return 0;
+
+ return sizeof(struct erofs_xattr_ibody_header) +
+ sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1);
+}
+
+#define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry))
+
+static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
+{
+ return EROFS_XATTR_ALIGN(sizeof(struct erofs_xattr_entry) +
+ e->e_name_len + le16_to_cpu(e->e_value_size));
+}
+
+/* available compression algorithm types (for h_algorithmtype) */
+enum {
+ Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_MAX
+};
+
+/*
+ * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
+ * e.g. for 4k logical cluster size, 4B if compacted 2B is off;
+ * (4B) + 2B + (4B) if compacted 2B is on.
+ */
+#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0
+
+#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT)
+
+struct z_erofs_map_header {
+ __le32 h_reserved1;
+ __le16 h_advise;
+ /*
+ * bit 0-3 : algorithm type of head 1 (logical cluster type 01);
+ * bit 4-7 : algorithm type of head 2 (logical cluster type 11).
+ */
+ __u8 h_algorithmtype;
+ /*
+ * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
+ * bit 3-4 : (physical - logical) cluster bits of head 1:
+ * For example, if logical clustersize = 4096, 1 for 8192.
+ * bit 5-7 : (physical - logical) cluster bits of head 2.
+ */
+ __u8 h_clusterbits;
+};
+
+#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8
+
+/*
+ * Fixed-sized output compression ondisk Logical Extent cluster type:
+ * 0 - literal (uncompressed) cluster
+ * 1 - compressed cluster (for the head logical cluster)
+ * 2 - compressed cluster (for the other logical clusters)
+ *
+ * In detail,
+ * 0 - literal (uncompressed) cluster,
+ * di_advise = 0
+ * di_clusterofs = the literal data offset of the cluster
+ * di_blkaddr = the blkaddr of the literal cluster
+ *
+ * 1 - compressed cluster (for the head logical cluster)
+ * di_advise = 1
+ * di_clusterofs = the decompressed data offset of the cluster
+ * di_blkaddr = the blkaddr of the compressed cluster
+ *
+ * 2 - compressed cluster (for the other logical clusters)
+ * di_advise = 2
+ * di_clusterofs =
+ * the decompressed data offset in its own head cluster
+ * di_u.delta[0] = distance to its corresponding head cluster
+ * di_u.delta[1] = distance to its corresponding tail cluster
+ * (di_advise could be 0, 1 or 2)
+ */
+enum {
+ Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0,
+ Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1,
+ Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2,
+ Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3,
+ Z_EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2
+#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0
+
+struct z_erofs_vle_decompressed_index {
+ __le16 di_advise;
+ /* where to decompress in the head cluster */
+ __le16 di_clusterofs;
+
+ union {
+ /* for the head cluster */
+ __le32 blkaddr;
+ /*
+ * for the rest clusters
+ * eg. for 4k page-sized cluster, maximum 4K*64k = 256M)
+ * [0] - pointing to the head cluster
+ * [1] - pointing to the tail cluster
+ */
+ __le16 delta[2];
+ } di_u;
+};
+
+#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \
+ (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \
+ sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING)
+
+/* dirent sorts in alphabet order, thus we can do binary search */
+struct erofs_dirent {
+ __le64 nid; /* node number */
+ __le16 nameoff; /* start offset of file name */
+ __u8 file_type; /* file type */
+ __u8 reserved; /* reserved */
+} __packed;
+
+/*
+ * EROFS file types should match generic FT_* types and
+ * it seems no need to add BUILD_BUG_ONs since potential
+ * unmatchness will break other fses as well...
+ */
+
+#define EROFS_NAME_LEN 255
+
+/* check the EROFS on-disk layout strictly at compile time */
+static inline void erofs_check_ondisk_layout_definitions(void)
+{
+ BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
+ BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32);
+ BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
+ BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
+ BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4);
+ BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8);
+ BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8);
+ BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12);
+
+ BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
+ Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
+}
+
+#endif
+
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
new file mode 100644
index 000000000000..3350ab65d892
--- /dev/null
+++ b/fs/erofs/inode.c
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "xattr.h"
+
+#include <trace/events/erofs.h>
+
+/* no locking */
+static int erofs_read_inode(struct inode *inode, void *data)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_inode_compact *dic = data;
+ struct erofs_inode_extended *die;
+
+ const unsigned int ifmt = le16_to_cpu(dic->i_format);
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+ erofs_blk_t nblks = 0;
+
+ vi->datalayout = erofs_inode_datalayout(ifmt);
+
+ if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
+ erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
+ vi->datalayout, vi->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+
+ switch (erofs_inode_version(ifmt)) {
+ case EROFS_INODE_LAYOUT_EXTENDED:
+ die = data;
+
+ vi->inode_isize = sizeof(struct erofs_inode_extended);
+ vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
+
+ inode->i_mode = le16_to_cpu(die->i_mode);
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr);
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ inode->i_rdev =
+ new_decode_dev(le32_to_cpu(die->i_u.rdev));
+ break;
+ case S_IFIFO:
+ case S_IFSOCK:
+ inode->i_rdev = 0;
+ break;
+ default:
+ goto bogusimode;
+ }
+ i_uid_write(inode, le32_to_cpu(die->i_uid));
+ i_gid_write(inode, le32_to_cpu(die->i_gid));
+ set_nlink(inode, le32_to_cpu(die->i_nlink));
+
+ /* ns timestamp */
+ inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
+ le64_to_cpu(die->i_ctime);
+ inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
+ le32_to_cpu(die->i_ctime_nsec);
+
+ inode->i_size = le64_to_cpu(die->i_size);
+
+ /* total blocks for compressed files */
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ nblks = le32_to_cpu(die->i_u.compressed_blocks);
+ break;
+ case EROFS_INODE_LAYOUT_COMPACT:
+ vi->inode_isize = sizeof(struct erofs_inode_compact);
+ vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
+
+ inode->i_mode = le16_to_cpu(dic->i_mode);
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr);
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ inode->i_rdev =
+ new_decode_dev(le32_to_cpu(dic->i_u.rdev));
+ break;
+ case S_IFIFO:
+ case S_IFSOCK:
+ inode->i_rdev = 0;
+ break;
+ default:
+ goto bogusimode;
+ }
+ i_uid_write(inode, le16_to_cpu(dic->i_uid));
+ i_gid_write(inode, le16_to_cpu(dic->i_gid));
+ set_nlink(inode, le16_to_cpu(dic->i_nlink));
+
+ /* use build time to derive all file time */
+ inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
+ sbi->build_time;
+ inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
+ sbi->build_time_nsec;
+
+ inode->i_size = le32_to_cpu(dic->i_size);
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ nblks = le32_to_cpu(dic->i_u.compressed_blocks);
+ break;
+ default:
+ erofs_err(inode->i_sb,
+ "unsupported on-disk inode version %u of nid %llu",
+ erofs_inode_version(ifmt), vi->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+
+ if (!nblks)
+ /* measure inode.i_blocks as generic filesystems */
+ inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
+ else
+ inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK;
+ return 0;
+
+bogusimode:
+ erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu",
+ inode->i_mode, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+}
+
+static int erofs_fill_symlink(struct inode *inode, void *data,
+ unsigned int m_pofs)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ char *lnk;
+
+ /* if it cannot be handled with fast symlink scheme */
+ if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
+ inode->i_size >= PAGE_SIZE) {
+ inode->i_op = &erofs_symlink_iops;
+ return 0;
+ }
+
+ lnk = kmalloc(inode->i_size + 1, GFP_KERNEL);
+ if (!lnk)
+ return -ENOMEM;
+
+ m_pofs += vi->inode_isize + vi->xattr_isize;
+ /* inline symlink data shouldn't cross page boundary as well */
+ if (m_pofs + inode->i_size > PAGE_SIZE) {
+ kfree(lnk);
+ erofs_err(inode->i_sb,
+ "inline data cross block boundary @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+
+ memcpy(lnk, data + m_pofs, inode->i_size);
+ lnk[inode->i_size] = '\0';
+
+ inode->i_link = lnk;
+ inode->i_op = &erofs_fast_symlink_iops;
+ return 0;
+}
+
+static int erofs_fill_inode(struct inode *inode, int isdir)
+{
+ struct super_block *sb = inode->i_sb;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct page *page;
+ void *data;
+ int err;
+ erofs_blk_t blkaddr;
+ unsigned int ofs;
+ erofs_off_t inode_loc;
+
+ trace_erofs_fill_inode(inode, isdir);
+ inode_loc = iloc(EROFS_SB(sb), vi->nid);
+ blkaddr = erofs_blknr(inode_loc);
+ ofs = erofs_blkoff(inode_loc);
+
+ erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u",
+ __func__, vi->nid, ofs, blkaddr);
+
+ page = erofs_get_meta_page(sb, blkaddr);
+
+ if (IS_ERR(page)) {
+ erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
+ vi->nid, PTR_ERR(page));
+ return PTR_ERR(page);
+ }
+
+ DBG_BUGON(!PageUptodate(page));
+ data = page_address(page);
+
+ err = erofs_read_inode(inode, data + ofs);
+ if (err)
+ goto out_unlock;
+
+ /* setup the new inode */
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &erofs_generic_iops;
+ inode->i_fop = &generic_ro_fops;
+ break;
+ case S_IFDIR:
+ inode->i_op = &erofs_dir_iops;
+ inode->i_fop = &erofs_dir_fops;
+ break;
+ case S_IFLNK:
+ err = erofs_fill_symlink(inode, data, ofs);
+ if (err)
+ goto out_unlock;
+ inode_nohighmem(inode);
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ inode->i_op = &erofs_generic_iops;
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ goto out_unlock;
+ default:
+ err = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ if (erofs_inode_is_data_compressed(vi->datalayout)) {
+ err = z_erofs_fill_inode(inode);
+ goto out_unlock;
+ }
+ inode->i_mapping->a_ops = &erofs_raw_access_aops;
+
+out_unlock:
+ unlock_page(page);
+ put_page(page);
+ return err;
+}
+
+/*
+ * erofs nid is 64bits, but i_ino is 'unsigned long', therefore
+ * we should do more for 32-bit platform to find the right inode.
+ */
+static int erofs_ilookup_test_actor(struct inode *inode, void *opaque)
+{
+ const erofs_nid_t nid = *(erofs_nid_t *)opaque;
+
+ return EROFS_I(inode)->nid == nid;
+}
+
+static int erofs_iget_set_actor(struct inode *inode, void *opaque)
+{
+ const erofs_nid_t nid = *(erofs_nid_t *)opaque;
+
+ inode->i_ino = erofs_inode_hash(nid);
+ return 0;
+}
+
+static inline struct inode *erofs_iget_locked(struct super_block *sb,
+ erofs_nid_t nid)
+{
+ const unsigned long hashval = erofs_inode_hash(nid);
+
+ return iget5_locked(sb, hashval, erofs_ilookup_test_actor,
+ erofs_iget_set_actor, &nid);
+}
+
+struct inode *erofs_iget(struct super_block *sb,
+ erofs_nid_t nid,
+ bool isdir)
+{
+ struct inode *inode = erofs_iget_locked(sb, nid);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (inode->i_state & I_NEW) {
+ int err;
+ struct erofs_inode *vi = EROFS_I(inode);
+
+ vi->nid = nid;
+
+ err = erofs_fill_inode(inode, isdir);
+ if (!err)
+ unlock_new_inode(inode);
+ else {
+ iget_failed(inode);
+ inode = ERR_PTR(err);
+ }
+ }
+ return inode;
+}
+
+int erofs_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *const inode = d_inode(path->dentry);
+
+ if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+ stat->attributes |= STATX_ATTR_COMPRESSED;
+
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
+ STATX_ATTR_IMMUTABLE);
+
+ generic_fillattr(inode, stat);
+ return 0;
+}
+
+const struct inode_operations erofs_generic_iops = {
+ .getattr = erofs_getattr,
+#ifdef CONFIG_EROFS_FS_XATTR
+ .listxattr = erofs_listxattr,
+#endif
+ .get_acl = erofs_get_acl,
+};
+
+const struct inode_operations erofs_symlink_iops = {
+ .get_link = page_get_link,
+ .getattr = erofs_getattr,
+#ifdef CONFIG_EROFS_FS_XATTR
+ .listxattr = erofs_listxattr,
+#endif
+ .get_acl = erofs_get_acl,
+};
+
+const struct inode_operations erofs_fast_symlink_iops = {
+ .get_link = simple_get_link,
+ .getattr = erofs_getattr,
+#ifdef CONFIG_EROFS_FS_XATTR
+ .listxattr = erofs_listxattr,
+#endif
+ .get_acl = erofs_get_acl,
+};
+
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
new file mode 100644
index 000000000000..544a453f3076
--- /dev/null
+++ b/fs/erofs/internal.h
@@ -0,0 +1,431 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_INTERNAL_H
+#define __EROFS_INTERNAL_H
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include "erofs_fs.h"
+
+/* redefine pr_fmt "erofs: " */
+#undef pr_fmt
+#define pr_fmt(fmt) "erofs: " fmt
+
+__printf(3, 4) void _erofs_err(struct super_block *sb,
+ const char *function, const char *fmt, ...);
+#define erofs_err(sb, fmt, ...) \
+ _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__)
+__printf(3, 4) void _erofs_info(struct super_block *sb,
+ const char *function, const char *fmt, ...);
+#define erofs_info(sb, fmt, ...) \
+ _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__)
+#ifdef CONFIG_EROFS_FS_DEBUG
+#define erofs_dbg(x, ...) pr_debug(x "\n", ##__VA_ARGS__)
+#define DBG_BUGON BUG_ON
+#else
+#define erofs_dbg(x, ...) ((void)0)
+#define DBG_BUGON(x) ((void)(x))
+#endif /* !CONFIG_EROFS_FS_DEBUG */
+
+/* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
+#define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1
+
+typedef u64 erofs_nid_t;
+typedef u64 erofs_off_t;
+/* data type for filesystem-wide blocks number */
+typedef u32 erofs_blk_t;
+
+struct erofs_sb_info {
+#ifdef CONFIG_EROFS_FS_ZIP
+ /* list for all registered superblocks, mainly for shrinker */
+ struct list_head list;
+ struct mutex umount_mutex;
+
+ /* the dedicated workstation for compression */
+ struct radix_tree_root workstn_tree;
+
+ /* threshold for decompression synchronously */
+ unsigned int max_sync_decompress_pages;
+
+ unsigned int shrinker_run_no;
+
+ /* current strategy of how to use managed cache */
+ unsigned char cache_strategy;
+
+ /* pseudo inode to manage cached pages */
+ struct inode *managed_cache;
+#endif /* CONFIG_EROFS_FS_ZIP */
+ u32 blocks;
+ u32 meta_blkaddr;
+#ifdef CONFIG_EROFS_FS_XATTR
+ u32 xattr_blkaddr;
+#endif
+
+ /* inode slot unit size in bit shift */
+ unsigned char islotbits;
+
+ u32 build_time_nsec;
+ u64 build_time;
+
+ /* what we really care is nid, rather than ino.. */
+ erofs_nid_t root_nid;
+ /* used for statfs, f_files - f_favail */
+ u64 inos;
+
+ u8 uuid[16]; /* 128-bit uuid for volume */
+ u8 volume_name[16]; /* volume name */
+ u32 feature_incompat;
+
+ unsigned int mount_opt;
+};
+
+#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
+#define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info)
+
+/* Mount flags set via mount options or defaults */
+#define EROFS_MOUNT_XATTR_USER 0x00000010
+#define EROFS_MOUNT_POSIX_ACL 0x00000020
+
+#define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option)
+#define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option)
+#define test_opt(sbi, option) ((sbi)->mount_opt & EROFS_MOUNT_##option)
+
+#ifdef CONFIG_EROFS_FS_ZIP
+enum {
+ EROFS_ZIP_CACHE_DISABLED,
+ EROFS_ZIP_CACHE_READAHEAD,
+ EROFS_ZIP_CACHE_READAROUND
+};
+
+#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
+
+/* basic unit of the workstation of a super_block */
+struct erofs_workgroup {
+ /* the workgroup index in the workstation */
+ pgoff_t index;
+
+ /* overall workgroup reference count */
+ atomic_t refcount;
+};
+
+#if defined(CONFIG_SMP)
+static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
+ int val)
+{
+ preempt_disable();
+ if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
+ preempt_enable();
+ return false;
+ }
+ return true;
+}
+
+static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
+ int orig_val)
+{
+ /*
+ * other observers should notice all modifications
+ * in the freezing period.
+ */
+ smp_mb();
+ atomic_set(&grp->refcount, orig_val);
+ preempt_enable();
+}
+
+static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
+{
+ return atomic_cond_read_relaxed(&grp->refcount,
+ VAL != EROFS_LOCKED_MAGIC);
+}
+#else
+static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
+ int val)
+{
+ preempt_disable();
+ /* no need to spin on UP platforms, let's just disable preemption. */
+ if (val != atomic_read(&grp->refcount)) {
+ preempt_enable();
+ return false;
+ }
+ return true;
+}
+
+static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
+ int orig_val)
+{
+ preempt_enable();
+}
+
+static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
+{
+ int v = atomic_read(&grp->refcount);
+
+ /* workgroup is never freezed on uniprocessor systems */
+ DBG_BUGON(v == EROFS_LOCKED_MAGIC);
+ return v;
+}
+#endif /* !CONFIG_SMP */
+
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PCPUBUF_NR_PAGES 0
+#endif /* !CONFIG_EROFS_FS_ZIP */
+
+/* we strictly follow PAGE_SIZE and no buffer head yet */
+#define LOG_BLOCK_SIZE PAGE_SHIFT
+
+#undef LOG_SECTORS_PER_BLOCK
+#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9)
+
+#undef SECTORS_PER_BLOCK
+#define SECTORS_PER_BLOCK (1 << SECTORS_PER_BLOCK)
+
+#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE)
+
+#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ)
+#error erofs cannot be used in this platform
+#endif
+
+#define ROOT_NID(sb) ((sb)->root_nid)
+
+#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ)
+#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ)
+#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ)
+
+static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid)
+{
+ return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits);
+}
+
+/* atomic flag definitions */
+#define EROFS_I_EA_INITED_BIT 0
+#define EROFS_I_Z_INITED_BIT 1
+
+/* bitlock definitions (arranged in reverse order) */
+#define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1)
+#define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2)
+
+struct erofs_inode {
+ erofs_nid_t nid;
+
+ /* atomic flags (including bitlocks) */
+ unsigned long flags;
+
+ unsigned char datalayout;
+ unsigned char inode_isize;
+ unsigned short xattr_isize;
+
+ unsigned int xattr_shared_count;
+ unsigned int *xattr_shared_xattrs;
+
+ union {
+ erofs_blk_t raw_blkaddr;
+#ifdef CONFIG_EROFS_FS_ZIP
+ struct {
+ unsigned short z_advise;
+ unsigned char z_algorithmtype[2];
+ unsigned char z_logical_clusterbits;
+ unsigned char z_physical_clusterbits[2];
+ };
+#endif /* CONFIG_EROFS_FS_ZIP */
+ };
+ /* the corresponding vfs inode */
+ struct inode vfs_inode;
+};
+
+#define EROFS_I(ptr) \
+ container_of(ptr, struct erofs_inode, vfs_inode)
+
+static inline unsigned long erofs_inode_datablocks(struct inode *inode)
+{
+ /* since i_size cannot be changed */
+ return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
+}
+
+static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit,
+ unsigned int bits)
+{
+
+ return (value >> bit) & ((1 << bits) - 1);
+}
+
+
+static inline unsigned int erofs_inode_version(unsigned int value)
+{
+ return erofs_bitrange(value, EROFS_I_VERSION_BIT,
+ EROFS_I_VERSION_BITS);
+}
+
+static inline unsigned int erofs_inode_datalayout(unsigned int value)
+{
+ return erofs_bitrange(value, EROFS_I_DATALAYOUT_BIT,
+ EROFS_I_DATALAYOUT_BITS);
+}
+
+extern const struct super_operations erofs_sops;
+
+extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normalaccess_aops;
+#endif
+
+/*
+ * Logical to physical block mapping, used by erofs_map_blocks()
+ *
+ * Different with other file systems, it is used for 2 access modes:
+ *
+ * 1) RAW access mode:
+ *
+ * Users pass a valid (m_lblk, m_lofs -- usually 0) pair,
+ * and get the valid m_pblk, m_pofs and the longest m_len(in bytes).
+ *
+ * Note that m_lblk in the RAW access mode refers to the number of
+ * the compressed ondisk block rather than the uncompressed
+ * in-memory block for the compressed file.
+ *
+ * m_pofs equals to m_lofs except for the inline data page.
+ *
+ * 2) Normal access mode:
+ *
+ * If the inode is not compressed, it has no difference with
+ * the RAW access mode. However, if the inode is compressed,
+ * users should pass a valid (m_lblk, m_lofs) pair, and get
+ * the needed m_pblk, m_pofs, m_len to get the compressed data
+ * and the updated m_lblk, m_lofs which indicates the start
+ * of the corresponding uncompressed data in the file.
+ */
+enum {
+ BH_Zipped = BH_PrivateStart,
+ BH_FullMapped,
+};
+
+/* Has a disk mapping */
+#define EROFS_MAP_MAPPED (1 << BH_Mapped)
+/* Located in metadata (could be copied from bd_inode) */
+#define EROFS_MAP_META (1 << BH_Meta)
+/* The extent has been compressed */
+#define EROFS_MAP_ZIPPED (1 << BH_Zipped)
+/* The length of extent is full */
+#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
+
+struct erofs_map_blocks {
+ erofs_off_t m_pa, m_la;
+ u64 m_plen, m_llen;
+
+ unsigned int m_flags;
+
+ struct page *mpage;
+};
+
+/* Flags used by erofs_map_blocks() */
+#define EROFS_GET_BLOCKS_RAW 0x0001
+
+/* zmap.c */
+#ifdef CONFIG_EROFS_FS_ZIP
+int z_erofs_fill_inode(struct inode *inode);
+int z_erofs_map_blocks_iter(struct inode *inode,
+ struct erofs_map_blocks *map,
+ int flags);
+#else
+static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; }
+static inline int z_erofs_map_blocks_iter(struct inode *inode,
+ struct erofs_map_blocks *map,
+ int flags)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* !CONFIG_EROFS_FS_ZIP */
+
+/* data.c */
+struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
+
+int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
+
+/* inode.c */
+static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
+{
+#if BITS_PER_LONG == 32
+ return (nid >> 32) ^ (nid & 0xffffffff);
+#else
+ return nid;
+#endif
+}
+
+extern const struct inode_operations erofs_generic_iops;
+extern const struct inode_operations erofs_symlink_iops;
+extern const struct inode_operations erofs_fast_symlink_iops;
+
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir);
+int erofs_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags);
+
+/* namei.c */
+extern const struct inode_operations erofs_dir_iops;
+
+int erofs_namei(struct inode *dir, struct qstr *name,
+ erofs_nid_t *nid, unsigned int *d_type);
+
+/* dir.c */
+extern const struct file_operations erofs_dir_fops;
+
+/* utils.c / zdata.c */
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail);
+
+#if (EROFS_PCPUBUF_NR_PAGES > 0)
+void *erofs_get_pcpubuf(unsigned int pagenr);
+#define erofs_put_pcpubuf(buf) do { \
+ (void)&(buf); \
+ preempt_enable(); \
+} while (0)
+#else
+static inline void *erofs_get_pcpubuf(unsigned int pagenr)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+#define erofs_put_pcpubuf(buf) do {} while (0)
+#endif
+
+#ifdef CONFIG_EROFS_FS_ZIP
+int erofs_workgroup_put(struct erofs_workgroup *grp);
+struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
+ pgoff_t index, bool *tag);
+int erofs_register_workgroup(struct super_block *sb,
+ struct erofs_workgroup *grp, bool tag);
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+void erofs_shrinker_register(struct super_block *sb);
+void erofs_shrinker_unregister(struct super_block *sb);
+int __init erofs_init_shrinker(void);
+void erofs_exit_shrinker(void);
+int __init z_erofs_init_zip_subsystem(void);
+void z_erofs_exit_zip_subsystem(void);
+int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
+ struct erofs_workgroup *egrp);
+int erofs_try_to_free_cached_page(struct address_space *mapping,
+ struct page *page);
+#else
+static inline void erofs_shrinker_register(struct super_block *sb) {}
+static inline void erofs_shrinker_unregister(struct super_block *sb) {}
+static inline int erofs_init_shrinker(void) { return 0; }
+static inline void erofs_exit_shrinker(void) {}
+static inline int z_erofs_init_zip_subsystem(void) { return 0; }
+static inline void z_erofs_exit_zip_subsystem(void) {}
+#endif /* !CONFIG_EROFS_FS_ZIP */
+
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+
+#endif /* __EROFS_INTERNAL_H */
+
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
new file mode 100644
index 000000000000..3abbecbf73de
--- /dev/null
+++ b/fs/erofs/namei.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "xattr.h"
+
+#include <trace/events/erofs.h>
+
+struct erofs_qstr {
+ const unsigned char *name;
+ const unsigned char *end;
+};
+
+/* based on the end of qn is accurate and it must have the trailing '\0' */
+static inline int erofs_dirnamecmp(const struct erofs_qstr *qn,
+ const struct erofs_qstr *qd,
+ unsigned int *matched)
+{
+ unsigned int i = *matched;
+
+ /*
+ * on-disk error, let's only BUG_ON in the debugging mode.
+ * otherwise, it will return 1 to just skip the invalid name
+ * and go on (in consideration of the lookup performance).
+ */
+ DBG_BUGON(qd->name > qd->end);
+
+ /* qd could not have trailing '\0' */
+ /* However it is absolutely safe if < qd->end */
+ while (qd->name + i < qd->end && qd->name[i] != '\0') {
+ if (qn->name[i] != qd->name[i]) {
+ *matched = i;
+ return qn->name[i] > qd->name[i] ? 1 : -1;
+ }
+ ++i;
+ }
+ *matched = i;
+ /* See comments in __d_alloc on the terminating NUL character */
+ return qn->name[i] == '\0' ? 0 : 1;
+}
+
+#define nameoff_from_disk(off, sz) (le16_to_cpu(off) & ((sz) - 1))
+
+static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
+ u8 *data,
+ unsigned int dirblksize,
+ const int ndirents)
+{
+ int head, back;
+ unsigned int startprfx, endprfx;
+ struct erofs_dirent *const de = (struct erofs_dirent *)data;
+
+ /* since the 1st dirent has been evaluated previously */
+ head = 1;
+ back = ndirents - 1;
+ startprfx = endprfx = 0;
+
+ while (head <= back) {
+ const int mid = head + (back - head) / 2;
+ const int nameoff = nameoff_from_disk(de[mid].nameoff,
+ dirblksize);
+ unsigned int matched = min(startprfx, endprfx);
+ struct erofs_qstr dname = {
+ .name = data + nameoff,
+ .end = mid >= ndirents - 1 ?
+ data + dirblksize :
+ data + nameoff_from_disk(de[mid + 1].nameoff,
+ dirblksize)
+ };
+
+ /* string comparison without already matched prefix */
+ int ret = erofs_dirnamecmp(name, &dname, &matched);
+
+ if (!ret) {
+ return de + mid;
+ } else if (ret > 0) {
+ head = mid + 1;
+ startprfx = matched;
+ } else {
+ back = mid - 1;
+ endprfx = matched;
+ }
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
+static struct page *find_target_block_classic(struct inode *dir,
+ struct erofs_qstr *name,
+ int *_ndirents)
+{
+ unsigned int startprfx, endprfx;
+ int head, back;
+ struct address_space *const mapping = dir->i_mapping;
+ struct page *candidate = ERR_PTR(-ENOENT);
+
+ startprfx = endprfx = 0;
+ head = 0;
+ back = erofs_inode_datablocks(dir) - 1;
+
+ while (head <= back) {
+ const int mid = head + (back - head) / 2;
+ struct page *page = read_mapping_page(mapping, mid, NULL);
+
+ if (!IS_ERR(page)) {
+ struct erofs_dirent *de = kmap_atomic(page);
+ const int nameoff = nameoff_from_disk(de->nameoff,
+ EROFS_BLKSIZ);
+ const int ndirents = nameoff / sizeof(*de);
+ int diff;
+ unsigned int matched;
+ struct erofs_qstr dname;
+
+ if (!ndirents) {
+ kunmap_atomic(de);
+ put_page(page);
+ erofs_err(dir->i_sb,
+ "corrupted dir block %d @ nid %llu",
+ mid, EROFS_I(dir)->nid);
+ DBG_BUGON(1);
+ page = ERR_PTR(-EFSCORRUPTED);
+ goto out;
+ }
+
+ matched = min(startprfx, endprfx);
+
+ dname.name = (u8 *)de + nameoff;
+ if (ndirents == 1)
+ dname.end = (u8 *)de + EROFS_BLKSIZ;
+ else
+ dname.end = (u8 *)de +
+ nameoff_from_disk(de[1].nameoff,
+ EROFS_BLKSIZ);
+
+ /* string comparison without already matched prefix */
+ diff = erofs_dirnamecmp(name, &dname, &matched);
+ kunmap_atomic(de);
+
+ if (!diff) {
+ *_ndirents = 0;
+ goto out;
+ } else if (diff > 0) {
+ head = mid + 1;
+ startprfx = matched;
+
+ if (!IS_ERR(candidate))
+ put_page(candidate);
+ candidate = page;
+ *_ndirents = ndirents;
+ } else {
+ put_page(page);
+
+ back = mid - 1;
+ endprfx = matched;
+ }
+ continue;
+ }
+out: /* free if the candidate is valid */
+ if (!IS_ERR(candidate))
+ put_page(candidate);
+ return page;
+ }
+ return candidate;
+}
+
+int erofs_namei(struct inode *dir,
+ struct qstr *name,
+ erofs_nid_t *nid, unsigned int *d_type)
+{
+ int ndirents;
+ struct page *page;
+ void *data;
+ struct erofs_dirent *de;
+ struct erofs_qstr qn;
+
+ if (!dir->i_size)
+ return -ENOENT;
+
+ qn.name = name->name;
+ qn.end = name->name + name->len;
+
+ ndirents = 0;
+ page = find_target_block_classic(dir, &qn, &ndirents);
+
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ data = kmap_atomic(page);
+ /* the target page has been mapped */
+ if (ndirents)
+ de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents);
+ else
+ de = (struct erofs_dirent *)data;
+
+ if (!IS_ERR(de)) {
+ *nid = le64_to_cpu(de->nid);
+ *d_type = de->file_type;
+ }
+
+ kunmap_atomic(data);
+ put_page(page);
+
+ return PTR_ERR_OR_ZERO(de);
+}
+
+/* NOTE: i_mutex is already held by vfs */
+static struct dentry *erofs_lookup(struct inode *dir,
+ struct dentry *dentry,
+ unsigned int flags)
+{
+ int err;
+ erofs_nid_t nid;
+ unsigned int d_type;
+ struct inode *inode;
+
+ DBG_BUGON(!d_really_is_negative(dentry));
+ /* dentry must be unhashed in lookup, no need to worry about */
+ DBG_BUGON(!d_unhashed(dentry));
+
+ trace_erofs_lookup(dir, dentry, flags);
+
+ /* file name exceeds fs limit */
+ if (dentry->d_name.len > EROFS_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ /* false uninitialized warnings on gcc 4.8.x */
+ err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
+
+ if (err == -ENOENT) {
+ /* negative dentry */
+ inode = NULL;
+ } else if (err) {
+ inode = ERR_PTR(err);
+ } else {
+ erofs_dbg("%s, %s (nid %llu) found, d_type %u", __func__,
+ dentry->d_name.name, nid, d_type);
+ inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR);
+ }
+ return d_splice_alias(inode, dentry);
+}
+
+const struct inode_operations erofs_dir_iops = {
+ .lookup = erofs_lookup,
+ .getattr = erofs_getattr,
+#ifdef CONFIG_EROFS_FS_XATTR
+ .listxattr = erofs_listxattr,
+#endif
+ .get_acl = erofs_get_acl,
+};
+
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
new file mode 100644
index 000000000000..caf9a95173b0
--- /dev/null
+++ b/fs/erofs/super.c
@@ -0,0 +1,615 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include <linux/module.h>
+#include <linux/buffer_head.h>
+#include <linux/statfs.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include "xattr.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/erofs.h>
+
+static struct kmem_cache *erofs_inode_cachep __read_mostly;
+
+void _erofs_err(struct super_block *sb, const char *function,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf);
+ va_end(args);
+}
+
+void _erofs_info(struct super_block *sb, const char *function,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_info("(device %s): %pV", sb->s_id, &vaf);
+ va_end(args);
+}
+
+static void erofs_inode_init_once(void *ptr)
+{
+ struct erofs_inode *vi = ptr;
+
+ inode_init_once(&vi->vfs_inode);
+}
+
+static struct inode *erofs_alloc_inode(struct super_block *sb)
+{
+ struct erofs_inode *vi =
+ kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL);
+
+ if (!vi)
+ return NULL;
+
+ /* zero out everything except vfs_inode */
+ memset(vi, 0, offsetof(struct erofs_inode, vfs_inode));
+ return &vi->vfs_inode;
+}
+
+static void erofs_free_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+
+ /* be careful of RCU symlink path */
+ if (inode->i_op == &erofs_fast_symlink_iops)
+ kfree(inode->i_link);
+ kfree(vi->xattr_shared_xattrs);
+
+ kmem_cache_free(erofs_inode_cachep, vi);
+}
+
+static bool check_layout_compatibility(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ const unsigned int feature = le32_to_cpu(dsb->feature_incompat);
+
+ EROFS_SB(sb)->feature_incompat = feature;
+
+ /* check if current kernel meets all mandatory requirements */
+ if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) {
+ erofs_err(sb,
+ "unidentified incompatible feature %x, please upgrade kernel version",
+ feature & ~EROFS_ALL_FEATURE_INCOMPAT);
+ return false;
+ }
+ return true;
+}
+
+static int erofs_read_superblock(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi;
+ struct page *page;
+ struct erofs_super_block *dsb;
+ unsigned int blkszbits;
+ void *data;
+ int ret;
+
+ page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL);
+ if (!page) {
+ erofs_err(sb, "cannot read erofs superblock");
+ return -EIO;
+ }
+
+ sbi = EROFS_SB(sb);
+
+ data = kmap_atomic(page);
+ dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
+
+ ret = -EINVAL;
+ if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) {
+ erofs_err(sb, "cannot find valid erofs superblock");
+ goto out;
+ }
+
+ blkszbits = dsb->blkszbits;
+ /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */
+ if (blkszbits != LOG_BLOCK_SIZE) {
+ erofs_err(sb, "blksize %u isn't supported on this platform",
+ 1 << blkszbits);
+ goto out;
+ }
+
+ if (!check_layout_compatibility(sb, dsb))
+ goto out;
+
+ sbi->blocks = le32_to_cpu(dsb->blocks);
+ sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
+#ifdef CONFIG_EROFS_FS_XATTR
+ sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
+#endif
+ sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
+ sbi->root_nid = le16_to_cpu(dsb->root_nid);
+ sbi->inos = le64_to_cpu(dsb->inos);
+
+ sbi->build_time = le64_to_cpu(dsb->build_time);
+ sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
+
+ memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid));
+
+ ret = strscpy(sbi->volume_name, dsb->volume_name,
+ sizeof(dsb->volume_name));
+ if (ret < 0) { /* -E2BIG */
+ erofs_err(sb, "bad volume name without NIL terminator");
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+ ret = 0;
+out:
+ kunmap_atomic(data);
+ put_page(page);
+ return ret;
+}
+
+#ifdef CONFIG_EROFS_FS_ZIP
+static int erofs_build_cache_strategy(struct super_block *sb,
+ substring_t *args)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ const char *cs = match_strdup(args);
+ int err = 0;
+
+ if (!cs) {
+ erofs_err(sb, "Not enough memory to store cache strategy");
+ return -ENOMEM;
+ }
+
+ if (!strcmp(cs, "disabled")) {
+ sbi->cache_strategy = EROFS_ZIP_CACHE_DISABLED;
+ } else if (!strcmp(cs, "readahead")) {
+ sbi->cache_strategy = EROFS_ZIP_CACHE_READAHEAD;
+ } else if (!strcmp(cs, "readaround")) {
+ sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+ } else {
+ erofs_err(sb, "Unrecognized cache strategy \"%s\"", cs);
+ err = -EINVAL;
+ }
+ kfree(cs);
+ return err;
+}
+#else
+static int erofs_build_cache_strategy(struct super_block *sb,
+ substring_t *args)
+{
+ erofs_info(sb, "EROFS compression is disabled, so cache strategy is ignored");
+ return 0;
+}
+#endif
+
+/* set up default EROFS parameters */
+static void erofs_default_options(struct erofs_sb_info *sbi)
+{
+#ifdef CONFIG_EROFS_FS_ZIP
+ sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+ sbi->max_sync_decompress_pages = 3;
+#endif
+#ifdef CONFIG_EROFS_FS_XATTR
+ set_opt(sbi, XATTR_USER);
+#endif
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+ set_opt(sbi, POSIX_ACL);
+#endif
+}
+
+enum {
+ Opt_user_xattr,
+ Opt_nouser_xattr,
+ Opt_acl,
+ Opt_noacl,
+ Opt_cache_strategy,
+ Opt_err
+};
+
+static match_table_t erofs_tokens = {
+ {Opt_user_xattr, "user_xattr"},
+ {Opt_nouser_xattr, "nouser_xattr"},
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_cache_strategy, "cache_strategy=%s"},
+ {Opt_err, NULL}
+};
+
+static int erofs_parse_options(struct super_block *sb, char *options)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *p;
+ int err;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ","))) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, erofs_tokens, args);
+
+ switch (token) {
+#ifdef CONFIG_EROFS_FS_XATTR
+ case Opt_user_xattr:
+ set_opt(EROFS_SB(sb), XATTR_USER);
+ break;
+ case Opt_nouser_xattr:
+ clear_opt(EROFS_SB(sb), XATTR_USER);
+ break;
+#else
+ case Opt_user_xattr:
+ erofs_info(sb, "user_xattr options not supported");
+ break;
+ case Opt_nouser_xattr:
+ erofs_info(sb, "nouser_xattr options not supported");
+ break;
+#endif
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+ case Opt_acl:
+ set_opt(EROFS_SB(sb), POSIX_ACL);
+ break;
+ case Opt_noacl:
+ clear_opt(EROFS_SB(sb), POSIX_ACL);
+ break;
+#else
+ case Opt_acl:
+ erofs_info(sb, "acl options not supported");
+ break;
+ case Opt_noacl:
+ erofs_info(sb, "noacl options not supported");
+ break;
+#endif
+ case Opt_cache_strategy:
+ err = erofs_build_cache_strategy(sb, args);
+ if (err)
+ return err;
+ break;
+ default:
+ erofs_err(sb, "Unrecognized mount option \"%s\" or missing value", p);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+#ifdef CONFIG_EROFS_FS_ZIP
+static const struct address_space_operations managed_cache_aops;
+
+static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
+{
+ int ret = 1; /* 0 - busy */
+ struct address_space *const mapping = page->mapping;
+
+ DBG_BUGON(!PageLocked(page));
+ DBG_BUGON(mapping->a_ops != &managed_cache_aops);
+
+ if (PagePrivate(page))
+ ret = erofs_try_to_free_cached_page(mapping, page);
+
+ return ret;
+}
+
+static void erofs_managed_cache_invalidatepage(struct page *page,
+ unsigned int offset,
+ unsigned int length)
+{
+ const unsigned int stop = length + offset;
+
+ DBG_BUGON(!PageLocked(page));
+
+ /* Check for potential overflow in debug mode */
+ DBG_BUGON(stop > PAGE_SIZE || stop < length);
+
+ if (offset == 0 && stop == PAGE_SIZE)
+ while (!erofs_managed_cache_releasepage(page, GFP_NOFS))
+ cond_resched();
+}
+
+static const struct address_space_operations managed_cache_aops = {
+ .releasepage = erofs_managed_cache_releasepage,
+ .invalidatepage = erofs_managed_cache_invalidatepage,
+};
+
+static int erofs_init_managed_cache(struct super_block *sb)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ struct inode *const inode = new_inode(sb);
+
+ if (!inode)
+ return -ENOMEM;
+
+ set_nlink(inode, 1);
+ inode->i_size = OFFSET_MAX;
+
+ inode->i_mapping->a_ops = &managed_cache_aops;
+ mapping_set_gfp_mask(inode->i_mapping,
+ GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
+ sbi->managed_cache = inode;
+ return 0;
+}
+#else
+static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
+#endif
+
+static int erofs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct inode *inode;
+ struct erofs_sb_info *sbi;
+ int err;
+
+ sb->s_magic = EROFS_SUPER_MAGIC;
+
+ if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) {
+ erofs_err(sb, "failed to set erofs blksize");
+ return -EINVAL;
+ }
+
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+
+ sb->s_fs_info = sbi;
+ err = erofs_read_superblock(sb);
+ if (err)
+ return err;
+
+ sb->s_flags |= SB_RDONLY | SB_NOATIME;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_time_gran = 1;
+
+ sb->s_op = &erofs_sops;
+
+#ifdef CONFIG_EROFS_FS_XATTR
+ sb->s_xattr = erofs_xattr_handlers;
+#endif
+ /* set erofs default mount options */
+ erofs_default_options(sbi);
+
+ err = erofs_parse_options(sb, data);
+ if (err)
+ return err;
+
+ if (test_opt(sbi, POSIX_ACL))
+ sb->s_flags |= SB_POSIXACL;
+ else
+ sb->s_flags &= ~SB_POSIXACL;
+
+#ifdef CONFIG_EROFS_FS_ZIP
+ INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC);
+#endif
+
+ /* get the root inode */
+ inode = erofs_iget(sb, ROOT_NID(sbi), true);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (!S_ISDIR(inode->i_mode)) {
+ erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)",
+ ROOT_NID(sbi), inode->i_mode);
+ iput(inode);
+ return -EINVAL;
+ }
+
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
+ return -ENOMEM;
+
+ erofs_shrinker_register(sb);
+ /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */
+ err = erofs_init_managed_cache(sb);
+ if (err)
+ return err;
+
+ erofs_info(sb, "mounted with opts: %s, root inode @ nid %llu.",
+ (char *)data, ROOT_NID(sbi));
+ return 0;
+}
+
+static struct dentry *erofs_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
+{
+ return mount_bdev(fs_type, flags, dev_name, data, erofs_fill_super);
+}
+
+/*
+ * could be triggered after deactivate_locked_super()
+ * is called, thus including umount and failed to initialize.
+ */
+static void erofs_kill_sb(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi;
+
+ WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
+
+ kill_block_super(sb);
+
+ sbi = EROFS_SB(sb);
+ if (!sbi)
+ return;
+ kfree(sbi);
+ sb->s_fs_info = NULL;
+}
+
+/* called when ->s_root is non-NULL */
+static void erofs_put_super(struct super_block *sb)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+
+ DBG_BUGON(!sbi);
+
+ erofs_shrinker_unregister(sb);
+#ifdef CONFIG_EROFS_FS_ZIP
+ iput(sbi->managed_cache);
+ sbi->managed_cache = NULL;
+#endif
+}
+
+static struct file_system_type erofs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "erofs",
+ .mount = erofs_mount,
+ .kill_sb = erofs_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("erofs");
+
+static int __init erofs_module_init(void)
+{
+ int err;
+
+ erofs_check_ondisk_layout_definitions();
+
+ erofs_inode_cachep = kmem_cache_create("erofs_inode",
+ sizeof(struct erofs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT,
+ erofs_inode_init_once);
+ if (!erofs_inode_cachep) {
+ err = -ENOMEM;
+ goto icache_err;
+ }
+
+ err = erofs_init_shrinker();
+ if (err)
+ goto shrinker_err;
+
+ err = z_erofs_init_zip_subsystem();
+ if (err)
+ goto zip_err;
+
+ err = register_filesystem(&erofs_fs_type);
+ if (err)
+ goto fs_err;
+
+ return 0;
+
+fs_err:
+ z_erofs_exit_zip_subsystem();
+zip_err:
+ erofs_exit_shrinker();
+shrinker_err:
+ kmem_cache_destroy(erofs_inode_cachep);
+icache_err:
+ return err;
+}
+
+static void __exit erofs_module_exit(void)
+{
+ unregister_filesystem(&erofs_fs_type);
+ z_erofs_exit_zip_subsystem();
+ erofs_exit_shrinker();
+
+ /* Ensure all RCU free inodes are safe before cache is destroyed. */
+ rcu_barrier();
+ kmem_cache_destroy(erofs_inode_cachep);
+}
+
+/* get filesystem statistics */
+static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+
+ buf->f_type = sb->s_magic;
+ buf->f_bsize = EROFS_BLKSIZ;
+ buf->f_blocks = sbi->blocks;
+ buf->f_bfree = buf->f_bavail = 0;
+
+ buf->f_files = ULLONG_MAX;
+ buf->f_ffree = ULLONG_MAX - sbi->inos;
+
+ buf->f_namelen = EROFS_NAME_LEN;
+
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
+ return 0;
+}
+
+static int erofs_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
+
+#ifdef CONFIG_EROFS_FS_XATTR
+ if (test_opt(sbi, XATTR_USER))
+ seq_puts(seq, ",user_xattr");
+ else
+ seq_puts(seq, ",nouser_xattr");
+#endif
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+ if (test_opt(sbi, POSIX_ACL))
+ seq_puts(seq, ",acl");
+ else
+ seq_puts(seq, ",noacl");
+#endif
+#ifdef CONFIG_EROFS_FS_ZIP
+ if (sbi->cache_strategy == EROFS_ZIP_CACHE_DISABLED) {
+ seq_puts(seq, ",cache_strategy=disabled");
+ } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) {
+ seq_puts(seq, ",cache_strategy=readahead");
+ } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) {
+ seq_puts(seq, ",cache_strategy=readaround");
+ } else {
+ seq_puts(seq, ",cache_strategy=(unknown)");
+ DBG_BUGON(1);
+ }
+#endif
+ return 0;
+}
+
+static int erofs_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ unsigned int org_mnt_opt = sbi->mount_opt;
+ int err;
+
+ DBG_BUGON(!sb_rdonly(sb));
+ err = erofs_parse_options(sb, data);
+ if (err)
+ goto out;
+
+ if (test_opt(sbi, POSIX_ACL))
+ sb->s_flags |= SB_POSIXACL;
+ else
+ sb->s_flags &= ~SB_POSIXACL;
+
+ *flags |= SB_RDONLY;
+ return 0;
+out:
+ sbi->mount_opt = org_mnt_opt;
+ return err;
+}
+
+const struct super_operations erofs_sops = {
+ .put_super = erofs_put_super,
+ .alloc_inode = erofs_alloc_inode,
+ .free_inode = erofs_free_inode,
+ .statfs = erofs_statfs,
+ .show_options = erofs_show_options,
+ .remount_fs = erofs_remount,
+};
+
+module_init(erofs_module_init);
+module_exit(erofs_module_exit);
+
+MODULE_DESCRIPTION("Enhanced ROM File System");
+MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc.");
+MODULE_LICENSE("GPL");
+
diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h
new file mode 100644
index 000000000000..a72897c86744
--- /dev/null
+++ b/fs/erofs/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * A tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_TAGPTR_H
+#define __EROFS_FS_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the name of tagged pointer types are tagptr{1, 2, 3...}_t
+ * avoid directly using the internal structs __tagptr{1, 2, 3...}
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n { \
+ uintptr_t v; \
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+ __bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+ __bad_tagptr_type(void);
+
+/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n) \
+ __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+ (1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr) (\
+ __tagptr_mask_1(ptr, 1) ( \
+ __tagptr_mask_1(ptr, 2) ( \
+ __tagptr_mask_1(ptr, 3) ( \
+ __tagptr_mask_1(ptr, 4) ( \
+ __bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+ ((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+ const typeof(_tags) tags = (_tags); \
+ if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+ __bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+ ((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+ ((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagger pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+ typeof(_tptr1) tptr1 = (_tptr1); \
+ typeof(_tptr2) tptr2 = (_tptr2); \
+ (void)(&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+ typeof(_ptptr) ptptr = (_ptptr); \
+ typeof(_o) o = (_o); \
+ typeof(_n) n = (_n); \
+ (void)(&o == &n); \
+ (void)(&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+ typeof(_ptptr) ptptr = (_ptptr); \
+ *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+ typeof(_ptptr) ptptr = (_ptptr); \
+ const typeof(_tags) tags = (_tags); \
+ if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+ __bad_tagptr_tags(); \
+ ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+ typeof(_ptptr) ptptr = (_ptptr); \
+ const typeof(_tags) tags = (_tags); \
+ if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+ __bad_tagptr_tags(); \
+ ptptr->v &= ~tags; \
+*ptptr; })
+
+#endif /* __EROFS_FS_TAGPTR_H */
+
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 000000000000..d92b3e753a6f
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "internal.h"
+#include <linux/pagevec.h>
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail)
+{
+ struct page *page;
+
+ if (!list_empty(pool)) {
+ page = lru_to_page(pool);
+ DBG_BUGON(page_ref_count(page) != 1);
+ list_del(&page->lru);
+ } else {
+ page = alloc_pages(gfp | (nofail ? __GFP_NOFAIL : 0), 0);
+ }
+ return page;
+}
+
+#if (EROFS_PCPUBUF_NR_PAGES > 0)
+static struct {
+ u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES];
+} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS];
+
+void *erofs_get_pcpubuf(unsigned int pagenr)
+{
+ preempt_disable();
+ return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE];
+}
+#endif
+
+#ifdef CONFIG_EROFS_FS_ZIP
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
+#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount)
+#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount)
+
+static int erofs_workgroup_get(struct erofs_workgroup *grp)
+{
+ int o;
+
+repeat:
+ o = erofs_wait_on_workgroup_freezed(grp);
+ if (o <= 0)
+ return -1;
+
+ if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o)
+ goto repeat;
+
+ /* decrease refcount paired by erofs_workgroup_put */
+ if (o == 1)
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ return 0;
+}
+
+struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
+ pgoff_t index, bool *tag)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_workgroup *grp;
+
+repeat:
+ rcu_read_lock();
+ grp = radix_tree_lookup(&sbi->workstn_tree, index);
+ if (grp) {
+ *tag = xa_pointer_tag(grp);
+ grp = xa_untag_pointer(grp);
+
+ if (erofs_workgroup_get(grp)) {
+ /* prefer to relax rcu read side */
+ rcu_read_unlock();
+ goto repeat;
+ }
+
+ DBG_BUGON(index != grp->index);
+ }
+ rcu_read_unlock();
+ return grp;
+}
+
+int erofs_register_workgroup(struct super_block *sb,
+ struct erofs_workgroup *grp,
+ bool tag)
+{
+ struct erofs_sb_info *sbi;
+ int err;
+
+ /* grp shouldn't be broken or used before */
+ if (atomic_read(&grp->refcount) != 1) {
+ DBG_BUGON(1);
+ return -EINVAL;
+ }
+
+ err = radix_tree_preload(GFP_NOFS);
+ if (err)
+ return err;
+
+ sbi = EROFS_SB(sb);
+ xa_lock(&sbi->workstn_tree);
+
+ grp = xa_tag_pointer(grp, tag);
+
+ /*
+ * Bump up reference count before making this workgroup
+ * visible to other users in order to avoid potential UAF
+ * without serialized by workstn_lock.
+ */
+ __erofs_workgroup_get(grp);
+
+ err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp);
+ if (err)
+ /*
+ * it's safe to decrease since the workgroup isn't visible
+ * and refcount >= 2 (cannot be freezed).
+ */
+ __erofs_workgroup_put(grp);
+
+ xa_unlock(&sbi->workstn_tree);
+ radix_tree_preload_end();
+ return err;
+}
+
+static void __erofs_workgroup_free(struct erofs_workgroup *grp)
+{
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ erofs_workgroup_free_rcu(grp);
+}
+
+int erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+ int count = atomic_dec_return(&grp->refcount);
+
+ if (count == 1)
+ atomic_long_inc(&erofs_global_shrink_cnt);
+ else if (!count)
+ __erofs_workgroup_free(grp);
+ return count;
+}
+
+static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp)
+{
+ erofs_workgroup_unfreeze(grp, 0);
+ __erofs_workgroup_free(grp);
+}
+
+static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
+ struct erofs_workgroup *grp,
+ bool cleanup)
+{
+ /*
+ * If managed cache is on, refcount of workgroups
+ * themselves could be < 0 (freezed). In other words,
+ * there is no guarantee that all refcounts > 0.
+ */
+ if (!erofs_workgroup_try_to_freeze(grp, 1))
+ return false;
+
+ /*
+ * Note that all cached pages should be unattached
+ * before deleted from the radix tree. Otherwise some
+ * cached pages could be still attached to the orphan
+ * old workgroup when the new one is available in the tree.
+ */
+ if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
+ erofs_workgroup_unfreeze(grp, 1);
+ return false;
+ }
+
+ /*
+ * It's impossible to fail after the workgroup is freezed,
+ * however in order to avoid some race conditions, add a
+ * DBG_BUGON to observe this in advance.
+ */
+ DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree,
+ grp->index)) != grp);
+
+ /*
+ * If managed cache is on, last refcount should indicate
+ * the related workstation.
+ */
+ erofs_workgroup_unfreeze_final(grp);
+ return true;
+}
+
+static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+ unsigned long nr_shrink,
+ bool cleanup)
+{
+ pgoff_t first_index = 0;
+ void *batch[PAGEVEC_SIZE];
+ unsigned int freed = 0;
+
+ int i, found;
+repeat:
+ xa_lock(&sbi->workstn_tree);
+
+ found = radix_tree_gang_lookup(&sbi->workstn_tree,
+ batch, first_index, PAGEVEC_SIZE);
+
+ for (i = 0; i < found; ++i) {
+ struct erofs_workgroup *grp = xa_untag_pointer(batch[i]);
+
+ first_index = grp->index + 1;
+
+ /* try to shrink each valid workgroup */
+ if (!erofs_try_to_release_workgroup(sbi, grp, cleanup))
+ continue;
+
+ ++freed;
+ if (!--nr_shrink)
+ break;
+ }
+ xa_unlock(&sbi->workstn_tree);
+
+ if (i && nr_shrink)
+ goto repeat;
+ return freed;
+}
+
+/* protected by 'erofs_sb_list_lock' */
+static unsigned int shrinker_run_no;
+
+/* protects the mounted 'erofs_sb_list' */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+
+void erofs_shrinker_register(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ mutex_init(&sbi->umount_mutex);
+
+ spin_lock(&erofs_sb_list_lock);
+ list_add(&sbi->list, &erofs_sb_list);
+ spin_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_shrinker_unregister(struct super_block *sb)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+
+ mutex_lock(&sbi->umount_mutex);
+ erofs_shrink_workstation(sbi, ~0UL, true);
+
+ spin_lock(&erofs_sb_list_lock);
+ list_del(&sbi->list);
+ spin_unlock(&erofs_sb_list_lock);
+ mutex_unlock(&sbi->umount_mutex);
+}
+
+static unsigned long erofs_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ return atomic_long_read(&erofs_global_shrink_cnt);
+}
+
+static unsigned long erofs_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct erofs_sb_info *sbi;
+ struct list_head *p;
+
+ unsigned long nr = sc->nr_to_scan;
+ unsigned int run_no;
+ unsigned long freed = 0;
+
+ spin_lock(&erofs_sb_list_lock);
+ do {
+ run_no = ++shrinker_run_no;
+ } while (run_no == 0);
+
+ /* Iterate over all mounted superblocks and try to shrink them */
+ p = erofs_sb_list.next;
+ while (p != &erofs_sb_list) {
+ sbi = list_entry(p, struct erofs_sb_info, list);
+
+ /*
+ * We move the ones we do to the end of the list, so we stop
+ * when we see one we have already done.
+ */
+ if (sbi->shrinker_run_no == run_no)
+ break;
+
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+
+ spin_unlock(&erofs_sb_list_lock);
+ sbi->shrinker_run_no = run_no;
+
+ freed += erofs_shrink_workstation(sbi, nr, false);
+
+ spin_lock(&erofs_sb_list_lock);
+ /* Get the next list element before we move this one */
+ p = p->next;
+
+ /*
+ * Move this one to the end of the list to provide some
+ * fairness.
+ */
+ list_move_tail(&sbi->list, &erofs_sb_list);
+ mutex_unlock(&sbi->umount_mutex);
+
+ if (freed >= nr)
+ break;
+ }
+ spin_unlock(&erofs_sb_list_lock);
+ return freed;
+}
+
+static struct shrinker erofs_shrinker_info = {
+ .scan_objects = erofs_shrink_scan,
+ .count_objects = erofs_shrink_count,
+ .seeks = DEFAULT_SEEKS,
+};
+
+int __init erofs_init_shrinker(void)
+{
+ return register_shrinker(&erofs_shrinker_info);
+}
+
+void erofs_exit_shrinker(void)
+{
+ unregister_shrinker(&erofs_shrinker_info);
+}
+#endif /* !CONFIG_EROFS_FS_ZIP */
+
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
new file mode 100644
index 000000000000..a13a78725c57
--- /dev/null
+++ b/fs/erofs/xattr.c
@@ -0,0 +1,704 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include <linux/security.h>
+#include "xattr.h"
+
+struct xattr_iter {
+ struct super_block *sb;
+ struct page *page;
+ void *kaddr;
+
+ erofs_blk_t blkaddr;
+ unsigned int ofs;
+};
+
+static inline void xattr_iter_end(struct xattr_iter *it, bool atomic)
+{
+ /* the only user of kunmap() is 'init_inode_xattrs' */
+ if (!atomic)
+ kunmap(it->page);
+ else
+ kunmap_atomic(it->kaddr);
+
+ unlock_page(it->page);
+ put_page(it->page);
+}
+
+static inline void xattr_iter_end_final(struct xattr_iter *it)
+{
+ if (!it->page)
+ return;
+
+ xattr_iter_end(it, true);
+}
+
+static int init_inode_xattrs(struct inode *inode)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ struct xattr_iter it;
+ unsigned int i;
+ struct erofs_xattr_ibody_header *ih;
+ struct super_block *sb;
+ struct erofs_sb_info *sbi;
+ bool atomic_map;
+ int ret = 0;
+
+ /* the most case is that xattrs of this inode are initialized. */
+ if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags))
+ return 0;
+
+ if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE))
+ return -ERESTARTSYS;
+
+ /* someone has initialized xattrs for us? */
+ if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags))
+ goto out_unlock;
+
+ /*
+ * bypass all xattr operations if ->xattr_isize is not greater than
+ * sizeof(struct erofs_xattr_ibody_header), in detail:
+ * 1) it is not enough to contain erofs_xattr_ibody_header then
+ * ->xattr_isize should be 0 (it means no xattr);
+ * 2) it is just to contain erofs_xattr_ibody_header, which is on-disk
+ * undefined right now (maybe use later with some new sb feature).
+ */
+ if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) {
+ erofs_err(inode->i_sb,
+ "xattr_isize %d of nid %llu is not supported yet",
+ vi->xattr_isize, vi->nid);
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) {
+ if (vi->xattr_isize) {
+ erofs_err(inode->i_sb,
+ "bogus xattr ibody @ nid %llu", vi->nid);
+ DBG_BUGON(1);
+ ret = -EFSCORRUPTED;
+ goto out_unlock; /* xattr ondisk layout error */
+ }
+ ret = -ENOATTR;
+ goto out_unlock;
+ }
+
+ sb = inode->i_sb;
+ sbi = EROFS_SB(sb);
+ it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize);
+ it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize);
+
+ it.page = erofs_get_meta_page(sb, it.blkaddr);
+ if (IS_ERR(it.page)) {
+ ret = PTR_ERR(it.page);
+ goto out_unlock;
+ }
+
+ /* read in shared xattr array (non-atomic, see kmalloc below) */
+ it.kaddr = kmap(it.page);
+ atomic_map = false;
+
+ ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs);
+
+ vi->xattr_shared_count = ih->h_shared_count;
+ vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
+ sizeof(uint), GFP_KERNEL);
+ if (!vi->xattr_shared_xattrs) {
+ xattr_iter_end(&it, atomic_map);
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ /* let's skip ibody header */
+ it.ofs += sizeof(struct erofs_xattr_ibody_header);
+
+ for (i = 0; i < vi->xattr_shared_count; ++i) {
+ if (it.ofs >= EROFS_BLKSIZ) {
+ /* cannot be unaligned */
+ DBG_BUGON(it.ofs != EROFS_BLKSIZ);
+ xattr_iter_end(&it, atomic_map);
+
+ it.page = erofs_get_meta_page(sb, ++it.blkaddr);
+ if (IS_ERR(it.page)) {
+ kfree(vi->xattr_shared_xattrs);
+ vi->xattr_shared_xattrs = NULL;
+ ret = PTR_ERR(it.page);
+ goto out_unlock;
+ }
+
+ it.kaddr = kmap_atomic(it.page);
+ atomic_map = true;
+ it.ofs = 0;
+ }
+ vi->xattr_shared_xattrs[i] =
+ le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs));
+ it.ofs += sizeof(__le32);
+ }
+ xattr_iter_end(&it, atomic_map);
+
+ set_bit(EROFS_I_EA_INITED_BIT, &vi->flags);
+
+out_unlock:
+ clear_and_wake_up_bit(EROFS_I_BL_XATTR_BIT, &vi->flags);
+ return ret;
+}
+
+/*
+ * the general idea for these return values is
+ * if 0 is returned, go on processing the current xattr;
+ * 1 (> 0) is returned, skip this round to process the next xattr;
+ * -err (< 0) is returned, an error (maybe ENOXATTR) occurred
+ * and need to be handled
+ */
+struct xattr_iter_handlers {
+ int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry);
+ int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf,
+ unsigned int len);
+ int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz);
+ void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf,
+ unsigned int len);
+};
+
+static inline int xattr_iter_fixup(struct xattr_iter *it)
+{
+ if (it->ofs < EROFS_BLKSIZ)
+ return 0;
+
+ xattr_iter_end(it, true);
+
+ it->blkaddr += erofs_blknr(it->ofs);
+
+ it->page = erofs_get_meta_page(it->sb, it->blkaddr);
+ if (IS_ERR(it->page)) {
+ int err = PTR_ERR(it->page);
+
+ it->page = NULL;
+ return err;
+ }
+
+ it->kaddr = kmap_atomic(it->page);
+ it->ofs = erofs_blkoff(it->ofs);
+ return 0;
+}
+
+static int inline_xattr_iter_begin(struct xattr_iter *it,
+ struct inode *inode)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb);
+ unsigned int xattr_header_sz, inline_xattr_ofs;
+
+ xattr_header_sz = inlinexattr_header_size(inode);
+ if (xattr_header_sz >= vi->xattr_isize) {
+ DBG_BUGON(xattr_header_sz > vi->xattr_isize);
+ return -ENOATTR;
+ }
+
+ inline_xattr_ofs = vi->inode_isize + xattr_header_sz;
+
+ it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs);
+ it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
+
+ it->page = erofs_get_meta_page(inode->i_sb, it->blkaddr);
+ if (IS_ERR(it->page))
+ return PTR_ERR(it->page);
+
+ it->kaddr = kmap_atomic(it->page);
+ return vi->xattr_isize - xattr_header_sz;
+}
+
+/*
+ * Regardless of success or failure, `xattr_foreach' will end up with
+ * `ofs' pointing to the next xattr item rather than an arbitrary position.
+ */
+static int xattr_foreach(struct xattr_iter *it,
+ const struct xattr_iter_handlers *op,
+ unsigned int *tlimit)
+{
+ struct erofs_xattr_entry entry;
+ unsigned int value_sz, processed, slice;
+ int err;
+
+ /* 0. fixup blkaddr, ofs, ipage */
+ err = xattr_iter_fixup(it);
+ if (err)
+ return err;
+
+ /*
+ * 1. read xattr entry to the memory,
+ * since we do EROFS_XATTR_ALIGN
+ * therefore entry should be in the page
+ */
+ entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs);
+ if (tlimit) {
+ unsigned int entry_sz = erofs_xattr_entry_size(&entry);
+
+ /* xattr on-disk corruption: xattr entry beyond xattr_isize */
+ if (*tlimit < entry_sz) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ *tlimit -= entry_sz;
+ }
+
+ it->ofs += sizeof(struct erofs_xattr_entry);
+ value_sz = le16_to_cpu(entry.e_value_size);
+
+ /* handle entry */
+ err = op->entry(it, &entry);
+ if (err) {
+ it->ofs += entry.e_name_len + value_sz;
+ goto out;
+ }
+
+ /* 2. handle xattr name (ofs will finally be at the end of name) */
+ processed = 0;
+
+ while (processed < entry.e_name_len) {
+ if (it->ofs >= EROFS_BLKSIZ) {
+ DBG_BUGON(it->ofs > EROFS_BLKSIZ);
+
+ err = xattr_iter_fixup(it);
+ if (err)
+ goto out;
+ it->ofs = 0;
+ }
+
+ slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
+ entry.e_name_len - processed);
+
+ /* handle name */
+ err = op->name(it, processed, it->kaddr + it->ofs, slice);
+ if (err) {
+ it->ofs += entry.e_name_len - processed + value_sz;
+ goto out;
+ }
+
+ it->ofs += slice;
+ processed += slice;
+ }
+
+ /* 3. handle xattr value */
+ processed = 0;
+
+ if (op->alloc_buffer) {
+ err = op->alloc_buffer(it, value_sz);
+ if (err) {
+ it->ofs += value_sz;
+ goto out;
+ }
+ }
+
+ while (processed < value_sz) {
+ if (it->ofs >= EROFS_BLKSIZ) {
+ DBG_BUGON(it->ofs > EROFS_BLKSIZ);
+
+ err = xattr_iter_fixup(it);
+ if (err)
+ goto out;
+ it->ofs = 0;
+ }
+
+ slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
+ value_sz - processed);
+ op->value(it, processed, it->kaddr + it->ofs, slice);
+ it->ofs += slice;
+ processed += slice;
+ }
+
+out:
+ /* xattrs should be 4-byte aligned (on-disk constraint) */
+ it->ofs = EROFS_XATTR_ALIGN(it->ofs);
+ return err < 0 ? err : 0;
+}
+
+struct getxattr_iter {
+ struct xattr_iter it;
+
+ char *buffer;
+ int buffer_size, index;
+ struct qstr name;
+};
+
+static int xattr_entrymatch(struct xattr_iter *_it,
+ struct erofs_xattr_entry *entry)
+{
+ struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+
+ return (it->index != entry->e_name_index ||
+ it->name.len != entry->e_name_len) ? -ENOATTR : 0;
+}
+
+static int xattr_namematch(struct xattr_iter *_it,
+ unsigned int processed, char *buf, unsigned int len)
+{
+ struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+
+ return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0;
+}
+
+static int xattr_checkbuffer(struct xattr_iter *_it,
+ unsigned int value_sz)
+{
+ struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+ int err = it->buffer_size < value_sz ? -ERANGE : 0;
+
+ it->buffer_size = value_sz;
+ return !it->buffer ? 1 : err;
+}
+
+static void xattr_copyvalue(struct xattr_iter *_it,
+ unsigned int processed,
+ char *buf, unsigned int len)
+{
+ struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+
+ memcpy(it->buffer + processed, buf, len);
+}
+
+static const struct xattr_iter_handlers find_xattr_handlers = {
+ .entry = xattr_entrymatch,
+ .name = xattr_namematch,
+ .alloc_buffer = xattr_checkbuffer,
+ .value = xattr_copyvalue
+};
+
+static int inline_getxattr(struct inode *inode, struct getxattr_iter *it)
+{
+ int ret;
+ unsigned int remaining;
+
+ ret = inline_xattr_iter_begin(&it->it, inode);
+ if (ret < 0)
+ return ret;
+
+ remaining = ret;
+ while (remaining) {
+ ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining);
+ if (ret != -ENOATTR)
+ break;
+ }
+ xattr_iter_end_final(&it->it);
+
+ return ret ? ret : it->buffer_size;
+}
+
+static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ struct super_block *const sb = inode->i_sb;
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ unsigned int i;
+ int ret = -ENOATTR;
+
+ for (i = 0; i < vi->xattr_shared_count; ++i) {
+ erofs_blk_t blkaddr =
+ xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
+
+ it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
+
+ if (!i || blkaddr != it->it.blkaddr) {
+ if (i)
+ xattr_iter_end(&it->it, true);
+
+ it->it.page = erofs_get_meta_page(sb, blkaddr);
+ if (IS_ERR(it->it.page))
+ return PTR_ERR(it->it.page);
+
+ it->it.kaddr = kmap_atomic(it->it.page);
+ it->it.blkaddr = blkaddr;
+ }
+
+ ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL);
+ if (ret != -ENOATTR)
+ break;
+ }
+ if (vi->xattr_shared_count)
+ xattr_iter_end_final(&it->it);
+
+ return ret ? ret : it->buffer_size;
+}
+
+static bool erofs_xattr_user_list(struct dentry *dentry)
+{
+ return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER);
+}
+
+static bool erofs_xattr_trusted_list(struct dentry *dentry)
+{
+ return capable(CAP_SYS_ADMIN);
+}
+
+int erofs_getxattr(struct inode *inode, int index,
+ const char *name,
+ void *buffer, size_t buffer_size)
+{
+ int ret;
+ struct getxattr_iter it;
+
+ if (!name)
+ return -EINVAL;
+
+ ret = init_inode_xattrs(inode);
+ if (ret)
+ return ret;
+
+ it.index = index;
+
+ it.name.len = strlen(name);
+ if (it.name.len > EROFS_NAME_LEN)
+ return -ERANGE;
+ it.name.name = name;
+
+ it.buffer = buffer;
+ it.buffer_size = buffer_size;
+
+ it.it.sb = inode->i_sb;
+ ret = inline_getxattr(inode, &it);
+ if (ret == -ENOATTR)
+ ret = shared_getxattr(inode, &it);
+ return ret;
+}
+
+static int erofs_xattr_generic_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
+
+ switch (handler->flags) {
+ case EROFS_XATTR_INDEX_USER:
+ if (!test_opt(sbi, XATTR_USER))
+ return -EOPNOTSUPP;
+ break;
+ case EROFS_XATTR_INDEX_TRUSTED:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ break;
+ case EROFS_XATTR_INDEX_SECURITY:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return erofs_getxattr(inode, handler->flags, name, buffer, size);
+}
+
+const struct xattr_handler erofs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .flags = EROFS_XATTR_INDEX_USER,
+ .list = erofs_xattr_user_list,
+ .get = erofs_xattr_generic_get,
+};
+
+const struct xattr_handler erofs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .flags = EROFS_XATTR_INDEX_TRUSTED,
+ .list = erofs_xattr_trusted_list,
+ .get = erofs_xattr_generic_get,
+};
+
+#ifdef CONFIG_EROFS_FS_SECURITY
+const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .flags = EROFS_XATTR_INDEX_SECURITY,
+ .get = erofs_xattr_generic_get,
+};
+#endif
+
+const struct xattr_handler *erofs_xattr_handlers[] = {
+ &erofs_xattr_user_handler,
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &erofs_xattr_trusted_handler,
+#ifdef CONFIG_EROFS_FS_SECURITY
+ &erofs_xattr_security_handler,
+#endif
+ NULL,
+};
+
+struct listxattr_iter {
+ struct xattr_iter it;
+
+ struct dentry *dentry;
+ char *buffer;
+ int buffer_size, buffer_ofs;
+};
+
+static int xattr_entrylist(struct xattr_iter *_it,
+ struct erofs_xattr_entry *entry)
+{
+ struct listxattr_iter *it =
+ container_of(_it, struct listxattr_iter, it);
+ unsigned int prefix_len;
+ const char *prefix;
+
+ const struct xattr_handler *h =
+ erofs_xattr_handler(entry->e_name_index);
+
+ if (!h || (h->list && !h->list(it->dentry)))
+ return 1;
+
+ prefix = xattr_prefix(h);
+ prefix_len = strlen(prefix);
+
+ if (!it->buffer) {
+ it->buffer_ofs += prefix_len + entry->e_name_len + 1;
+ return 1;
+ }
+
+ if (it->buffer_ofs + prefix_len
+ + entry->e_name_len + 1 > it->buffer_size)
+ return -ERANGE;
+
+ memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len);
+ it->buffer_ofs += prefix_len;
+ return 0;
+}
+
+static int xattr_namelist(struct xattr_iter *_it,
+ unsigned int processed, char *buf, unsigned int len)
+{
+ struct listxattr_iter *it =
+ container_of(_it, struct listxattr_iter, it);
+
+ memcpy(it->buffer + it->buffer_ofs, buf, len);
+ it->buffer_ofs += len;
+ return 0;
+}
+
+static int xattr_skipvalue(struct xattr_iter *_it,
+ unsigned int value_sz)
+{
+ struct listxattr_iter *it =
+ container_of(_it, struct listxattr_iter, it);
+
+ it->buffer[it->buffer_ofs++] = '\0';
+ return 1;
+}
+
+static const struct xattr_iter_handlers list_xattr_handlers = {
+ .entry = xattr_entrylist,
+ .name = xattr_namelist,
+ .alloc_buffer = xattr_skipvalue,
+ .value = NULL
+};
+
+static int inline_listxattr(struct listxattr_iter *it)
+{
+ int ret;
+ unsigned int remaining;
+
+ ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry));
+ if (ret < 0)
+ return ret;
+
+ remaining = ret;
+ while (remaining) {
+ ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining);
+ if (ret)
+ break;
+ }
+ xattr_iter_end_final(&it->it);
+ return ret ? ret : it->buffer_ofs;
+}
+
+static int shared_listxattr(struct listxattr_iter *it)
+{
+ struct inode *const inode = d_inode(it->dentry);
+ struct erofs_inode *const vi = EROFS_I(inode);
+ struct super_block *const sb = inode->i_sb;
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ unsigned int i;
+ int ret = 0;
+
+ for (i = 0; i < vi->xattr_shared_count; ++i) {
+ erofs_blk_t blkaddr =
+ xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
+
+ it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
+ if (!i || blkaddr != it->it.blkaddr) {
+ if (i)
+ xattr_iter_end(&it->it, true);
+
+ it->it.page = erofs_get_meta_page(sb, blkaddr);
+ if (IS_ERR(it->it.page))
+ return PTR_ERR(it->it.page);
+
+ it->it.kaddr = kmap_atomic(it->it.page);
+ it->it.blkaddr = blkaddr;
+ }
+
+ ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL);
+ if (ret)
+ break;
+ }
+ if (vi->xattr_shared_count)
+ xattr_iter_end_final(&it->it);
+
+ return ret ? ret : it->buffer_ofs;
+}
+
+ssize_t erofs_listxattr(struct dentry *dentry,
+ char *buffer, size_t buffer_size)
+{
+ int ret;
+ struct listxattr_iter it;
+
+ ret = init_inode_xattrs(d_inode(dentry));
+ if (ret)
+ return ret;
+
+ it.dentry = dentry;
+ it.buffer = buffer;
+ it.buffer_size = buffer_size;
+ it.buffer_ofs = 0;
+
+ it.it.sb = dentry->d_sb;
+
+ ret = inline_listxattr(&it);
+ if (ret < 0 && ret != -ENOATTR)
+ return ret;
+ return shared_listxattr(&it);
+}
+
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+struct posix_acl *erofs_get_acl(struct inode *inode, int type)
+{
+ struct posix_acl *acl;
+ int prefix, rc;
+ char *value = NULL;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ rc = erofs_getxattr(inode, prefix, "", NULL, 0);
+ if (rc > 0) {
+ value = kmalloc(rc, GFP_KERNEL);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ rc = erofs_getxattr(inode, prefix, "", value, rc);
+ }
+
+ if (rc == -ENOATTR)
+ acl = NULL;
+ else if (rc < 0)
+ acl = ERR_PTR(rc);
+ else
+ acl = posix_acl_from_xattr(&init_user_ns, value, rc);
+ kfree(value);
+ return acl;
+}
+#endif
+
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
new file mode 100644
index 000000000000..3585b84d2f20
--- /dev/null
+++ b/fs/erofs/xattr.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_XATTR_H
+#define __EROFS_XATTR_H
+
+#include "internal.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+/* Attribute not found */
+#define ENOATTR ENODATA
+
+static inline unsigned int inlinexattr_header_size(struct inode *inode)
+{
+ return sizeof(struct erofs_xattr_ibody_header) +
+ sizeof(u32) * EROFS_I(inode)->xattr_shared_count;
+}
+
+static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi,
+ unsigned int xattr_id)
+{
+#ifdef CONFIG_EROFS_FS_XATTR
+ return sbi->xattr_blkaddr +
+ xattr_id * sizeof(__u32) / EROFS_BLKSIZ;
+#else
+ return 0;
+#endif
+}
+
+static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi,
+ unsigned int xattr_id)
+{
+ return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ;
+}
+
+#ifdef CONFIG_EROFS_FS_XATTR
+extern const struct xattr_handler erofs_xattr_user_handler;
+extern const struct xattr_handler erofs_xattr_trusted_handler;
+#ifdef CONFIG_EROFS_FS_SECURITY
+extern const struct xattr_handler erofs_xattr_security_handler;
+#endif
+
+static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx)
+{
+static const struct xattr_handler *xattr_handler_map[] = {
+ [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+ [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+ [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+ &posix_acl_default_xattr_handler,
+#endif
+ [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler,
+#ifdef CONFIG_EROFS_FS_SECURITY
+ [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler,
+#endif
+};
+
+ return idx && idx < ARRAY_SIZE(xattr_handler_map) ?
+ xattr_handler_map[idx] : NULL;
+}
+
+extern const struct xattr_handler *erofs_xattr_handlers[];
+
+int erofs_getxattr(struct inode *, int, const char *, void *, size_t);
+ssize_t erofs_listxattr(struct dentry *, char *, size_t);
+#else
+static inline int erofs_getxattr(struct inode *inode, int index,
+ const char *name, void *buffer,
+ size_t buffer_size)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline ssize_t erofs_listxattr(struct dentry *dentry,
+ char *buffer, size_t buffer_size)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* !CONFIG_EROFS_FS_XATTR */
+
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+struct posix_acl *erofs_get_acl(struct inode *inode, int type);
+#else
+#define erofs_get_acl (NULL)
+#endif
+
+#endif
+
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
new file mode 100644
index 000000000000..96e34c90f814
--- /dev/null
+++ b/fs/erofs/zdata.c
@@ -0,0 +1,1431 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "zdata.h"
+#include "compress.h"
+#include <linux/prefetch.h>
+
+#include <trace/events/erofs.h>
+
+/*
+ * a compressed_pages[] placeholder in order to avoid
+ * being filled with file pages for in-place decompression.
+ */
+#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D)
+
+/* how to allocate cached pages for a pcluster */
+enum z_erofs_cache_alloctype {
+ DONTALLOC, /* don't allocate any cached pages */
+ DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */
+};
+
+/*
+ * tagged pointer with 1-bit tag for all compressed pages
+ * tag 0 - the page is just found with an extra page reference
+ */
+typedef tagptr1_t compressed_page_t;
+
+#define tag_compressed_page_justfound(page) \
+ tagptr_fold(compressed_page_t, page, 1)
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *pcluster_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+ destroy_workqueue(z_erofs_workqueue);
+ kmem_cache_destroy(pcluster_cachep);
+}
+
+static inline int z_erofs_init_workqueue(void)
+{
+ const unsigned int onlinecpus = num_possible_cpus();
+ const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE;
+
+ /*
+ * no need to spawn too many threads, limiting threads could minimum
+ * scheduling overhead, perhaps per-CPU threads should be better?
+ */
+ z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags,
+ onlinecpus + onlinecpus / 4);
+ return z_erofs_workqueue ? 0 : -ENOMEM;
+}
+
+static void z_erofs_pcluster_init_once(void *ptr)
+{
+ struct z_erofs_pcluster *pcl = ptr;
+ struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
+ unsigned int i;
+
+ mutex_init(&cl->lock);
+ cl->nr_pages = 0;
+ cl->vcnt = 0;
+ for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
+ pcl->compressed_pages[i] = NULL;
+}
+
+static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl)
+{
+ struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
+
+ atomic_set(&pcl->obj.refcount, 1);
+
+ DBG_BUGON(cl->nr_pages);
+ DBG_BUGON(cl->vcnt);
+}
+
+int __init z_erofs_init_zip_subsystem(void)
+{
+ pcluster_cachep = kmem_cache_create("erofs_compress",
+ Z_EROFS_WORKGROUP_SIZE, 0,
+ SLAB_RECLAIM_ACCOUNT,
+ z_erofs_pcluster_init_once);
+ if (pcluster_cachep) {
+ if (!z_erofs_init_workqueue())
+ return 0;
+
+ kmem_cache_destroy(pcluster_cachep);
+ }
+ return -ENOMEM;
+}
+
+enum z_erofs_collectmode {
+ COLLECT_SECONDARY,
+ COLLECT_PRIMARY,
+ /*
+ * The current collection was the tail of an exist chain, in addition
+ * that the previous processed chained collections are all decided to
+ * be hooked up to it.
+ * A new chain will be created for the remaining collections which are
+ * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED,
+ * the next collection cannot reuse the whole page safely in
+ * the following scenario:
+ * ________________________________________________________________
+ * | tail (partial) page | head (partial) page |
+ * | (belongs to the next cl) | (belongs to the current cl) |
+ * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
+ */
+ COLLECT_PRIMARY_HOOKED,
+ COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
+ /*
+ * The current collection has been linked with the owned chain, and
+ * could also be linked with the remaining collections, which means
+ * if the processing page is the tail page of the collection, thus
+ * the current collection can safely use the whole page (since
+ * the previous collection is under control) for in-place I/O, as
+ * illustrated below:
+ * ________________________________________________________________
+ * | tail (partial) page | head (partial) page |
+ * | (of the current cl) | (of the previous collection) |
+ * | PRIMARY_FOLLOWED or | |
+ * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
+ *
+ * [ (*) the above page can be used as inplace I/O. ]
+ */
+ COLLECT_PRIMARY_FOLLOWED,
+};
+
+struct z_erofs_collector {
+ struct z_erofs_pagevec_ctor vector;
+
+ struct z_erofs_pcluster *pcl, *tailpcl;
+ struct z_erofs_collection *cl;
+ struct page **compressedpages;
+ z_erofs_next_pcluster_t owned_head;
+
+ enum z_erofs_collectmode mode;
+};
+
+struct z_erofs_decompress_frontend {
+ struct inode *const inode;
+
+ struct z_erofs_collector clt;
+ struct erofs_map_blocks map;
+
+ /* used for applying cache strategy on the fly */
+ bool backmost;
+ erofs_off_t headoffset;
+};
+
+#define COLLECTOR_INIT() { \
+ .owned_head = Z_EROFS_PCLUSTER_TAIL, \
+ .mode = COLLECT_PRIMARY_FOLLOWED }
+
+#define DECOMPRESS_FRONTEND_INIT(__i) { \
+ .inode = __i, .clt = COLLECTOR_INIT(), \
+ .backmost = true, }
+
+static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static void preload_compressed_pages(struct z_erofs_collector *clt,
+ struct address_space *mc,
+ enum z_erofs_cache_alloctype type,
+ struct list_head *pagepool)
+{
+ const struct z_erofs_pcluster *pcl = clt->pcl;
+ const unsigned int clusterpages = BIT(pcl->clusterbits);
+ struct page **pages = clt->compressedpages;
+ pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
+ bool standalone = true;
+
+ if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
+ return;
+
+ for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
+ struct page *page;
+ compressed_page_t t;
+
+ /* the compressed page was loaded before */
+ if (READ_ONCE(*pages))
+ continue;
+
+ page = find_get_page(mc, index);
+
+ if (page) {
+ t = tag_compressed_page_justfound(page);
+ } else if (type == DELAYEDALLOC) {
+ t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
+ } else { /* DONTALLOC */
+ if (standalone)
+ clt->compressedpages = pages;
+ standalone = false;
+ continue;
+ }
+
+ if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
+ continue;
+
+ if (page)
+ put_page(page);
+ }
+
+ if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
+ clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+}
+
+/* called by erofs_shrinker to get rid of all compressed_pages */
+int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
+ struct erofs_workgroup *grp)
+{
+ struct z_erofs_pcluster *const pcl =
+ container_of(grp, struct z_erofs_pcluster, obj);
+ struct address_space *const mapping = MNGD_MAPPING(sbi);
+ const unsigned int clusterpages = BIT(pcl->clusterbits);
+ int i;
+
+ /*
+ * refcount of workgroup is now freezed as 1,
+ * therefore no need to worry about available decompression users.
+ */
+ for (i = 0; i < clusterpages; ++i) {
+ struct page *page = pcl->compressed_pages[i];
+
+ if (!page)
+ continue;
+
+ /* block other users from reclaiming or migrating the page */
+ if (!trylock_page(page))
+ return -EBUSY;
+
+ if (page->mapping != mapping)
+ continue;
+
+ /* barrier is implied in the following 'unlock_page' */
+ WRITE_ONCE(pcl->compressed_pages[i], NULL);
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+
+ unlock_page(page);
+ put_page(page);
+ }
+ return 0;
+}
+
+int erofs_try_to_free_cached_page(struct address_space *mapping,
+ struct page *page)
+{
+ struct z_erofs_pcluster *const pcl = (void *)page_private(page);
+ const unsigned int clusterpages = BIT(pcl->clusterbits);
+ int ret = 0; /* 0 - busy */
+
+ if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
+ unsigned int i;
+
+ for (i = 0; i < clusterpages; ++i) {
+ if (pcl->compressed_pages[i] == page) {
+ WRITE_ONCE(pcl->compressed_pages[i], NULL);
+ ret = 1;
+ break;
+ }
+ }
+ erofs_workgroup_unfreeze(&pcl->obj, 1);
+
+ if (ret) {
+ ClearPagePrivate(page);
+ put_page(page);
+ }
+ }
+ return ret;
+}
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
+ struct page *page)
+{
+ struct z_erofs_pcluster *const pcl = clt->pcl;
+ const unsigned int clusterpages = BIT(pcl->clusterbits);
+
+ while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
+ if (!cmpxchg(clt->compressedpages++, NULL, page))
+ return true;
+ }
+ return false;
+}
+
+/* callers must be with collection lock held */
+static int z_erofs_attach_page(struct z_erofs_collector *clt,
+ struct page *page,
+ enum z_erofs_page_type type)
+{
+ int ret;
+ bool occupied;
+
+ /* give priority for inplaceio */
+ if (clt->mode >= COLLECT_PRIMARY &&
+ type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+ z_erofs_try_inplace_io(clt, page))
+ return 0;
+
+ ret = z_erofs_pagevec_enqueue(&clt->vector,
+ page, type, &occupied);
+ clt->cl->vcnt += (unsigned int)ret;
+
+ return ret ? 0 : -EAGAIN;
+}
+
+static enum z_erofs_collectmode
+try_to_claim_pcluster(struct z_erofs_pcluster *pcl,
+ z_erofs_next_pcluster_t *owned_head)
+{
+ /* let's claim these following types of pclusters */
+retry:
+ if (pcl->next == Z_EROFS_PCLUSTER_NIL) {
+ /* type 1, nil pcluster */
+ if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
+ *owned_head) != Z_EROFS_PCLUSTER_NIL)
+ goto retry;
+
+ *owned_head = &pcl->next;
+ /* lucky, I am the followee :) */
+ return COLLECT_PRIMARY_FOLLOWED;
+ } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) {
+ /*
+ * type 2, link to the end of a existing open chain,
+ * be careful that its submission itself is governed
+ * by the original owned chain.
+ */
+ if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
+ *owned_head) != Z_EROFS_PCLUSTER_TAIL)
+ goto retry;
+ *owned_head = Z_EROFS_PCLUSTER_TAIL;
+ return COLLECT_PRIMARY_HOOKED;
+ }
+ return COLLECT_PRIMARY; /* :( better luck next time */
+}
+
+static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
+ struct inode *inode,
+ struct erofs_map_blocks *map)
+{
+ struct erofs_workgroup *grp;
+ struct z_erofs_pcluster *pcl;
+ struct z_erofs_collection *cl;
+ unsigned int length;
+ bool tag;
+
+ grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag);
+ if (!grp)
+ return NULL;
+
+ pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
+ DBG_BUGON(1);
+ erofs_workgroup_put(grp);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
+ cl = z_erofs_primarycollection(pcl);
+ if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
+ DBG_BUGON(1);
+ erofs_workgroup_put(grp);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
+ length = READ_ONCE(pcl->length);
+ if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
+ if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
+ DBG_BUGON(1);
+ erofs_workgroup_put(grp);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ } else {
+ unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;
+
+ if (map->m_flags & EROFS_MAP_FULL_MAPPED)
+ llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;
+
+ while (llen > length &&
+ length != cmpxchg_relaxed(&pcl->length, length, llen)) {
+ cpu_relax();
+ length = READ_ONCE(pcl->length);
+ }
+ }
+ mutex_lock(&cl->lock);
+ /* used to check tail merging loop due to corrupted images */
+ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ clt->tailpcl = pcl;
+ clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head);
+ /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */
+ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ clt->tailpcl = NULL;
+ clt->pcl = pcl;
+ clt->cl = cl;
+ return cl;
+}
+
+static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
+ struct inode *inode,
+ struct erofs_map_blocks *map)
+{
+ struct z_erofs_pcluster *pcl;
+ struct z_erofs_collection *cl;
+ int err;
+
+ /* no available workgroup, let's allocate one */
+ pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
+ if (!pcl)
+ return ERR_PTR(-ENOMEM);
+
+ z_erofs_pcluster_init_always(pcl);
+ pcl->obj.index = map->m_pa >> PAGE_SHIFT;
+
+ pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
+ (map->m_flags & EROFS_MAP_FULL_MAPPED ?
+ Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
+
+ if (map->m_flags & EROFS_MAP_ZIPPED)
+ pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
+ else
+ pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+
+ pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0];
+ pcl->clusterbits -= PAGE_SHIFT;
+
+ /* new pclusters should be claimed as type 1, primary and followed */
+ pcl->next = clt->owned_head;
+ clt->mode = COLLECT_PRIMARY_FOLLOWED;
+
+ cl = z_erofs_primarycollection(pcl);
+ cl->pageofs = map->m_la & ~PAGE_MASK;
+
+ /*
+ * lock all primary followed works before visible to others
+ * and mutex_trylock *never* fails for a new pcluster.
+ */
+ mutex_trylock(&cl->lock);
+
+ err = erofs_register_workgroup(inode->i_sb, &pcl->obj, 0);
+ if (err) {
+ mutex_unlock(&cl->lock);
+ kmem_cache_free(pcluster_cachep, pcl);
+ return ERR_PTR(-EAGAIN);
+ }
+ /* used to check tail merging loop due to corrupted images */
+ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ clt->tailpcl = pcl;
+ clt->owned_head = &pcl->next;
+ clt->pcl = pcl;
+ clt->cl = cl;
+ return cl;
+}
+
+static int z_erofs_collector_begin(struct z_erofs_collector *clt,
+ struct inode *inode,
+ struct erofs_map_blocks *map)
+{
+ struct z_erofs_collection *cl;
+
+ DBG_BUGON(clt->cl);
+
+ /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
+ DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
+ DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+
+ if (!PAGE_ALIGNED(map->m_pa)) {
+ DBG_BUGON(1);
+ return -EINVAL;
+ }
+
+repeat:
+ cl = cllookup(clt, inode, map);
+ if (!cl) {
+ cl = clregister(clt, inode, map);
+
+ if (cl == ERR_PTR(-EAGAIN))
+ goto repeat;
+ }
+
+ if (IS_ERR(cl))
+ return PTR_ERR(cl);
+
+ z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
+ cl->pagevec, cl->vcnt);
+
+ clt->compressedpages = clt->pcl->compressed_pages;
+ if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
+ clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES;
+ return 0;
+}
+
+/*
+ * keep in mind that no referenced pclusters will be freed
+ * only after a RCU grace period.
+ */
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+ struct z_erofs_collection *const cl =
+ container_of(head, struct z_erofs_collection, rcu);
+
+ kmem_cache_free(pcluster_cachep,
+ container_of(cl, struct z_erofs_pcluster,
+ primary_collection));
+}
+
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+{
+ struct z_erofs_pcluster *const pcl =
+ container_of(grp, struct z_erofs_pcluster, obj);
+ struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl);
+
+ call_rcu(&cl->rcu, z_erofs_rcu_callback);
+}
+
+static void z_erofs_collection_put(struct z_erofs_collection *cl)
+{
+ struct z_erofs_pcluster *const pcl =
+ container_of(cl, struct z_erofs_pcluster, primary_collection);
+
+ erofs_workgroup_put(&pcl->obj);
+}
+
+static bool z_erofs_collector_end(struct z_erofs_collector *clt)
+{
+ struct z_erofs_collection *cl = clt->cl;
+
+ if (!cl)
+ return false;
+
+ z_erofs_pagevec_ctor_exit(&clt->vector, false);
+ mutex_unlock(&cl->lock);
+
+ /*
+ * if all pending pages are added, don't hold its reference
+ * any longer if the pcluster isn't hosted by ourselves.
+ */
+ if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
+ z_erofs_collection_put(cl);
+
+ clt->cl = NULL;
+ return true;
+}
+
+static inline struct page *__stagingpage_alloc(struct list_head *pagepool,
+ gfp_t gfp)
+{
+ struct page *page = erofs_allocpage(pagepool, gfp, true);
+
+ page->mapping = Z_EROFS_MAPPING_STAGING;
+ return page;
+}
+
+static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
+ unsigned int cachestrategy,
+ erofs_off_t la)
+{
+ if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
+ return false;
+
+ if (fe->backmost)
+ return true;
+
+ return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
+ la < fe->headoffset;
+}
+
+static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
+ struct page *page,
+ struct list_head *pagepool)
+{
+ struct inode *const inode = fe->inode;
+ struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode);
+ struct erofs_map_blocks *const map = &fe->map;
+ struct z_erofs_collector *const clt = &fe->clt;
+ const loff_t offset = page_offset(page);
+ bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED);
+
+ enum z_erofs_cache_alloctype cache_strategy;
+ enum z_erofs_page_type page_type;
+ unsigned int cur, end, spiltted, index;
+ int err = 0;
+
+ /* register locked file pages as online pages in pack */
+ z_erofs_onlinepage_init(page);
+
+ spiltted = 0;
+ end = PAGE_SIZE;
+repeat:
+ cur = end - 1;
+
+ /* lucky, within the range of the current map_blocks */
+ if (offset + cur >= map->m_la &&
+ offset + cur < map->m_la + map->m_llen) {
+ /* didn't get a valid collection previously (very rare) */
+ if (!clt->cl)
+ goto restart_now;
+ goto hitted;
+ }
+
+ /* go ahead the next map_blocks */
+ erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+ if (z_erofs_collector_end(clt))
+ fe->backmost = false;
+
+ map->m_la = offset + cur;
+ map->m_llen = 0;
+ err = z_erofs_map_blocks_iter(inode, map, 0);
+ if (err)
+ goto err_out;
+
+restart_now:
+ if (!(map->m_flags & EROFS_MAP_MAPPED))
+ goto hitted;
+
+ err = z_erofs_collector_begin(clt, inode, map);
+ if (err)
+ goto err_out;
+
+ /* preload all compressed pages (maybe downgrade role if necessary) */
+ if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la))
+ cache_strategy = DELAYEDALLOC;
+ else
+ cache_strategy = DONTALLOC;
+
+ preload_compressed_pages(clt, MNGD_MAPPING(sbi),
+ cache_strategy, pagepool);
+
+ tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED);
+hitted:
+ cur = end - min_t(unsigned int, offset + end - map->m_la, end);
+ if (!(map->m_flags & EROFS_MAP_MAPPED)) {
+ zero_user_segment(page, cur, end);
+ goto next_part;
+ }
+
+ /* let's derive page type */
+ page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+ (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+ (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+ Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+ if (cur)
+ tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
+
+retry:
+ err = z_erofs_attach_page(clt, page, page_type);
+ /* should allocate an additional staging page for pagevec */
+ if (err == -EAGAIN) {
+ struct page *const newpage =
+ __stagingpage_alloc(pagepool, GFP_NOFS);
+
+ err = z_erofs_attach_page(clt, newpage,
+ Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+ if (!err)
+ goto retry;
+ }
+
+ if (err)
+ goto err_out;
+
+ index = page->index - (map->m_la >> PAGE_SHIFT);
+
+ z_erofs_onlinepage_fixup(page, index, true);
+
+ /* bump up the number of spiltted parts of a page */
+ ++spiltted;
+ /* also update nr_pages */
+ clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
+next_part:
+ /* can be used for verification */
+ map->m_llen = offset + cur - map->m_la;
+
+ end = cur;
+ if (end > 0)
+ goto repeat;
+
+out:
+ z_erofs_onlinepage_endio(page);
+
+ erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+ __func__, page, spiltted, map->m_llen);
+ return err;
+
+ /* if some error occurred while processing this page */
+err_out:
+ SetPageError(page);
+ goto out;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+ tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+ struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t);
+ bool background = tagptr_unfold_tags(t);
+
+ if (!background) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&io->u.wait.lock, flags);
+ if (!atomic_add_return(bios, &io->pending_bios))
+ wake_up_locked(&io->u.wait);
+ spin_unlock_irqrestore(&io->u.wait.lock, flags);
+ return;
+ }
+
+ if (!atomic_add_return(bios, &io->pending_bios))
+ queue_work(z_erofs_workqueue, &io->u.work);
+}
+
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+{
+ struct erofs_sb_info *sbi = NULL;
+ blk_status_t err = bio->bi_status;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+ bool cachemngd = false;
+
+ DBG_BUGON(PageUptodate(page));
+ DBG_BUGON(!page->mapping);
+
+ if (!sbi && !z_erofs_page_is_staging(page))
+ sbi = EROFS_SB(page->mapping->host->i_sb);
+
+ /* sbi should already be gotten if the page is managed */
+ if (sbi)
+ cachemngd = erofs_page_is_managed(sbi, page);
+
+ if (err)
+ SetPageError(page);
+ else if (cachemngd)
+ SetPageUptodate(page);
+
+ if (cachemngd)
+ unlock_page(page);
+ }
+
+ z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+ bio_put(bio);
+}
+
+static int z_erofs_decompress_pcluster(struct super_block *sb,
+ struct z_erofs_pcluster *pcl,
+ struct list_head *pagepool)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ const unsigned int clusterpages = BIT(pcl->clusterbits);
+ struct z_erofs_pagevec_ctor ctor;
+ unsigned int i, outputsize, llen, nr_pages;
+ struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
+ struct page **pages, **compressed_pages, *page;
+
+ enum z_erofs_page_type page_type;
+ bool overlapped, partial;
+ struct z_erofs_collection *cl;
+ int err;
+
+ might_sleep();
+ cl = z_erofs_primarycollection(pcl);
+ DBG_BUGON(!READ_ONCE(cl->nr_pages));
+
+ mutex_lock(&cl->lock);
+ nr_pages = cl->nr_pages;
+
+ if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
+ pages = pages_onstack;
+ } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
+ mutex_trylock(&z_pagemap_global_lock)) {
+ pages = z_pagemap_global;
+ } else {
+ gfp_t gfp_flags = GFP_KERNEL;
+
+ if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
+ gfp_flags |= __GFP_NOFAIL;
+
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *),
+ gfp_flags);
+
+ /* fallback to global pagemap for the lowmem scenario */
+ if (!pages) {
+ mutex_lock(&z_pagemap_global_lock);
+ pages = z_pagemap_global;
+ }
+ }
+
+ for (i = 0; i < nr_pages; ++i)
+ pages[i] = NULL;
+
+ err = 0;
+ z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
+ cl->pagevec, 0);
+
+ for (i = 0; i < cl->vcnt; ++i) {
+ unsigned int pagenr;
+
+ page = z_erofs_pagevec_dequeue(&ctor, &page_type);
+
+ /* all pages in pagevec ought to be valid */
+ DBG_BUGON(!page);
+ DBG_BUGON(!page->mapping);
+
+ if (z_erofs_put_stagingpage(pagepool, page))
+ continue;
+
+ if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+ pagenr = 0;
+ else
+ pagenr = z_erofs_onlinepage_index(page);
+
+ DBG_BUGON(pagenr >= nr_pages);
+
+ /*
+ * currently EROFS doesn't support multiref(dedup),
+ * so here erroring out one multiref page.
+ */
+ if (pages[pagenr]) {
+ DBG_BUGON(1);
+ SetPageError(pages[pagenr]);
+ z_erofs_onlinepage_endio(pages[pagenr]);
+ err = -EFSCORRUPTED;
+ }
+ pages[pagenr] = page;
+ }
+ z_erofs_pagevec_ctor_exit(&ctor, true);
+
+ overlapped = false;
+ compressed_pages = pcl->compressed_pages;
+
+ for (i = 0; i < clusterpages; ++i) {
+ unsigned int pagenr;
+
+ page = compressed_pages[i];
+
+ /* all compressed pages ought to be valid */
+ DBG_BUGON(!page);
+ DBG_BUGON(!page->mapping);
+
+ if (!z_erofs_page_is_staging(page)) {
+ if (erofs_page_is_managed(sbi, page)) {
+ if (!PageUptodate(page))
+ err = -EIO;
+ continue;
+ }
+
+ /*
+ * only if non-head page can be selected
+ * for inplace decompression
+ */
+ pagenr = z_erofs_onlinepage_index(page);
+
+ DBG_BUGON(pagenr >= nr_pages);
+ if (pages[pagenr]) {
+ DBG_BUGON(1);
+ SetPageError(pages[pagenr]);
+ z_erofs_onlinepage_endio(pages[pagenr]);
+ err = -EFSCORRUPTED;
+ }
+ pages[pagenr] = page;
+
+ overlapped = true;
+ }
+
+ /* PG_error needs checking for inplaced and staging pages */
+ if (PageError(page)) {
+ DBG_BUGON(PageUptodate(page));
+ err = -EIO;
+ }
+ }
+
+ if (err)
+ goto out;
+
+ llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
+ if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) {
+ outputsize = llen;
+ partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
+ } else {
+ outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs;
+ partial = true;
+ }
+
+ err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
+ .sb = sb,
+ .in = compressed_pages,
+ .out = pages,
+ .pageofs_out = cl->pageofs,
+ .inputsize = PAGE_SIZE,
+ .outputsize = outputsize,
+ .alg = pcl->algorithmformat,
+ .inplace_io = overlapped,
+ .partial_decoding = partial
+ }, pagepool);
+
+out:
+ /* must handle all compressed pages before endding pages */
+ for (i = 0; i < clusterpages; ++i) {
+ page = compressed_pages[i];
+
+ if (erofs_page_is_managed(sbi, page))
+ continue;
+
+ /* recycle all individual staging pages */
+ (void)z_erofs_put_stagingpage(pagepool, page);
+
+ WRITE_ONCE(compressed_pages[i], NULL);
+ }
+
+ for (i = 0; i < nr_pages; ++i) {
+ page = pages[i];
+ if (!page)
+ continue;
+
+ DBG_BUGON(!page->mapping);
+
+ /* recycle all individual staging pages */
+ if (z_erofs_put_stagingpage(pagepool, page))
+ continue;
+
+ if (err < 0)
+ SetPageError(page);
+
+ z_erofs_onlinepage_endio(page);
+ }
+
+ if (pages == z_pagemap_global)
+ mutex_unlock(&z_pagemap_global_lock);
+ else if (pages != pages_onstack)
+ kvfree(pages);
+
+ cl->nr_pages = 0;
+ cl->vcnt = 0;
+
+ /* all cl locks MUST be taken before the following line */
+ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
+
+ /* all cl locks SHOULD be released right now */
+ mutex_unlock(&cl->lock);
+
+ z_erofs_collection_put(cl);
+ return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+ struct z_erofs_unzip_io *io,
+ struct list_head *pagepool)
+{
+ z_erofs_next_pcluster_t owned = io->head;
+
+ while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
+ struct z_erofs_pcluster *pcl;
+
+ /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
+ DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
+
+ /* no possible that 'owned' equals NULL */
+ DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
+
+ pcl = container_of(owned, struct z_erofs_pcluster, next);
+ owned = READ_ONCE(pcl->next);
+
+ z_erofs_decompress_pcluster(sb, pcl, pagepool);
+ }
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+ struct z_erofs_unzip_io_sb *iosb =
+ container_of(work, struct z_erofs_unzip_io_sb, io.u.work);
+ LIST_HEAD(pagepool);
+
+ DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool);
+
+ put_pages_list(&pagepool);
+ kvfree(iosb);
+}
+
+static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
+ unsigned int nr,
+ struct list_head *pagepool,
+ struct address_space *mc,
+ gfp_t gfp)
+{
+ /* determined at compile time to avoid too many #ifdefs */
+ const bool nocache = __builtin_constant_p(mc) ? !mc : false;
+ const pgoff_t index = pcl->obj.index;
+ bool tocache = false;
+
+ struct address_space *mapping;
+ struct page *oldpage, *page;
+
+ compressed_page_t t;
+ int justfound;
+
+repeat:
+ page = READ_ONCE(pcl->compressed_pages[nr]);
+ oldpage = page;
+
+ if (!page)
+ goto out_allocpage;
+
+ /*
+ * the cached page has not been allocated and
+ * an placeholder is out there, prepare it now.
+ */
+ if (!nocache && page == PAGE_UNALLOCATED) {
+ tocache = true;
+ goto out_allocpage;
+ }
+
+ /* process the target tagged pointer */
+ t = tagptr_init(compressed_page_t, page);
+ justfound = tagptr_unfold_tags(t);
+ page = tagptr_unfold_ptr(t);
+
+ mapping = READ_ONCE(page->mapping);
+
+ /*
+ * if managed cache is disabled, it's no way to
+ * get such a cached-like page.
+ */
+ if (nocache) {
+ /* if managed cache is disabled, it is impossible `justfound' */
+ DBG_BUGON(justfound);
+
+ /* and it should be locked, not uptodate, and not truncated */
+ DBG_BUGON(!PageLocked(page));
+ DBG_BUGON(PageUptodate(page));
+ DBG_BUGON(!mapping);
+ goto out;
+ }
+
+ /*
+ * unmanaged (file) pages are all locked solidly,
+ * therefore it is impossible for `mapping' to be NULL.
+ */
+ if (mapping && mapping != mc)
+ /* ought to be unmanaged pages */
+ goto out;
+
+ lock_page(page);
+
+ /* only true if page reclaim goes wrong, should never happen */
+ DBG_BUGON(justfound && PagePrivate(page));
+
+ /* the page is still in manage cache */
+ if (page->mapping == mc) {
+ WRITE_ONCE(pcl->compressed_pages[nr], page);
+
+ ClearPageError(page);
+ if (!PagePrivate(page)) {
+ /*
+ * impossible to be !PagePrivate(page) for
+ * the current restriction as well if
+ * the page is already in compressed_pages[].
+ */
+ DBG_BUGON(!justfound);
+
+ justfound = 0;
+ set_page_private(page, (unsigned long)pcl);
+ SetPagePrivate(page);
+ }
+
+ /* no need to submit io if it is already up-to-date */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ page = NULL;
+ }
+ goto out;
+ }
+
+ /*
+ * the managed page has been truncated, it's unsafe to
+ * reuse this one, let's allocate a new cache-managed page.
+ */
+ DBG_BUGON(page->mapping);
+ DBG_BUGON(!justfound);
+
+ tocache = true;
+ unlock_page(page);
+ put_page(page);
+out_allocpage:
+ page = __stagingpage_alloc(pagepool, gfp);
+ if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
+ list_add(&page->lru, pagepool);
+ cpu_relax();
+ goto repeat;
+ }
+ if (nocache || !tocache)
+ goto out;
+ if (add_to_page_cache_lru(page, mc, index + nr, gfp)) {
+ page->mapping = Z_EROFS_MAPPING_STAGING;
+ goto out;
+ }
+
+ set_page_private(page, (unsigned long)pcl);
+ SetPagePrivate(page);
+out: /* the only exit (for tracing and debugging) */
+ return page;
+}
+
+static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb,
+ struct z_erofs_unzip_io *io,
+ bool foreground)
+{
+ struct z_erofs_unzip_io_sb *iosb;
+
+ if (foreground) {
+ /* waitqueue available for foreground io */
+ DBG_BUGON(!io);
+
+ init_waitqueue_head(&io->u.wait);
+ atomic_set(&io->pending_bios, 0);
+ goto out;
+ }
+
+ iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL);
+ DBG_BUGON(!iosb);
+
+ /* initialize fields in the allocated descriptor */
+ io = &iosb->io;
+ iosb->sb = sb;
+ INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+out:
+ io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+ return io;
+}
+
+/* define decompression jobqueue types */
+enum {
+ JQ_BYPASS,
+ JQ_SUBMIT,
+ NR_JOBQUEUES,
+};
+
+static void *jobqueueset_init(struct super_block *sb,
+ z_erofs_next_pcluster_t qtail[],
+ struct z_erofs_unzip_io *q[],
+ struct z_erofs_unzip_io *fgq,
+ bool forcefg)
+{
+ /*
+ * if managed cache is enabled, bypass jobqueue is needed,
+ * no need to read from device for all pclusters in this queue.
+ */
+ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true);
+ qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
+
+ q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg);
+ qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
+
+ return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg));
+}
+
+static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
+ z_erofs_next_pcluster_t qtail[],
+ z_erofs_next_pcluster_t owned_head)
+{
+ z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
+ z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
+
+ DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ if (owned_head == Z_EROFS_PCLUSTER_TAIL)
+ owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+
+ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
+
+ WRITE_ONCE(*submit_qtail, owned_head);
+ WRITE_ONCE(*bypass_qtail, &pcl->next);
+
+ qtail[JQ_BYPASS] = &pcl->next;
+}
+
+static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[],
+ unsigned int nr_bios,
+ bool force_fg)
+{
+ /*
+ * although background is preferred, no one is pending for submission.
+ * don't issue workqueue for decompression but drop it directly instead.
+ */
+ if (force_fg || nr_bios)
+ return false;
+
+ kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io));
+ return true;
+}
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+ z_erofs_next_pcluster_t owned_head,
+ struct list_head *pagepool,
+ struct z_erofs_unzip_io *fgq,
+ bool force_fg)
+{
+ struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
+ z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
+ struct z_erofs_unzip_io *q[NR_JOBQUEUES];
+ struct bio *bio;
+ void *bi_private;
+ /* since bio will be NULL, no need to initialize last_index */
+ pgoff_t uninitialized_var(last_index);
+ bool force_submit = false;
+ unsigned int nr_bios;
+
+ if (owned_head == Z_EROFS_PCLUSTER_TAIL)
+ return false;
+
+ force_submit = false;
+ bio = NULL;
+ nr_bios = 0;
+ bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg);
+
+ /* by default, all need io submission */
+ q[JQ_SUBMIT]->head = owned_head;
+
+ do {
+ struct z_erofs_pcluster *pcl;
+ unsigned int clusterpages;
+ pgoff_t first_index;
+ struct page *page;
+ unsigned int i = 0, bypass = 0;
+ int err;
+
+ /* no possible 'owned_head' equals the following */
+ DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
+
+ pcl = container_of(owned_head, struct z_erofs_pcluster, next);
+
+ clusterpages = BIT(pcl->clusterbits);
+
+ /* close the main owned chain at first */
+ owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
+ Z_EROFS_PCLUSTER_TAIL_CLOSED);
+
+ first_index = pcl->obj.index;
+ force_submit |= (first_index != last_index + 1);
+
+repeat:
+ page = pickup_page_for_submission(pcl, i, pagepool,
+ MNGD_MAPPING(sbi),
+ GFP_NOFS);
+ if (!page) {
+ force_submit = true;
+ ++bypass;
+ goto skippage;
+ }
+
+ if (bio && force_submit) {
+submit_bio_retry:
+ submit_bio(bio);
+ bio = NULL;
+ }
+
+ if (!bio) {
+ bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+
+ bio->bi_end_io = z_erofs_vle_read_endio;
+ bio_set_dev(bio, sb->s_bdev);
+ bio->bi_iter.bi_sector = (sector_t)(first_index + i) <<
+ LOG_SECTORS_PER_BLOCK;
+ bio->bi_private = bi_private;
+ bio->bi_opf = REQ_OP_READ;
+
+ ++nr_bios;
+ }
+
+ err = bio_add_page(bio, page, PAGE_SIZE, 0);
+ if (err < PAGE_SIZE)
+ goto submit_bio_retry;
+
+ force_submit = false;
+ last_index = first_index + i;
+skippage:
+ if (++i < clusterpages)
+ goto repeat;
+
+ if (bypass < clusterpages)
+ qtail[JQ_SUBMIT] = &pcl->next;
+ else
+ move_to_bypass_jobqueue(pcl, qtail, owned_head);
+ } while (owned_head != Z_EROFS_PCLUSTER_TAIL);
+
+ if (bio)
+ submit_bio(bio);
+
+ if (postsubmit_is_all_bypassed(q, nr_bios, force_fg))
+ return true;
+
+ z_erofs_vle_unzip_kickoff(bi_private, nr_bios);
+ return true;
+}
+
+static void z_erofs_submit_and_unzip(struct super_block *sb,
+ struct z_erofs_collector *clt,
+ struct list_head *pagepool,
+ bool force_fg)
+{
+ struct z_erofs_unzip_io io[NR_JOBQUEUES];
+
+ if (!z_erofs_vle_submit_all(sb, clt->owned_head,
+ pagepool, io, force_fg))
+ return;
+
+ /* decompress no I/O pclusters immediately */
+ z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool);
+
+ if (!force_fg)
+ return;
+
+ /* wait until all bios are completed */
+ wait_event(io[JQ_SUBMIT].u.wait,
+ !atomic_read(&io[JQ_SUBMIT].pending_bios));
+
+ /* let's synchronous decompression */
+ z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool);
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+ struct page *page)
+{
+ struct inode *const inode = page->mapping->host;
+ struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ int err;
+ LIST_HEAD(pagepool);
+
+ trace_erofs_readpage(page, false);
+
+ f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
+
+ err = z_erofs_do_read_page(&f, page, &pagepool);
+ (void)z_erofs_collector_end(&f.clt);
+
+ /* if some compressed cluster ready, need submit them anyway */
+ z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true);
+
+ if (err)
+ erofs_err(inode->i_sb, "failed to read, err [%d]", err);
+
+ if (f.map.mpage)
+ put_page(f.map.mpage);
+
+ /* clean up the remaining free pages */
+ put_pages_list(&pagepool);
+ return err;
+}
+
+static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
+ unsigned int nr)
+{
+ return nr <= sbi->max_sync_decompress_pages;
+}
+
+static int z_erofs_vle_normalaccess_readpages(struct file *filp,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned int nr_pages)
+{
+ struct inode *const inode = mapping->host;
+ struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
+
+ bool sync = should_decompress_synchronously(sbi, nr_pages);
+ struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+ struct page *head = NULL;
+ LIST_HEAD(pagepool);
+
+ trace_erofs_readpages(mapping->host, lru_to_page(pages),
+ nr_pages, false);
+
+ f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT;
+
+ for (; nr_pages; --nr_pages) {
+ struct page *page = lru_to_page(pages);
+
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+
+ /*
+ * A pure asynchronous readahead is indicated if
+ * a PG_readahead marked page is hitted at first.
+ * Let's also do asynchronous decompression for this case.
+ */
+ sync &= !(PageReadahead(page) && !head);
+
+ if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+ list_add(&page->lru, &pagepool);
+ continue;
+ }
+
+ set_page_private(page, (unsigned long)head);
+ head = page;
+ }
+
+ while (head) {
+ struct page *page = head;
+ int err;
+
+ /* traversal in reverse order */
+ head = (void *)page_private(page);
+
+ err = z_erofs_do_read_page(&f, page, &pagepool);
+ if (err)
+ erofs_err(inode->i_sb,
+ "readahead error at page %lu @ nid %llu",
+ page->index, EROFS_I(inode)->nid);
+ put_page(page);
+ }
+
+ (void)z_erofs_collector_end(&f.clt);
+
+ z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync);
+
+ if (f.map.mpage)
+ put_page(f.map.mpage);
+
+ /* clean up the remaining free pages */
+ put_pages_list(&pagepool);
+ return 0;
+}
+
+const struct address_space_operations z_erofs_vle_normalaccess_aops = {
+ .readpage = z_erofs_vle_normalaccess_readpage,
+ .readpages = z_erofs_vle_normalaccess_readpages,
+};
+
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
new file mode 100644
index 000000000000..faf950189bd7
--- /dev/null
+++ b/fs/erofs/zdata.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_ZDATA_H
+#define __EROFS_FS_ZDATA_H
+
+#include "internal.h"
+#include "zpvec.h"
+
+#define Z_EROFS_NR_INLINE_PAGEVECS 3
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ * for everyone else;
+ *
+ * L: Field should be protected by pageset lock;
+ *
+ * A: Field should be accessed / updated in atomic for parallelized code.
+ */
+struct z_erofs_collection {
+ struct mutex lock;
+
+ /* I: page offset of start position of decompression */
+ unsigned short pageofs;
+
+ /* L: maximum relative page index in pagevec[] */
+ unsigned short nr_pages;
+
+ /* L: total number of pages in pagevec[] */
+ unsigned int vcnt;
+
+ union {
+ /* L: inline a certain number of pagevecs for bootstrap */
+ erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS];
+
+ /* I: can be used to free the pcluster by RCU. */
+ struct rcu_head rcu;
+ };
+};
+
+#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001
+#define Z_EROFS_PCLUSTER_LENGTH_BIT 1
+
+/*
+ * let's leave a type here in case of introducing
+ * another tagged pointer later.
+ */
+typedef void *z_erofs_next_pcluster_t;
+
+struct z_erofs_pcluster {
+ struct erofs_workgroup obj;
+ struct z_erofs_collection primary_collection;
+
+ /* A: point to next chained pcluster or TAILs */
+ z_erofs_next_pcluster_t next;
+
+ /* A: compressed pages (including multi-usage pages) */
+ struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
+
+ /* A: lower limit of decompressed length and if full length or not */
+ unsigned int length;
+
+ /* I: compression algorithm format */
+ unsigned char algorithmformat;
+ /* I: bit shift of physical cluster size */
+ unsigned char clusterbits;
+};
+
+#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)
+
+/* let's avoid the valid 32-bit kernel addresses */
+
+/* the chained workgroup has't submitted io (still open) */
+#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
+/* the chained workgroup has already submitted io */
+#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD)
+
+#define Z_EROFS_PCLUSTER_NIL (NULL)
+
+#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster)
+
+struct z_erofs_unzip_io {
+ atomic_t pending_bios;
+ z_erofs_next_pcluster_t head;
+
+ union {
+ wait_queue_head_t wait;
+ struct work_struct work;
+ } u;
+};
+
+struct z_erofs_unzip_io_sb {
+ struct z_erofs_unzip_io io;
+ struct super_block *sb;
+};
+
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+ struct page *page)
+{
+ return page->mapping == MNGD_MAPPING(sbi);
+}
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): # to unlock the page
+ * sub-index: 0 - for partial page, >= 1 full page sub-index
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+ z_erofs_onlinepage_t *o;
+ unsigned long *v;
+};
+
+static inline unsigned int z_erofs_onlinepage_index(struct page *page)
+{
+ union z_erofs_onlinepage_converter u;
+
+ DBG_BUGON(!PagePrivate(page));
+ u.v = &page_private(page);
+
+ return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+ union {
+ z_erofs_onlinepage_t o;
+ unsigned long v;
+ /* keep from being unlocked in advance */
+ } u = { .o = ATOMIC_INIT(1) };
+
+ set_page_private(page, u.v);
+ smp_wmb();
+ SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+ uintptr_t index, bool down)
+{
+ unsigned long *p, o, v, id;
+repeat:
+ p = &page_private(page);
+ o = READ_ONCE(*p);
+
+ id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+ if (id) {
+ if (!index)
+ return;
+
+ DBG_BUGON(id != index);
+ }
+
+ v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+ ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
+ if (cmpxchg(p, o, v) != o)
+ goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+ union z_erofs_onlinepage_converter u;
+ unsigned int v;
+
+ DBG_BUGON(!PagePrivate(page));
+ u.v = &page_private(page);
+
+ v = atomic_dec_return(u.o);
+ if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+ ClearPagePrivate(page);
+ if (!PageError(page))
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VMAP_ONSTACK_PAGES \
+ min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U)
+#define Z_EROFS_VMAP_GLOBAL_PAGES 2048
+
+#endif
+
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
new file mode 100644
index 000000000000..6a26c293ae2d
--- /dev/null
+++ b/fs/erofs/zmap.c
@@ -0,0 +1,471 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2018-2019 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#include "internal.h"
+#include <asm/unaligned.h>
+#include <trace/events/erofs.h>
+
+int z_erofs_fill_inode(struct inode *inode)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+
+ if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
+ vi->z_advise = 0;
+ vi->z_algorithmtype[0] = 0;
+ vi->z_algorithmtype[1] = 0;
+ vi->z_logical_clusterbits = LOG_BLOCK_SIZE;
+ vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits;
+ vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits;
+ set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
+ }
+
+ inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops;
+ return 0;
+}
+
+static int fill_inode_lazy(struct inode *inode)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ struct super_block *const sb = inode->i_sb;
+ int err;
+ erofs_off_t pos;
+ struct page *page;
+ void *kaddr;
+ struct z_erofs_map_header *h;
+
+ if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
+ return 0;
+
+ if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
+ return -ERESTARTSYS;
+
+ err = 0;
+ if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
+ goto out_unlock;
+
+ DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
+
+ pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
+ vi->xattr_isize, 8);
+ page = erofs_get_meta_page(sb, erofs_blknr(pos));
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto out_unlock;
+ }
+
+ kaddr = kmap_atomic(page);
+
+ h = kaddr + erofs_blkoff(pos);
+ vi->z_advise = le16_to_cpu(h->h_advise);
+ vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
+ vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
+
+ if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
+ erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel",
+ vi->z_algorithmtype[0], vi->nid);
+ err = -EOPNOTSUPP;
+ goto unmap_done;
+ }
+
+ vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
+ vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits +
+ ((h->h_clusterbits >> 3) & 3);
+
+ if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) {
+ erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel",
+ vi->z_physical_clusterbits[0], vi->nid);
+ err = -EOPNOTSUPP;
+ goto unmap_done;
+ }
+
+ vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits +
+ ((h->h_clusterbits >> 5) & 7);
+ set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
+unmap_done:
+ kunmap_atomic(kaddr);
+ unlock_page(page);
+ put_page(page);
+out_unlock:
+ clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
+ return err;
+}
+
+struct z_erofs_maprecorder {
+ struct inode *inode;
+ struct erofs_map_blocks *map;
+ void *kaddr;
+
+ unsigned long lcn;
+ /* compression extent information gathered */
+ u8 type;
+ u16 clusterofs;
+ u16 delta[2];
+ erofs_blk_t pblk;
+};
+
+static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
+ erofs_blk_t eblk)
+{
+ struct super_block *const sb = m->inode->i_sb;
+ struct erofs_map_blocks *const map = m->map;
+ struct page *mpage = map->mpage;
+
+ if (mpage) {
+ if (mpage->index == eblk) {
+ if (!m->kaddr)
+ m->kaddr = kmap_atomic(mpage);
+ return 0;
+ }
+
+ if (m->kaddr) {
+ kunmap_atomic(m->kaddr);
+ m->kaddr = NULL;
+ }
+ put_page(mpage);
+ }
+
+ mpage = erofs_get_meta_page(sb, eblk);
+ if (IS_ERR(mpage)) {
+ map->mpage = NULL;
+ return PTR_ERR(mpage);
+ }
+ m->kaddr = kmap_atomic(mpage);
+ unlock_page(mpage);
+ map->mpage = mpage;
+ return 0;
+}
+
+static int vle_legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+ unsigned long lcn)
+{
+ struct inode *const inode = m->inode;
+ struct erofs_inode *const vi = EROFS_I(inode);
+ const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid);
+ const erofs_off_t pos =
+ Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize +
+ vi->xattr_isize) +
+ lcn * sizeof(struct z_erofs_vle_decompressed_index);
+ struct z_erofs_vle_decompressed_index *di;
+ unsigned int advise, type;
+ int err;
+
+ err = z_erofs_reload_indexes(m, erofs_blknr(pos));
+ if (err)
+ return err;
+
+ m->lcn = lcn;
+ di = m->kaddr + erofs_blkoff(pos);
+
+ advise = le16_to_cpu(di->di_advise);
+ type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) &
+ ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1);
+ switch (type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+ m->clusterofs = 1 << vi->z_logical_clusterbits;
+ m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
+ m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
+ break;
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ m->clusterofs = le16_to_cpu(di->di_clusterofs);
+ m->pblk = le32_to_cpu(di->di_u.blkaddr);
+ break;
+ default:
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+ m->type = type;
+ return 0;
+}
+
+static unsigned int decode_compactedbits(unsigned int lobits,
+ unsigned int lomask,
+ u8 *in, unsigned int pos, u8 *type)
+{
+ const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7);
+ const unsigned int lo = v & lomask;
+
+ *type = (v >> lobits) & 3;
+ return lo;
+}
+
+static int unpack_compacted_index(struct z_erofs_maprecorder *m,
+ unsigned int amortizedshift,
+ unsigned int eofs)
+{
+ struct erofs_inode *const vi = EROFS_I(m->inode);
+ const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ const unsigned int lomask = (1 << lclusterbits) - 1;
+ unsigned int vcnt, base, lo, encodebits, nblk;
+ int i;
+ u8 *in, type;
+
+ if (1 << amortizedshift == 4)
+ vcnt = 2;
+ else if (1 << amortizedshift == 2 && lclusterbits == 12)
+ vcnt = 16;
+ else
+ return -EOPNOTSUPP;
+
+ encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
+ base = round_down(eofs, vcnt << amortizedshift);
+ in = m->kaddr + base;
+
+ i = (eofs - base) >> amortizedshift;
+
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+ m->type = type;
+ if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
+ m->clusterofs = 1 << lclusterbits;
+ if (i + 1 != vcnt) {
+ m->delta[0] = lo;
+ return 0;
+ }
+ /*
+ * since the last lcluster in the pack is special,
+ * of which lo saves delta[1] rather than delta[0].
+ * Hence, get delta[0] by the previous lcluster indirectly.
+ */
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * (i - 1), &type);
+ if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+ lo = 0;
+ m->delta[0] = lo + 1;
+ return 0;
+ }
+ m->clusterofs = lo;
+ m->delta[0] = 0;
+ /* figout out blkaddr (pblk) for HEAD lclusters */
+ nblk = 1;
+ while (i > 0) {
+ --i;
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+ if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+ i -= lo;
+
+ if (i >= 0)
+ ++nblk;
+ }
+ in += (vcnt << amortizedshift) - sizeof(__le32);
+ m->pblk = le32_to_cpu(*(__le32 *)in) + nblk;
+ return 0;
+}
+
+static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+ unsigned long lcn)
+{
+ struct inode *const inode = m->inode;
+ struct erofs_inode *const vi = EROFS_I(inode);
+ const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) +
+ vi->inode_isize + vi->xattr_isize, 8) +
+ sizeof(struct z_erofs_map_header);
+ const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
+ unsigned int compacted_4b_initial, compacted_2b;
+ unsigned int amortizedshift;
+ erofs_off_t pos;
+ int err;
+
+ if (lclusterbits != 12)
+ return -EOPNOTSUPP;
+
+ if (lcn >= totalidx)
+ return -EINVAL;
+
+ m->lcn = lcn;
+ /* used to align to 32-byte (compacted_2b) alignment */
+ compacted_4b_initial = (32 - ebase % 32) / 4;
+ if (compacted_4b_initial == 32 / 4)
+ compacted_4b_initial = 0;
+
+ if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B)
+ compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
+ else
+ compacted_2b = 0;
+
+ pos = ebase;
+ if (lcn < compacted_4b_initial) {
+ amortizedshift = 2;
+ goto out;
+ }
+ pos += compacted_4b_initial * 4;
+ lcn -= compacted_4b_initial;
+
+ if (lcn < compacted_2b) {
+ amortizedshift = 1;
+ goto out;
+ }
+ pos += compacted_2b * 2;
+ lcn -= compacted_2b;
+ amortizedshift = 2;
+out:
+ pos += lcn * (1 << amortizedshift);
+ err = z_erofs_reload_indexes(m, erofs_blknr(pos));
+ if (err)
+ return err;
+ return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos));
+}
+
+static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+ unsigned int lcn)
+{
+ const unsigned int datamode = EROFS_I(m->inode)->datalayout;
+
+ if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
+ return vle_legacy_load_cluster_from_disk(m, lcn);
+
+ if (datamode == EROFS_INODE_FLAT_COMPRESSION)
+ return compacted_load_cluster_from_disk(m, lcn);
+
+ return -EINVAL;
+}
+
+static int vle_extent_lookback(struct z_erofs_maprecorder *m,
+ unsigned int lookback_distance)
+{
+ struct erofs_inode *const vi = EROFS_I(m->inode);
+ struct erofs_map_blocks *const map = m->map;
+ const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ unsigned long lcn = m->lcn;
+ int err;
+
+ if (lcn < lookback_distance) {
+ erofs_err(m->inode->i_sb,
+ "bogus lookback distance @ nid %llu", vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+
+ /* load extent head logical cluster if needed */
+ lcn -= lookback_distance;
+ err = vle_load_cluster_from_disk(m, lcn);
+ if (err)
+ return err;
+
+ switch (m->type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+ if (!m->delta[0]) {
+ erofs_err(m->inode->i_sb,
+ "invalid lookback distance 0 @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ return vle_extent_lookback(m, m->delta[0]);
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+ map->m_flags &= ~EROFS_MAP_ZIPPED;
+ /* fallthrough */
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ break;
+ default:
+ erofs_err(m->inode->i_sb,
+ "unknown type %u @ lcn %lu of nid %llu",
+ m->type, lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+int z_erofs_map_blocks_iter(struct inode *inode,
+ struct erofs_map_blocks *map,
+ int flags)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ struct z_erofs_maprecorder m = {
+ .inode = inode,
+ .map = map,
+ };
+ int err = 0;
+ unsigned int lclusterbits, endoff;
+ unsigned long long ofs, end;
+
+ trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
+
+ /* when trying to read beyond EOF, leave it unmapped */
+ if (map->m_la >= inode->i_size) {
+ map->m_llen = map->m_la + 1 - inode->i_size;
+ map->m_la = inode->i_size;
+ map->m_flags = 0;
+ goto out;
+ }
+
+ err = fill_inode_lazy(inode);
+ if (err)
+ goto out;
+
+ lclusterbits = vi->z_logical_clusterbits;
+ ofs = map->m_la;
+ m.lcn = ofs >> lclusterbits;
+ endoff = ofs & ((1 << lclusterbits) - 1);
+
+ err = vle_load_cluster_from_disk(&m, m.lcn);
+ if (err)
+ goto unmap_out;
+
+ map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */
+ end = (m.lcn + 1ULL) << lclusterbits;
+
+ switch (m.type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+ if (endoff >= m.clusterofs)
+ map->m_flags &= ~EROFS_MAP_ZIPPED;
+ /* fallthrough */
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ if (endoff >= m.clusterofs) {
+ map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+ break;
+ }
+ /* m.lcn should be >= 1 if endoff < m.clusterofs */
+ if (!m.lcn) {
+ erofs_err(inode->i_sb,
+ "invalid logical cluster 0 at nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
+ end = (m.lcn << lclusterbits) | m.clusterofs;
+ map->m_flags |= EROFS_MAP_FULL_MAPPED;
+ m.delta[0] = 1;
+ /* fallthrough */
+ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+ /* get the correspoinding first chunk */
+ err = vle_extent_lookback(&m, m.delta[0]);
+ if (err)
+ goto unmap_out;
+ break;
+ default:
+ erofs_err(inode->i_sb,
+ "unknown type %u @ offset %llu of nid %llu",
+ m.type, ofs, vi->nid);
+ err = -EOPNOTSUPP;
+ goto unmap_out;
+ }
+
+ map->m_llen = end - map->m_la;
+ map->m_plen = 1 << lclusterbits;
+ map->m_pa = blknr_to_addr(m.pblk);
+ map->m_flags |= EROFS_MAP_MAPPED;
+
+unmap_out:
+ if (m.kaddr)
+ kunmap_atomic(m.kaddr);
+
+out:
+ erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
+ __func__, map->m_la, map->m_pa,
+ map->m_llen, map->m_plen, map->m_flags);
+
+ trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
+
+ /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */
+ DBG_BUGON(err < 0 && err != -ENOMEM);
+ return err;
+}
+
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
new file mode 100644
index 000000000000..58556903aa94
--- /dev/null
+++ b/fs/erofs/zpvec.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2018 HUAWEI, Inc.
+ * http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25@huawei.com>
+ */
+#ifndef __EROFS_FS_ZPVEC_H
+#define __EROFS_FS_ZPVEC_H
+
+#include "tagptr.h"
+
+/* page type in pagevec for decompress subsystem */
+enum z_erofs_page_type {
+ /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+ Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+ Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+ Z_EROFS_VLE_PAGE_TYPE_HEAD,
+ Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
+ __bad_page_type_exclusive(void);
+
+/* pagevec tagged pointer */
+typedef tagptr2_t erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+ struct page *curr, *next;
+ erofs_vtptr_t *pages;
+
+ unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+ bool atomic)
+{
+ if (!ctor->curr)
+ return;
+
+ if (atomic)
+ kunmap_atomic(ctor->pages);
+ else
+ kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+ unsigned int nr)
+{
+ unsigned int index;
+
+ /* keep away from occupied pages */
+ if (ctor->next)
+ return ctor->next;
+
+ for (index = 0; index < nr; ++index) {
+ const erofs_vtptr_t t = ctor->pages[index];
+ const unsigned int tags = tagptr_unfold_tags(t);
+
+ if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+ return tagptr_unfold_ptr(t);
+ }
+ DBG_BUGON(nr >= ctor->nr);
+ return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+ bool atomic)
+{
+ struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+ z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+ ctor->curr = next;
+ ctor->next = NULL;
+ ctor->pages = atomic ?
+ kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+ ctor->nr = PAGE_SIZE / sizeof(struct page *);
+ ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+ unsigned int nr,
+ erofs_vtptr_t *pages,
+ unsigned int i)
+{
+ ctor->nr = nr;
+ ctor->curr = ctor->next = NULL;
+ ctor->pages = pages;
+
+ if (i >= nr) {
+ i -= nr;
+ z_erofs_pagevec_ctor_pagedown(ctor, false);
+ while (i > ctor->nr) {
+ i -= ctor->nr;
+ z_erofs_pagevec_ctor_pagedown(ctor, false);
+ }
+ }
+ ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+ ctor->index = i;
+}
+
+static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
+ struct page *page,
+ enum z_erofs_page_type type,
+ bool *occupied)
+{
+ *occupied = false;
+ if (!ctor->next && type)
+ if (ctor->index + 1 == ctor->nr)
+ return false;
+
+ if (ctor->index >= ctor->nr)
+ z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+ /* exclusive page type must be 0 */
+ if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
+ __bad_page_type_exclusive();
+
+ /* should remind that collector->next never equal to 1, 2 */
+ if (type == (uintptr_t)ctor->next) {
+ ctor->next = page;
+ *occupied = true;
+ }
+ ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type);
+ return true;
+}
+
+static inline struct page *
+z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor,
+ enum z_erofs_page_type *type)
+{
+ erofs_vtptr_t t;
+
+ if (ctor->index >= ctor->nr) {
+ DBG_BUGON(!ctor->next);
+ z_erofs_pagevec_ctor_pagedown(ctor, true);
+ }
+
+ t = ctor->pages[ctor->index];
+
+ *type = tagptr_unfold_tags(t);
+
+ /* should remind that collector->next never equal to 1, 2 */
+ if (*type == (uintptr_t)ctor->next)
+ ctor->next = tagptr_unfold_ptr(t);
+
+ ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0);
+ return tagptr_unfold_ptr(t);
+}
+#endif
+
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d7f1f5011fac..c4159bcc05d9 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1459,13 +1459,13 @@ static int ep_create_wakeup_source(struct epitem *epi)
struct wakeup_source *ws;
if (!epi->ep->ws) {
- epi->ep->ws = wakeup_source_register("eventpoll");
+ epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
if (!epi->ep->ws)
return -ENOMEM;
}
name = epi->ffd.file->f_path.dentry->d_name.name;
- ws = wakeup_source_register(name);
+ ws = wakeup_source_register(NULL, name);
if (!ws)
return -ENOMEM;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index f0e549783caf..09bc68708d28 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -7,7 +7,7 @@
* and for mapping back from file handles to dentries.
*
* For details on why we do all the strange and hairy things in here
- * take a look at Documentation/filesystems/nfs/Exporting.
+ * take a look at Documentation/filesystems/nfs/exporting.rst.
*/
#include <linux/exportfs.h>
#include <linux/fs.h>
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8fdfcd3c3e04..b17ddc229ac5 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -13,3 +13,4 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
+ext4-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf660aa7a9e0..9c7f4036021b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -41,6 +41,7 @@
#endif
#include <linux/fscrypt.h>
+#include <linux/fsverity.h>
#include <linux/compiler.h>
@@ -395,6 +396,7 @@ struct flex_groups {
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
+#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
@@ -402,7 +404,7 @@ struct flex_groups {
#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x704BDFFF /* User visible flags */
+#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */
/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
@@ -467,6 +469,7 @@ enum {
EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
+ EXT4_INODE_VERITY = 20, /* Verity protected inode */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
@@ -512,6 +515,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(TOPDIR);
CHECK_FLAG_VALUE(HUGE_FILE);
CHECK_FLAG_VALUE(EXTENTS);
+ CHECK_FLAG_VALUE(VERITY);
CHECK_FLAG_VALUE(EA_INODE);
CHECK_FLAG_VALUE(EOFBLOCKS);
CHECK_FLAG_VALUE(INLINE_DATA);
@@ -1560,6 +1564,7 @@ enum {
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
+ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1610,6 +1615,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_SB(sb) (sb)
#endif
+static inline bool ext4_verity_in_progress(struct inode *inode)
+{
+ return IS_ENABLED(CONFIG_FS_VERITY) &&
+ ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+}
+
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
@@ -1662,6 +1673,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
+#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1756,6 +1768,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC)
EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
+EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY)
EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
@@ -1813,7 +1826,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
EXT4_FEATURE_RO_COMPAT_QUOTA |\
- EXT4_FEATURE_RO_COMPAT_PROJECT)
+ EXT4_FEATURE_RO_COMPAT_PROJECT |\
+ EXT4_FEATURE_RO_COMPAT_VERITY)
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -3177,6 +3191,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
extern int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages, bool is_readahead);
+extern int __init ext4_init_post_read_processing(void);
+extern void ext4_exit_post_read_processing(void);
/* symlink.c */
extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
@@ -3283,6 +3299,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+/* verity.c */
+extern const struct fsverity_operations ext4_verityops;
+
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 70b0438dbc94..b8a20bb9a145 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -457,6 +457,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
if (ret)
return ret;
+ ret = fsverity_file_open(inode, filp);
+ if (ret)
+ return ret;
+
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 420fe3deed39..d0dc0e3463db 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1340,6 +1340,9 @@ retry_journal:
}
if (ret) {
+ bool extended = (pos + len > inode->i_size) &&
+ !ext4_verity_in_progress(inode);
+
unlock_page(page);
/*
* __block_write_begin may have instantiated a few blocks
@@ -1349,11 +1352,11 @@ retry_journal:
* Add inode to orphan list in case we crash before
* truncate finishes
*/
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (extended && ext4_can_truncate(inode))
ext4_orphan_add(handle, inode);
ext4_journal_stop(handle);
- if (pos + len > inode->i_size) {
+ if (extended) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might
@@ -1406,6 +1409,7 @@ static int ext4_write_end(struct file *file,
int ret = 0, ret2;
int i_size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ bool verity = ext4_verity_in_progress(inode);
trace_ext4_write_end(inode, pos, len, copied);
if (inline_data) {
@@ -1423,12 +1427,16 @@ static int ext4_write_end(struct file *file,
/*
* it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
+ *
+ * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
+ * blocks are being written past EOF, so skip the i_size update.
*/
- i_size_changed = ext4_update_inode_size(inode, pos + copied);
+ if (!verity)
+ i_size_changed = ext4_update_inode_size(inode, pos + copied);
unlock_page(page);
put_page(page);
- if (old_size < pos)
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
/*
* Don't mark the inode dirty under page lock. First, it unnecessarily
@@ -1439,7 +1447,7 @@ static int ext4_write_end(struct file *file,
if (i_size_changed || inline_data)
ext4_mark_inode_dirty(handle, inode);
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
@@ -1450,7 +1458,7 @@ errout:
if (!ret)
ret = ret2;
- if (pos + len > inode->i_size) {
+ if (pos + len > inode->i_size && !verity) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
@@ -1511,6 +1519,7 @@ static int ext4_journalled_write_end(struct file *file,
unsigned from, to;
int size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ bool verity = ext4_verity_in_progress(inode);
trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_SIZE - 1);
@@ -1540,13 +1549,14 @@ static int ext4_journalled_write_end(struct file *file,
if (!partial)
SetPageUptodate(page);
}
- size_changed = ext4_update_inode_size(inode, pos + copied);
+ if (!verity)
+ size_changed = ext4_update_inode_size(inode, pos + copied);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
unlock_page(page);
put_page(page);
- if (old_size < pos)
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
if (size_changed || inline_data) {
@@ -1555,7 +1565,7 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
@@ -1566,7 +1576,7 @@ errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- if (pos + len > inode->i_size) {
+ if (pos + len > inode->i_size && !verity) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
@@ -2162,7 +2172,8 @@ static int ext4_writepage(struct page *page,
trace_ext4_writepage(page);
size = i_size_read(inode);
- if (page->index == size >> PAGE_SHIFT)
+ if (page->index == size >> PAGE_SHIFT &&
+ !ext4_verity_in_progress(inode))
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
@@ -2246,7 +2257,8 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
* after page tables are updated.
*/
size = i_size_read(mpd->inode);
- if (page->index == size >> PAGE_SHIFT)
+ if (page->index == size >> PAGE_SHIFT &&
+ !ext4_verity_in_progress(mpd->inode))
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
@@ -2345,6 +2357,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
>> inode->i_blkbits;
+ if (ext4_verity_in_progress(inode))
+ blocks = EXT_MAX_BLOCKS;
+
do {
BUG_ON(buffer_locked(bh));
@@ -3061,8 +3076,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_SHIFT;
- if (ext4_nonda_switch(inode->i_sb) ||
- S_ISLNK(inode->i_mode)) {
+ if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) ||
+ ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
return ext4_write_begin(file, mapping, pos,
len, flags, pagep, fsdata);
@@ -3897,6 +3912,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
return 0;
#endif
+ if (fsverity_active(inode))
+ return 0;
/*
* If we are doing data journalling we don't support O_DIRECT
@@ -4586,7 +4603,6 @@ static int __ext4_get_inode_loc(struct inode *inode,
struct buffer_head *bh;
struct super_block *sb = inode->i_sb;
ext4_fsblk_t block;
- struct blk_plug plug;
int inodes_per_block, inode_offset;
iloc->bh = NULL;
@@ -4675,7 +4691,6 @@ make_io:
* If we need to do any I/O, try to pre-readahead extra
* blocks from the inode table.
*/
- blk_start_plug(&plug);
if (EXT4_SB(sb)->s_inode_readahead_blks) {
ext4_fsblk_t b, end, table;
unsigned num;
@@ -4706,7 +4721,6 @@ make_io:
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
- blk_finish_plug(&plug);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
EXT4_ERROR_INODE_BLOCK(inode, block,
@@ -4739,6 +4753,8 @@ static bool ext4_should_use_dax(struct inode *inode)
return false;
if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
return false;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
+ return false;
return true;
}
@@ -4763,9 +4779,11 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_ENCRYPTED;
if (flags & EXT4_CASEFOLD_FL)
new_fl |= S_CASEFOLD;
+ if (flags & EXT4_VERITY_FL)
+ new_fl |= S_VERITY;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
- S_ENCRYPTED|S_CASEFOLD);
+ S_ENCRYPTED|S_CASEFOLD|S_VERITY);
}
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -5555,6 +5573,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
+ error = fsverity_prepare_setattr(dentry, attr);
+ if (error)
+ return error;
+
if (is_quota_modification(inode, attr)) {
error = dquot_initialize(inode);
if (error)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 442f7ef873fc..5444d49cbf09 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1113,8 +1113,35 @@ resizefs_out:
#endif
}
case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg);
+
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_add_key(filp, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_remove_key(filp, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_remove_key_all_users(filp,
+ (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+
case EXT4_IOC_FSGETXATTR:
{
struct fsxattr fa;
@@ -1171,6 +1198,17 @@ out:
}
case EXT4_IOC_SHUTDOWN:
return ext4_shutdown(sb, arg);
+
+ case FS_IOC_ENABLE_VERITY:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_enable(filp, (const void __user *)arg);
+
+ case FS_IOC_MEASURE_VERITY:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_measure(filp, (void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -1231,8 +1269,15 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_SET_ENCRYPTION_POLICY:
case EXT4_IOC_GET_ENCRYPTION_PWSALT:
case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
case EXT4_IOC_SHUTDOWN:
case FS_IOC_GETFSMAP:
+ case FS_IOC_ENABLE_VERITY:
+ case FS_IOC_MEASURE_VERITY:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index c916017db334..a30b203fa461 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -47,13 +47,103 @@
#include "ext4.h"
-static inline bool ext4_bio_encrypted(struct bio *bio)
+#define NUM_PREALLOC_POST_READ_CTXS 128
+
+static struct kmem_cache *bio_post_read_ctx_cache;
+static mempool_t *bio_post_read_ctx_pool;
+
+/* postprocessing steps for read bios */
+enum bio_post_read_step {
+ STEP_INITIAL = 0,
+ STEP_DECRYPT,
+ STEP_VERITY,
+};
+
+struct bio_post_read_ctx {
+ struct bio *bio;
+ struct work_struct work;
+ unsigned int cur_step;
+ unsigned int enabled_steps;
+};
+
+static void __read_end_io(struct bio *bio)
{
-#ifdef CONFIG_FS_ENCRYPTION
- return unlikely(bio->bi_private != NULL);
-#else
- return false;
-#endif
+ struct page *page;
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bv, bio, iter_all) {
+ page = bv->bv_page;
+
+ /* PG_error was set if any post_read step failed */
+ if (bio->bi_status || PageError(page)) {
+ ClearPageUptodate(page);
+ /* will re-read again later */
+ ClearPageError(page);
+ } else {
+ SetPageUptodate(page);
+ }
+ unlock_page(page);
+ }
+ if (bio->bi_private)
+ mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+ bio_put(bio);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
+
+static void decrypt_work(struct work_struct *work)
+{
+ struct bio_post_read_ctx *ctx =
+ container_of(work, struct bio_post_read_ctx, work);
+
+ fscrypt_decrypt_bio(ctx->bio);
+
+ bio_post_read_processing(ctx);
+}
+
+static void verity_work(struct work_struct *work)
+{
+ struct bio_post_read_ctx *ctx =
+ container_of(work, struct bio_post_read_ctx, work);
+
+ fsverity_verify_bio(ctx->bio);
+
+ bio_post_read_processing(ctx);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
+{
+ /*
+ * We use different work queues for decryption and for verity because
+ * verity may require reading metadata pages that need decryption, and
+ * we shouldn't recurse to the same workqueue.
+ */
+ switch (++ctx->cur_step) {
+ case STEP_DECRYPT:
+ if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
+ INIT_WORK(&ctx->work, decrypt_work);
+ fscrypt_enqueue_decrypt_work(&ctx->work);
+ return;
+ }
+ ctx->cur_step++;
+ /* fall-through */
+ case STEP_VERITY:
+ if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+ INIT_WORK(&ctx->work, verity_work);
+ fsverity_enqueue_verify_work(&ctx->work);
+ return;
+ }
+ ctx->cur_step++;
+ /* fall-through */
+ default:
+ __read_end_io(ctx->bio);
+ }
+}
+
+static bool bio_post_read_required(struct bio *bio)
+{
+ return bio->bi_private && !bio->bi_status;
}
/*
@@ -70,30 +160,53 @@ static inline bool ext4_bio_encrypted(struct bio *bio)
*/
static void mpage_end_io(struct bio *bio)
{
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
+ if (bio_post_read_required(bio)) {
+ struct bio_post_read_ctx *ctx = bio->bi_private;
- if (ext4_bio_encrypted(bio)) {
- if (bio->bi_status) {
- fscrypt_release_ctx(bio->bi_private);
- } else {
- fscrypt_enqueue_decrypt_bio(bio->bi_private, bio);
- return;
- }
+ ctx->cur_step = STEP_INITIAL;
+ bio_post_read_processing(ctx);
+ return;
}
- bio_for_each_segment_all(bv, bio, iter_all) {
- struct page *page = bv->bv_page;
+ __read_end_io(bio);
+}
- if (!bio->bi_status) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
+static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx)
+{
+ return fsverity_active(inode) &&
+ idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
+static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode,
+ struct bio *bio,
+ pgoff_t first_idx)
+{
+ unsigned int post_read_steps = 0;
+ struct bio_post_read_ctx *ctx = NULL;
+
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+ post_read_steps |= 1 << STEP_DECRYPT;
+
+ if (ext4_need_verity(inode, first_idx))
+ post_read_steps |= 1 << STEP_VERITY;
+
+ if (post_read_steps) {
+ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ ctx->bio = bio;
+ ctx->enabled_steps = post_read_steps;
+ bio->bi_private = ctx;
}
+ return ctx;
+}
- bio_put(bio);
+static inline loff_t ext4_readpage_limit(struct inode *inode)
+{
+ if (IS_ENABLED(CONFIG_FS_VERITY) &&
+ (IS_VERITY(inode) || ext4_verity_in_progress(inode)))
+ return inode->i_sb->s_maxbytes;
+
+ return i_size_read(inode);
}
int ext4_mpage_readpages(struct address_space *mapping,
@@ -141,7 +254,8 @@ int ext4_mpage_readpages(struct address_space *mapping,
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
- last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ last_block_in_file = (ext4_readpage_limit(inode) +
+ blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
@@ -218,6 +332,9 @@ int ext4_mpage_readpages(struct address_space *mapping,
zero_user_segment(page, first_hole << blkbits,
PAGE_SIZE);
if (first_hole == 0) {
+ if (ext4_need_verity(inode, page->index) &&
+ !fsverity_verify_page(page))
+ goto set_error_page;
SetPageUptodate(page);
unlock_page(page);
goto next_page;
@@ -241,18 +358,16 @@ int ext4_mpage_readpages(struct address_space *mapping,
bio = NULL;
}
if (bio == NULL) {
- struct fscrypt_ctx *ctx = NULL;
+ struct bio_post_read_ctx *ctx;
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
- ctx = fscrypt_get_ctx(GFP_NOFS);
- if (IS_ERR(ctx))
- goto set_error_page;
- }
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
- if (!bio) {
- if (ctx)
- fscrypt_release_ctx(ctx);
+ if (!bio)
+ goto set_error_page;
+ ctx = get_bio_post_read_ctx(inode, bio, page->index);
+ if (IS_ERR(ctx)) {
+ bio_put(bio);
+ bio = NULL;
goto set_error_page;
}
bio_set_dev(bio, bdev);
@@ -293,3 +408,29 @@ int ext4_mpage_readpages(struct address_space *mapping,
submit_bio(bio);
return 0;
}
+
+int __init ext4_init_post_read_processing(void)
+{
+ bio_post_read_ctx_cache =
+ kmem_cache_create("ext4_bio_post_read_ctx",
+ sizeof(struct bio_post_read_ctx), 0, 0, NULL);
+ if (!bio_post_read_ctx_cache)
+ goto fail;
+ bio_post_read_ctx_pool =
+ mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
+ bio_post_read_ctx_cache);
+ if (!bio_post_read_ctx_pool)
+ goto fail_free_cache;
+ return 0;
+
+fail_free_cache:
+ kmem_cache_destroy(bio_post_read_ctx_cache);
+fail:
+ return -ENOMEM;
+}
+
+void ext4_exit_post_read_processing(void)
+{
+ mempool_destroy(bio_post_read_ctx_pool);
+ kmem_cache_destroy(bio_post_read_ctx_cache);
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4079605d437a..27cd622676e7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1107,6 +1107,9 @@ static int ext4_drop_inode(struct inode *inode)
{
int drop = generic_drop_inode(inode);
+ if (!drop)
+ drop = fscrypt_drop_inode(inode);
+
trace_ext4_drop_inode(inode, drop);
return drop;
}
@@ -1179,6 +1182,7 @@ void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->jinode = NULL;
}
fscrypt_put_encryption_info(inode);
+ fsverity_cleanup_inode(inode);
}
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -4272,6 +4276,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &ext4_cryptops;
#endif
+#ifdef CONFIG_FS_VERITY
+ sb->s_vop = &ext4_verityops;
+#endif
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
if (ext4_has_feature_quota(sb))
@@ -4419,6 +4426,11 @@ no_journal:
goto failed_mount_wq;
}
+ if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
+ goto failed_mount_wq;
+ }
+
if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
!ext4_has_feature_encrypt(sb)) {
ext4_set_feature_encrypt(sb);
@@ -6095,6 +6107,10 @@ static int __init ext4_init_fs(void)
err = ext4_init_pending();
if (err)
+ goto out7;
+
+ err = ext4_init_post_read_processing();
+ if (err)
goto out6;
err = ext4_init_pageio();
@@ -6135,8 +6151,10 @@ out3:
out4:
ext4_exit_pageio();
out5:
- ext4_exit_pending();
+ ext4_exit_post_read_processing();
out6:
+ ext4_exit_pending();
+out7:
ext4_exit_es();
return err;
@@ -6153,6 +6171,7 @@ static void __exit ext4_exit_fs(void)
ext4_exit_sysfs();
ext4_exit_system_zone();
ext4_exit_pageio();
+ ext4_exit_post_read_processing();
ext4_exit_es();
ext4_exit_pending();
}
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index b3cd7655a6ff..eb1efad0e20a 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -242,6 +242,9 @@ EXT4_ATTR_FEATURE(encryption);
#ifdef CONFIG_UNICODE
EXT4_ATTR_FEATURE(casefold);
#endif
+#ifdef CONFIG_FS_VERITY
+EXT4_ATTR_FEATURE(verity);
+#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
static struct attribute *ext4_feat_attrs[] = {
@@ -254,6 +257,9 @@ static struct attribute *ext4_feat_attrs[] = {
#ifdef CONFIG_UNICODE
ATTR_LIST(casefold),
#endif
+#ifdef CONFIG_FS_VERITY
+ ATTR_LIST(verity),
+#endif
ATTR_LIST(metadata_csum_seed),
NULL,
};
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
new file mode 100644
index 000000000000..d0d8a9795dd6
--- /dev/null
+++ b/fs/ext4/verity.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/ext4/verity.c: fs-verity support for ext4
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Implementation of fsverity_operations for ext4.
+ *
+ * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past
+ * the end of the file, starting at the first 64K boundary beyond i_size. This
+ * approach works because (a) verity files are readonly, and (b) pages fully
+ * beyond i_size aren't visible to userspace but can be read/written internally
+ * by ext4 with only some relatively small changes to ext4. This approach
+ * avoids having to depend on the EA_INODE feature and on rearchitecturing
+ * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and
+ * to support encrypting xattrs. Note that the verity metadata *must* be
+ * encrypted when the file is, since it contains hashes of the plaintext data.
+ *
+ * Using a 64K boundary rather than a 4K one keeps things ready for
+ * architectures with 64K pages, and it doesn't necessarily waste space on-disk
+ * since there can be a hole between i_size and the start of the Merkle tree.
+ */
+
+#include <linux/quotaops.h>
+
+#include "ext4.h"
+#include "ext4_extents.h"
+#include "ext4_jbd2.h"
+
+static inline loff_t ext4_verity_metadata_pos(const struct inode *inode)
+{
+ return round_up(inode->i_size, 65536);
+}
+
+/*
+ * Read some verity metadata from the inode. __vfs_read() can't be used because
+ * we need to read beyond i_size.
+ */
+static int pagecache_read(struct inode *inode, void *buf, size_t count,
+ loff_t pos)
+{
+ while (count) {
+ size_t n = min_t(size_t, count,
+ PAGE_SIZE - offset_in_page(pos));
+ struct page *page;
+ void *addr;
+
+ page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
+ NULL);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ addr = kmap_atomic(page);
+ memcpy(buf, addr + offset_in_page(pos), n);
+ kunmap_atomic(addr);
+
+ put_page(page);
+
+ buf += n;
+ pos += n;
+ count -= n;
+ }
+ return 0;
+}
+
+/*
+ * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
+ * kernel_write() can't be used because the file descriptor is readonly.
+ */
+static int pagecache_write(struct inode *inode, const void *buf, size_t count,
+ loff_t pos)
+{
+ if (pos + count > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+
+ while (count) {
+ size_t n = min_t(size_t, count,
+ PAGE_SIZE - offset_in_page(pos));
+ struct page *page;
+ void *fsdata;
+ void *addr;
+ int res;
+
+ res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
+ &page, &fsdata);
+ if (res)
+ return res;
+
+ addr = kmap_atomic(page);
+ memcpy(addr + offset_in_page(pos), buf, n);
+ kunmap_atomic(addr);
+
+ res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
+ page, fsdata);
+ if (res < 0)
+ return res;
+ if (res != n)
+ return -EIO;
+
+ buf += n;
+ pos += n;
+ count -= n;
+ }
+ return 0;
+}
+
+static int ext4_begin_enable_verity(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ const int credits = 2; /* superblock and inode for ext4_orphan_add() */
+ handle_t *handle;
+ int err;
+
+ if (ext4_verity_in_progress(inode))
+ return -EBUSY;
+
+ /*
+ * Since the file was opened readonly, we have to initialize the jbd
+ * inode and quotas here and not rely on ->open() doing it. This must
+ * be done before evicting the inline data.
+ */
+
+ err = ext4_inode_attach_jinode(inode);
+ if (err)
+ return err;
+
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
+ err = ext4_convert_inline_data(inode);
+ if (err)
+ return err;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ext4_warning_inode(inode,
+ "verity is only allowed on extent-based files");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * ext4 uses the last allocated block to find the verity descriptor, so
+ * we must remove any other blocks past EOF which might confuse things.
+ */
+ err = ext4_truncate(inode);
+ if (err)
+ return err;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ err = ext4_orphan_add(handle, inode);
+ if (err == 0)
+ ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+ ext4_journal_stop(handle);
+ return err;
+}
+
+/*
+ * ext4 stores the verity descriptor beginning on the next filesystem block
+ * boundary after the Merkle tree. Then, the descriptor size is stored in the
+ * last 4 bytes of the last allocated filesystem block --- which is either the
+ * block in which the descriptor ends, or the next block after that if there
+ * weren't at least 4 bytes remaining.
+ *
+ * We can't simply store the descriptor in an xattr because it *must* be
+ * encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt
+ * xattrs. Also, if the descriptor includes a large signature blob it may be
+ * too large to store in an xattr without the EA_INODE feature.
+ */
+static int ext4_write_verity_descriptor(struct inode *inode, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) +
+ merkle_tree_size, i_blocksize(inode));
+ const u64 desc_end = desc_pos + desc_size;
+ const __le32 desc_size_disk = cpu_to_le32(desc_size);
+ const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk),
+ i_blocksize(inode)) -
+ sizeof(desc_size_disk);
+ int err;
+
+ err = pagecache_write(inode, desc, desc_size, desc_pos);
+ if (err)
+ return err;
+
+ return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk),
+ desc_size_pos);
+}
+
+static int ext4_end_enable_verity(struct file *filp, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ struct inode *inode = file_inode(filp);
+ const int credits = 2; /* superblock and inode for ext4_orphan_del() */
+ handle_t *handle;
+ int err = 0;
+ int err2;
+
+ if (desc != NULL) {
+ /* Succeeded; write the verity descriptor. */
+ err = ext4_write_verity_descriptor(inode, desc, desc_size,
+ merkle_tree_size);
+
+ /* Write all pages before clearing VERITY_IN_PROGRESS. */
+ if (!err)
+ err = filemap_write_and_wait(inode->i_mapping);
+ }
+
+ /* If we failed, truncate anything we wrote past i_size. */
+ if (desc == NULL || err)
+ ext4_truncate(inode);
+
+ /*
+ * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
+ * deleting the inode from the orphan list, even if something failed.
+ * If everything succeeded, we'll also set the verity bit in the same
+ * transaction.
+ */
+
+ ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ return PTR_ERR(handle);
+ }
+
+ err2 = ext4_orphan_del(handle, inode);
+ if (err2)
+ goto out_stop;
+
+ if (desc != NULL && !err) {
+ struct ext4_iloc iloc;
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out_stop;
+ ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
+ ext4_set_inode_flags(inode);
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ }
+out_stop:
+ ext4_journal_stop(handle);
+ return err ?: err2;
+}
+
+static int ext4_get_verity_descriptor_location(struct inode *inode,
+ size_t *desc_size_ret,
+ u64 *desc_pos_ret)
+{
+ struct ext4_ext_path *path;
+ struct ext4_extent *last_extent;
+ u32 end_lblk;
+ u64 desc_size_pos;
+ __le32 desc_size_disk;
+ u32 desc_size;
+ u64 desc_pos;
+ int err;
+
+ /*
+ * Descriptor size is in last 4 bytes of last allocated block.
+ * See ext4_write_verity_descriptor().
+ */
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ EXT4_ERROR_INODE(inode, "verity file doesn't use extents");
+ return -EFSCORRUPTED;
+ }
+
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ last_extent = path[path->p_depth].p_ext;
+ if (!last_extent) {
+ EXT4_ERROR_INODE(inode, "verity file has no extents");
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return -EFSCORRUPTED;
+ }
+
+ end_lblk = le32_to_cpu(last_extent->ee_block) +
+ ext4_ext_get_actual_len(last_extent);
+ desc_size_pos = (u64)end_lblk << inode->i_blkbits;
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ if (desc_size_pos < sizeof(desc_size_disk))
+ goto bad;
+ desc_size_pos -= sizeof(desc_size_disk);
+
+ err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk),
+ desc_size_pos);
+ if (err)
+ return err;
+ desc_size = le32_to_cpu(desc_size_disk);
+
+ /*
+ * The descriptor is stored just before the desc_size_disk, but starting
+ * on a filesystem block boundary.
+ */
+
+ if (desc_size > INT_MAX || desc_size > desc_size_pos)
+ goto bad;
+
+ desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode));
+ if (desc_pos < ext4_verity_metadata_pos(inode))
+ goto bad;
+
+ *desc_size_ret = desc_size;
+ *desc_pos_ret = desc_pos;
+ return 0;
+
+bad:
+ EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor");
+ return -EFSCORRUPTED;
+}
+
+static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ size_t desc_size = 0;
+ u64 desc_pos = 0;
+ int err;
+
+ err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos);
+ if (err)
+ return err;
+
+ if (buf_size) {
+ if (desc_size > buf_size)
+ return -ERANGE;
+ err = pagecache_read(inode, buf, desc_size, desc_pos);
+ if (err)
+ return err;
+ }
+ return desc_size;
+}
+
+static struct page *ext4_read_merkle_tree_page(struct inode *inode,
+ pgoff_t index)
+{
+ index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
+
+ return read_mapping_page(inode->i_mapping, index, NULL);
+}
+
+static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
+ u64 index, int log_blocksize)
+{
+ loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize);
+
+ return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+}
+
+const struct fsverity_operations ext4_verityops = {
+ .begin_enable_verity = ext4_begin_enable_verity,
+ .end_enable_verity = ext4_end_enable_verity,
+ .get_verity_descriptor = ext4_get_verity_descriptor,
+ .read_merkle_tree_page = ext4_read_merkle_tree_page,
+ .write_merkle_tree_block = ext4_write_merkle_tree_block,
+};
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 776c4b936504..2aaecc63834f 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -8,3 +8,4 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
+f2fs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index abbf14e9bd72..54cad80acb7d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -74,6 +74,7 @@ static enum count_type __read_io_type(struct page *page)
enum bio_post_read_step {
STEP_INITIAL = 0,
STEP_DECRYPT,
+ STEP_VERITY,
};
struct bio_post_read_ctx {
@@ -120,8 +121,23 @@ static void decrypt_work(struct work_struct *work)
bio_post_read_processing(ctx);
}
+static void verity_work(struct work_struct *work)
+{
+ struct bio_post_read_ctx *ctx =
+ container_of(work, struct bio_post_read_ctx, work);
+
+ fsverity_verify_bio(ctx->bio);
+
+ bio_post_read_processing(ctx);
+}
+
static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
{
+ /*
+ * We use different work queues for decryption and for verity because
+ * verity may require reading metadata pages that need decryption, and
+ * we shouldn't recurse to the same workqueue.
+ */
switch (++ctx->cur_step) {
case STEP_DECRYPT:
if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
@@ -131,6 +147,14 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
}
ctx->cur_step++;
/* fall-through */
+ case STEP_VERITY:
+ if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+ INIT_WORK(&ctx->work, verity_work);
+ fsverity_enqueue_verify_work(&ctx->work);
+ return;
+ }
+ ctx->cur_step++;
+ /* fall-through */
default:
__read_end_io(ctx->bio);
}
@@ -608,8 +632,15 @@ out:
up_write(&io->io_rwsem);
}
+static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
+{
+ return fsverity_active(inode) &&
+ idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
- unsigned nr_pages, unsigned op_flag)
+ unsigned nr_pages, unsigned op_flag,
+ pgoff_t first_idx)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
@@ -625,6 +656,10 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
if (f2fs_encrypted_file(inode))
post_read_steps |= 1 << STEP_DECRYPT;
+
+ if (f2fs_need_verity(inode, first_idx))
+ post_read_steps |= 1 << STEP_VERITY;
+
if (post_read_steps) {
ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
if (!ctx) {
@@ -646,7 +681,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
- bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0);
+ bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index);
if (IS_ERR(bio))
return PTR_ERR(bio);
@@ -1569,6 +1604,15 @@ out:
return ret;
}
+static inline loff_t f2fs_readpage_limit(struct inode *inode)
+{
+ if (IS_ENABLED(CONFIG_FS_VERITY) &&
+ (IS_VERITY(inode) || f2fs_verity_in_progress(inode)))
+ return inode->i_sb->s_maxbytes;
+
+ return i_size_read(inode);
+}
+
static int f2fs_read_single_page(struct inode *inode, struct page *page,
unsigned nr_pages,
struct f2fs_map_blocks *map,
@@ -1587,7 +1631,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
block_in_file = (sector_t)page_index(page);
last_block = block_in_file + nr_pages;
- last_block_in_file = (i_size_read(inode) + blocksize - 1) >>
+ last_block_in_file = (f2fs_readpage_limit(inode) + blocksize - 1) >>
blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
@@ -1632,6 +1676,11 @@ got_it:
} else {
zero_out:
zero_user_segment(page, 0, PAGE_SIZE);
+ if (f2fs_need_verity(inode, page->index) &&
+ !fsverity_verify_page(page)) {
+ ret = -EIO;
+ goto out;
+ }
if (!PageUptodate(page))
SetPageUptodate(page);
unlock_page(page);
@@ -1650,7 +1699,7 @@ submit_and_realloc:
}
if (bio == NULL) {
bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
- is_readahead ? REQ_RAHEAD : 0);
+ is_readahead ? REQ_RAHEAD : 0, page->index);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
bio = NULL;
@@ -2052,7 +2101,7 @@ static int __write_data_page(struct page *page, bool *submitted,
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
- if (page->index < end_index)
+ if (page->index < end_index || f2fs_verity_in_progress(inode))
goto write;
/*
@@ -2427,7 +2476,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
loff_t i_size = i_size_read(inode);
- if (to > i_size) {
+ /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
+ if (to > i_size && !f2fs_verity_in_progress(inode)) {
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
down_write(&F2FS_I(inode)->i_mmap_sem);
@@ -2458,7 +2508,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
* the block addresses when there is no need to fill the page.
*/
if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE &&
- !is_inode_flag_set(inode, FI_NO_PREALLOC))
+ !is_inode_flag_set(inode, FI_NO_PREALLOC) &&
+ !f2fs_verity_in_progress(inode))
return 0;
/* f2fs_lock_op avoids race between write CP and convert_inline_page */
@@ -2597,7 +2648,8 @@ repeat:
if (len == PAGE_SIZE || PageUptodate(page))
return 0;
- if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) {
+ if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) &&
+ !f2fs_verity_in_progress(inode)) {
zero_user_segment(page, len, PAGE_SIZE);
return 0;
}
@@ -2660,7 +2712,8 @@ static int f2fs_write_end(struct file *file,
set_page_dirty(page);
- if (pos + copied > i_size_read(inode))
+ if (pos + copied > i_size_read(inode) &&
+ !f2fs_verity_in_progress(inode))
f2fs_i_size_write(inode, pos + copied);
unlock_out:
f2fs_put_page(page, 1);
@@ -3104,7 +3157,9 @@ void f2fs_clear_page_cache_dirty_tag(struct page *page)
int __init f2fs_init_post_read_processing(void)
{
- bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, 0);
+ bio_post_read_ctx_cache =
+ kmem_cache_create("f2fs_bio_post_read_ctx",
+ sizeof(struct bio_post_read_ctx), 0, 0, NULL);
if (!bio_post_read_ctx_cache)
goto fail;
bio_post_read_ctx_pool =
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 17382da7f0bd..7c5f121edac5 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -25,6 +25,7 @@
#include <crypto/hash.h>
#include <linux/fscrypt.h>
+#include <linux/fsverity.h>
#ifdef CONFIG_F2FS_CHECK_FS
#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
@@ -151,7 +152,7 @@ struct f2fs_mount_info {
#define F2FS_FEATURE_QUOTA_INO 0x0080
#define F2FS_FEATURE_INODE_CRTIME 0x0100
#define F2FS_FEATURE_LOST_FOUND 0x0200
-#define F2FS_FEATURE_VERITY 0x0400 /* reserved */
+#define F2FS_FEATURE_VERITY 0x0400
#define F2FS_FEATURE_SB_CHKSUM 0x0800
#define __F2FS_HAS_FEATURE(raw_super, mask) \
@@ -630,7 +631,7 @@ enum {
#define FADVISE_ENC_NAME_BIT 0x08
#define FADVISE_KEEP_SIZE_BIT 0x10
#define FADVISE_HOT_BIT 0x20
-#define FADVISE_VERITY_BIT 0x40 /* reserved */
+#define FADVISE_VERITY_BIT 0x40
#define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT)
@@ -650,6 +651,8 @@ enum {
#define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT)
#define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT)
#define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT)
+#define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT)
+#define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT)
#define DEF_DIR_LEVEL 0
@@ -2412,6 +2415,7 @@ enum {
FI_PROJ_INHERIT, /* indicate file inherits projectid */
FI_PIN_FILE, /* indicate file should not be gced */
FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
+ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
};
static inline void __mark_inode_dirty_flag(struct inode *inode,
@@ -2451,6 +2455,12 @@ static inline void clear_inode_flag(struct inode *inode, int flag)
__mark_inode_dirty_flag(inode, flag, false);
}
+static inline bool f2fs_verity_in_progress(struct inode *inode)
+{
+ return IS_ENABLED(CONFIG_FS_VERITY) &&
+ is_inode_flag_set(inode, FI_VERITY_IN_PROGRESS);
+}
+
static inline void set_acl_inode(struct inode *inode, umode_t mode)
{
F2FS_I(inode)->i_acl_mode = mode;
@@ -3521,6 +3531,9 @@ void f2fs_exit_sysfs(void);
int f2fs_register_sysfs(struct f2fs_sb_info *sbi);
void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi);
+/* verity.c */
+extern const struct fsverity_operations f2fs_verityops;
+
/*
* crypto support
*/
@@ -3543,7 +3556,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
*/
static inline bool f2fs_post_read_required(struct inode *inode)
{
- return f2fs_encrypted_file(inode);
+ return f2fs_encrypted_file(inode) || fsverity_active(inode);
}
#define F2FS_FEATURE_FUNCS(name, flagname) \
@@ -3561,6 +3574,7 @@ F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR);
F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO);
F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME);
F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND);
+F2FS_FEATURE_FUNCS(verity, VERITY);
F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM);
#ifdef CONFIG_BLK_DEV_ZONED
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3e58a6f697dd..39fffc19e00c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -496,6 +496,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
if (err)
return err;
+ err = fsverity_file_open(inode, filp);
+ if (err)
+ return err;
+
filp->f_mode |= FMODE_NOWAIT;
return dquot_file_open(inode, filp);
@@ -778,6 +782,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
+ err = fsverity_prepare_setattr(dentry, attr);
+ if (err)
+ return err;
+
if (is_quota_modification(inode, attr)) {
err = dquot_initialize(inode);
if (err)
@@ -1705,7 +1713,8 @@ static const struct {
FS_PROJINHERIT_FL | \
FS_ENCRYPT_FL | \
FS_INLINE_DATA_FL | \
- FS_NOCOW_FL)
+ FS_NOCOW_FL | \
+ FS_VERITY_FL)
#define F2FS_SETTABLE_FS_FL ( \
FS_SYNC_FL | \
@@ -1750,6 +1759,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
if (IS_ENCRYPTED(inode))
fsflags |= FS_ENCRYPT_FL;
+ if (IS_VERITY(inode))
+ fsflags |= FS_VERITY_FL;
if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
fsflags |= FS_INLINE_DATA_FL;
if (is_inode_flag_set(inode, FI_PIN_FILE))
@@ -2184,6 +2195,49 @@ out_err:
return err;
}
+static int f2fs_ioc_get_encryption_policy_ex(struct file *filp,
+ unsigned long arg)
+{
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+ return -EOPNOTSUPP;
+
+ return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg);
+}
+
+static int f2fs_ioc_add_encryption_key(struct file *filp, unsigned long arg)
+{
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+ return -EOPNOTSUPP;
+
+ return fscrypt_ioctl_add_key(filp, (void __user *)arg);
+}
+
+static int f2fs_ioc_remove_encryption_key(struct file *filp, unsigned long arg)
+{
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+ return -EOPNOTSUPP;
+
+ return fscrypt_ioctl_remove_key(filp, (void __user *)arg);
+}
+
+static int f2fs_ioc_remove_encryption_key_all_users(struct file *filp,
+ unsigned long arg)
+{
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+ return -EOPNOTSUPP;
+
+ return fscrypt_ioctl_remove_key_all_users(filp, (void __user *)arg);
+}
+
+static int f2fs_ioc_get_encryption_key_status(struct file *filp,
+ unsigned long arg)
+{
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+ return -EOPNOTSUPP;
+
+ return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+}
+
static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -3060,6 +3114,30 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg)
return ret;
}
+static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+
+ if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) {
+ f2fs_warn(F2FS_I_SB(inode),
+ "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem.\n",
+ inode->i_ino);
+ return -EOPNOTSUPP;
+ }
+
+ return fsverity_ioctl_enable(filp, (const void __user *)arg);
+}
+
+static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg)
+{
+ if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp))))
+ return -EOPNOTSUPP;
+
+ return fsverity_ioctl_measure(filp, (void __user *)arg);
+}
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
@@ -3092,6 +3170,16 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_get_encryption_policy(filp, arg);
case F2FS_IOC_GET_ENCRYPTION_PWSALT:
return f2fs_ioc_get_encryption_pwsalt(filp, arg);
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ return f2fs_ioc_get_encryption_policy_ex(filp, arg);
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ return f2fs_ioc_add_encryption_key(filp, arg);
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ return f2fs_ioc_remove_encryption_key(filp, arg);
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ return f2fs_ioc_remove_encryption_key_all_users(filp, arg);
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ return f2fs_ioc_get_encryption_key_status(filp, arg);
case F2FS_IOC_GARBAGE_COLLECT:
return f2fs_ioc_gc(filp, arg);
case F2FS_IOC_GARBAGE_COLLECT_RANGE:
@@ -3118,6 +3206,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_precache_extents(filp, arg);
case F2FS_IOC_RESIZE_FS:
return f2fs_ioc_resize_fs(filp, arg);
+ case FS_IOC_ENABLE_VERITY:
+ return f2fs_ioc_enable_verity(filp, arg);
+ case FS_IOC_MEASURE_VERITY:
+ return f2fs_ioc_measure_verity(filp, arg);
default:
return -ENOTTY;
}
@@ -3219,6 +3311,11 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_SET_ENCRYPTION_POLICY:
case F2FS_IOC_GET_ENCRYPTION_PWSALT:
case F2FS_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
case F2FS_IOC_GARBAGE_COLLECT:
case F2FS_IOC_GARBAGE_COLLECT_RANGE:
case F2FS_IOC_WRITE_CHECKPOINT:
@@ -3232,6 +3329,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_SET_PIN_FILE:
case F2FS_IOC_PRECACHE_EXTENTS:
case F2FS_IOC_RESIZE_FS:
+ case FS_IOC_ENABLE_VERITY:
+ case FS_IOC_MEASURE_VERITY:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index a33d7a849b2d..06da75d418e0 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -46,9 +46,11 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_DIRSYNC;
if (file_is_encrypt(inode))
new_fl |= S_ENCRYPTED;
+ if (file_is_verity(inode))
+ new_fl |= S_VERITY;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
- S_ENCRYPTED);
+ S_ENCRYPTED|S_VERITY);
}
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
@@ -733,6 +735,7 @@ no_delete:
}
out_clear:
fscrypt_put_encryption_info(inode);
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 78a1b873e48a..f43befda0e1a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -913,6 +913,8 @@ static int f2fs_drop_inode(struct inode *inode)
return 0;
}
ret = generic_drop_inode(inode);
+ if (!ret)
+ ret = fscrypt_drop_inode(inode);
trace_f2fs_drop_inode(inode, ret);
return ret;
}
@@ -3144,6 +3146,9 @@ try_onemore:
#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &f2fs_cryptops;
#endif
+#ifdef CONFIG_FS_VERITY
+ sb->s_vop = &f2fs_verityops;
+#endif
sb->s_xattr = f2fs_xattr_handlers;
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 3aeacd0aacfd..0cd64f994068 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -131,6 +131,9 @@ static ssize_t features_show(struct f2fs_attr *a,
if (f2fs_sb_has_lost_found(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "lost_found");
+ if (f2fs_sb_has_verity(sbi))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "verity");
if (f2fs_sb_has_sb_chksum(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "sb_checksum");
@@ -364,6 +367,7 @@ enum feat_id {
FEAT_QUOTA_INO,
FEAT_INODE_CRTIME,
FEAT_LOST_FOUND,
+ FEAT_VERITY,
FEAT_SB_CHECKSUM,
};
@@ -381,6 +385,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a,
case FEAT_QUOTA_INO:
case FEAT_INODE_CRTIME:
case FEAT_LOST_FOUND:
+ case FEAT_VERITY:
case FEAT_SB_CHECKSUM:
return snprintf(buf, PAGE_SIZE, "supported\n");
}
@@ -470,6 +475,9 @@ F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR);
F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO);
F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME);
F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND);
+#ifdef CONFIG_FS_VERITY
+F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
+#endif
F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
@@ -534,6 +542,9 @@ static struct attribute *f2fs_feat_attrs[] = {
ATTR_LIST(quota_ino),
ATTR_LIST(inode_crtime),
ATTR_LIST(lost_found),
+#ifdef CONFIG_FS_VERITY
+ ATTR_LIST(verity),
+#endif
ATTR_LIST(sb_checksum),
NULL,
};
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
new file mode 100644
index 000000000000..a401ef72bc82
--- /dev/null
+++ b/fs/f2fs/verity.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/f2fs/verity.c: fs-verity support for f2fs
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Implementation of fsverity_operations for f2fs.
+ *
+ * Like ext4, f2fs stores the verity metadata (Merkle tree and
+ * fsverity_descriptor) past the end of the file, starting at the first 64K
+ * boundary beyond i_size. This approach works because (a) verity files are
+ * readonly, and (b) pages fully beyond i_size aren't visible to userspace but
+ * can be read/written internally by f2fs with only some relatively small
+ * changes to f2fs. Extended attributes cannot be used because (a) f2fs limits
+ * the total size of an inode's xattr entries to 4096 bytes, which wouldn't be
+ * enough for even a single Merkle tree block, and (b) f2fs encryption doesn't
+ * encrypt xattrs, yet the verity metadata *must* be encrypted when the file is
+ * because it contains hashes of the plaintext data.
+ *
+ * Using a 64K boundary rather than a 4K one keeps things ready for
+ * architectures with 64K pages, and it doesn't necessarily waste space on-disk
+ * since there can be a hole between i_size and the start of the Merkle tree.
+ */
+
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+
+static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode)
+{
+ return round_up(inode->i_size, 65536);
+}
+
+/*
+ * Read some verity metadata from the inode. __vfs_read() can't be used because
+ * we need to read beyond i_size.
+ */
+static int pagecache_read(struct inode *inode, void *buf, size_t count,
+ loff_t pos)
+{
+ while (count) {
+ size_t n = min_t(size_t, count,
+ PAGE_SIZE - offset_in_page(pos));
+ struct page *page;
+ void *addr;
+
+ page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
+ NULL);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ addr = kmap_atomic(page);
+ memcpy(buf, addr + offset_in_page(pos), n);
+ kunmap_atomic(addr);
+
+ put_page(page);
+
+ buf += n;
+ pos += n;
+ count -= n;
+ }
+ return 0;
+}
+
+/*
+ * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
+ * kernel_write() can't be used because the file descriptor is readonly.
+ */
+static int pagecache_write(struct inode *inode, const void *buf, size_t count,
+ loff_t pos)
+{
+ if (pos + count > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+
+ while (count) {
+ size_t n = min_t(size_t, count,
+ PAGE_SIZE - offset_in_page(pos));
+ struct page *page;
+ void *fsdata;
+ void *addr;
+ int res;
+
+ res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
+ &page, &fsdata);
+ if (res)
+ return res;
+
+ addr = kmap_atomic(page);
+ memcpy(addr + offset_in_page(pos), buf, n);
+ kunmap_atomic(addr);
+
+ res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
+ page, fsdata);
+ if (res < 0)
+ return res;
+ if (res != n)
+ return -EIO;
+
+ buf += n;
+ pos += n;
+ count -= n;
+ }
+ return 0;
+}
+
+/*
+ * Format of f2fs verity xattr. This points to the location of the verity
+ * descriptor within the file data rather than containing it directly because
+ * the verity descriptor *must* be encrypted when f2fs encryption is used. But,
+ * f2fs encryption does not encrypt xattrs.
+ */
+struct fsverity_descriptor_location {
+ __le32 version;
+ __le32 size;
+ __le64 pos;
+};
+
+static int f2fs_begin_enable_verity(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ int err;
+
+ if (f2fs_verity_in_progress(inode))
+ return -EBUSY;
+
+ if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ return -EOPNOTSUPP;
+
+ /*
+ * Since the file was opened readonly, we have to initialize the quotas
+ * here and not rely on ->open() doing it. This must be done before
+ * evicting the inline data.
+ */
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+
+ set_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+ return 0;
+}
+
+static int f2fs_end_enable_verity(struct file *filp, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ struct inode *inode = file_inode(filp);
+ u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size;
+ struct fsverity_descriptor_location dloc = {
+ .version = cpu_to_le32(1),
+ .size = cpu_to_le32(desc_size),
+ .pos = cpu_to_le64(desc_pos),
+ };
+ int err = 0;
+
+ if (desc != NULL) {
+ /* Succeeded; write the verity descriptor. */
+ err = pagecache_write(inode, desc, desc_size, desc_pos);
+
+ /* Write all pages before clearing FI_VERITY_IN_PROGRESS. */
+ if (!err)
+ err = filemap_write_and_wait(inode->i_mapping);
+ }
+
+ /* If we failed, truncate anything we wrote past i_size. */
+ if (desc == NULL || err)
+ f2fs_truncate(inode);
+
+ clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+
+ if (desc != NULL && !err) {
+ err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
+ F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
+ NULL, XATTR_CREATE);
+ if (!err) {
+ file_set_verity(inode);
+ f2fs_set_inode_flags(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
+ }
+ }
+ return err;
+}
+
+static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ struct fsverity_descriptor_location dloc;
+ int res;
+ u32 size;
+ u64 pos;
+
+ /* Get the descriptor location */
+ res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_VERITY,
+ F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL);
+ if (res < 0 && res != -ERANGE)
+ return res;
+ if (res != sizeof(dloc) || dloc.version != cpu_to_le32(1)) {
+ f2fs_warn(F2FS_I_SB(inode), "unknown verity xattr format");
+ return -EINVAL;
+ }
+ size = le32_to_cpu(dloc.size);
+ pos = le64_to_cpu(dloc.pos);
+
+ /* Get the descriptor */
+ if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes ||
+ pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) {
+ f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr");
+ return -EFSCORRUPTED;
+ }
+ if (buf_size) {
+ if (size > buf_size)
+ return -ERANGE;
+ res = pagecache_read(inode, buf, size, pos);
+ if (res)
+ return res;
+ }
+ return size;
+}
+
+static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
+ pgoff_t index)
+{
+ index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
+
+ return read_mapping_page(inode->i_mapping, index, NULL);
+}
+
+static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
+ u64 index, int log_blocksize)
+{
+ loff_t pos = f2fs_verity_metadata_pos(inode) + (index << log_blocksize);
+
+ return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+}
+
+const struct fsverity_operations f2fs_verityops = {
+ .begin_enable_verity = f2fs_begin_enable_verity,
+ .end_enable_verity = f2fs_end_enable_verity,
+ .get_verity_descriptor = f2fs_get_verity_descriptor,
+ .read_merkle_tree_page = f2fs_read_merkle_tree_page,
+ .write_merkle_tree_block = f2fs_write_merkle_tree_block,
+};
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index a90920e2f949..de0c600b9cab 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -34,8 +34,10 @@
#define F2FS_XATTR_INDEX_ADVISE 7
/* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */
#define F2FS_XATTR_INDEX_ENCRYPTION 9
+#define F2FS_XATTR_INDEX_VERITY 11
#define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT "c"
+#define F2FS_XATTR_NAME_VERITY "v"
struct f2fs_xattr_header {
__le32 h_magic; /* magic number for identification */
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 542b02d170f8..8aaa7eec7b74 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -36,10 +36,6 @@
*/
#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
-struct wb_completion {
- atomic_t cnt;
-};
-
/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
@@ -61,19 +57,6 @@ struct wb_writeback_work {
};
/*
- * If one wants to wait for one or more wb_writeback_works, each work's
- * ->done should be set to a wb_completion defined using the following
- * macro. Once all work items are issued with wb_queue_work(), the caller
- * can wait for the completion of all using wb_wait_for_completion(). Work
- * items which are waited upon aren't freed automatically on completion.
- */
-#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \
- struct wb_completion cmpl = { \
- .cnt = ATOMIC_INIT(1), \
- }
-
-
-/*
* If an inode is constantly having its pages dirtied, but then the
* updates stop dirtytime_expire_interval seconds in the past, it's
* possible for the worst case time between when an inode has its
@@ -182,7 +165,7 @@ static void finish_writeback_work(struct bdi_writeback *wb,
if (work->auto_free)
kfree(work);
if (done && atomic_dec_and_test(&done->cnt))
- wake_up_all(&wb->bdi->wb_waitq);
+ wake_up_all(done->waitq);
}
static void wb_queue_work(struct bdi_writeback *wb,
@@ -206,28 +189,44 @@ static void wb_queue_work(struct bdi_writeback *wb,
/**
* wb_wait_for_completion - wait for completion of bdi_writeback_works
- * @bdi: bdi work items were issued to
* @done: target wb_completion
*
* Wait for one or more work items issued to @bdi with their ->done field
- * set to @done, which should have been defined with
- * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such
- * work items are completed. Work items which are waited upon aren't freed
+ * set to @done, which should have been initialized with
+ * DEFINE_WB_COMPLETION(). This function returns after all such work items
+ * are completed. Work items which are waited upon aren't freed
* automatically on completion.
*/
-static void wb_wait_for_completion(struct backing_dev_info *bdi,
- struct wb_completion *done)
+void wb_wait_for_completion(struct wb_completion *done)
{
atomic_dec(&done->cnt); /* put down the initial count */
- wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+ wait_event(*done->waitq, !atomic_read(&done->cnt));
}
#ifdef CONFIG_CGROUP_WRITEBACK
-/* parameters for foreign inode detection, see wb_detach_inode() */
+/*
+ * Parameters for foreign inode detection, see wbc_detach_inode() to see
+ * how they're used.
+ *
+ * These paramters are inherently heuristical as the detection target
+ * itself is fuzzy. All we want to do is detaching an inode from the
+ * current owner if it's being written to by some other cgroups too much.
+ *
+ * The current cgroup writeback is built on the assumption that multiple
+ * cgroups writing to the same inode concurrently is very rare and a mode
+ * of operation which isn't well supported. As such, the goal is not
+ * taking too long when a different cgroup takes over an inode while
+ * avoiding too aggressive flip-flops from occasional foreign writes.
+ *
+ * We record, very roughly, 2s worth of IO time history and if more than
+ * half of that is foreign, trigger the switch. The recording is quantized
+ * to 16 slots. To avoid tiny writes from swinging the decision too much,
+ * writes smaller than 1/8 of avg size are ignored.
+ */
#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */
-#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_CUT_DIV 8 /* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */
#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */
@@ -237,6 +236,7 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
/* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
/* one round can affect upto 5 slots */
+#define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */
static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;
@@ -389,6 +389,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
if (unlikely(inode->i_state & I_FREEING))
goto skip_switch;
+ trace_inode_switch_wbs(inode, old_wb, new_wb);
+
/*
* Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
* to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
@@ -489,18 +491,13 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (inode->i_state & I_WB_SWITCH)
return;
- /*
- * Avoid starting new switches while sync_inodes_sb() is in
- * progress. Otherwise, if the down_write protected issue path
- * blocks heavily, we might end up starting a large number of
- * switches which will block on the rwsem.
- */
- if (!down_read_trylock(&bdi->wb_switch_rwsem))
+ /* avoid queueing a new switch if too many are already in flight */
+ if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
return;
isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
if (!isw)
- goto out_unlock;
+ return;
/* find and pin the new wb */
rcu_read_lock();
@@ -534,15 +531,12 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
atomic_inc(&isw_nr_in_flight);
-
- goto out_unlock;
+ return;
out_free:
if (isw->new_wb)
wb_put(isw->new_wb);
kfree(isw);
-out_unlock:
- up_read(&bdi->wb_switch_rwsem);
}
/**
@@ -681,6 +675,9 @@ void wbc_detach_inode(struct writeback_control *wbc)
if (wbc->wb_id != max_id)
history |= (1U << slots) - 1;
+ if (history)
+ trace_inode_foreign_history(inode, wbc, history);
+
/*
* Switch if the current wb isn't the consistent winner.
* If there are multiple closely competing dirtiers, the
@@ -843,7 +840,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
restart:
rcu_read_lock();
list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
- DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+ DEFINE_WB_COMPLETION(fallback_work_done, bdi);
struct wb_writeback_work fallback_work;
struct wb_writeback_work *work;
long nr_pages;
@@ -890,7 +887,7 @@ restart:
last_wb = wb;
rcu_read_unlock();
- wb_wait_for_completion(bdi, &fallback_work_done);
+ wb_wait_for_completion(&fallback_work_done);
goto restart;
}
rcu_read_unlock();
@@ -900,6 +897,89 @@ restart:
}
/**
+ * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
+ * @bdi_id: target bdi id
+ * @memcg_id: target memcg css id
+ * @nr_pages: number of pages to write, 0 for best-effort dirty flushing
+ * @reason: reason why some writeback work initiated
+ * @done: target wb_completion
+ *
+ * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
+ * with the specified parameters.
+ */
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
+ enum wb_reason reason, struct wb_completion *done)
+{
+ struct backing_dev_info *bdi;
+ struct cgroup_subsys_state *memcg_css;
+ struct bdi_writeback *wb;
+ struct wb_writeback_work *work;
+ int ret;
+
+ /* lookup bdi and memcg */
+ bdi = bdi_get_by_id(bdi_id);
+ if (!bdi)
+ return -ENOENT;
+
+ rcu_read_lock();
+ memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
+ if (memcg_css && !css_tryget(memcg_css))
+ memcg_css = NULL;
+ rcu_read_unlock();
+ if (!memcg_css) {
+ ret = -ENOENT;
+ goto out_bdi_put;
+ }
+
+ /*
+ * And find the associated wb. If the wb isn't there already
+ * there's nothing to flush, don't create one.
+ */
+ wb = wb_get_lookup(bdi, memcg_css);
+ if (!wb) {
+ ret = -ENOENT;
+ goto out_css_put;
+ }
+
+ /*
+ * If @nr is zero, the caller is attempting to write out most of
+ * the currently dirty pages. Let's take the current dirty page
+ * count and inflate it by 25% which should be large enough to
+ * flush out most dirty pages while avoiding getting livelocked by
+ * concurrent dirtiers.
+ */
+ if (!nr) {
+ unsigned long filepages, headroom, dirty, writeback;
+
+ mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
+ &writeback);
+ nr = dirty * 10 / 8;
+ }
+
+ /* issue the writeback work */
+ work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
+ if (work) {
+ work->nr_pages = nr;
+ work->sync_mode = WB_SYNC_NONE;
+ work->range_cyclic = 1;
+ work->reason = reason;
+ work->done = done;
+ work->auto_free = 1;
+ wb_queue_work(wb, work);
+ ret = 0;
+ } else {
+ ret = -ENOMEM;
+ }
+
+ wb_put(wb);
+out_css_put:
+ css_put(memcg_css);
+out_bdi_put:
+ bdi_put(bdi);
+ return ret;
+}
+
+/**
* cgroup_writeback_umount - flush inode wb switches for umount
*
* This function is called when a super_block is about to be destroyed and
@@ -2362,7 +2442,8 @@ static void wait_sb_inodes(struct super_block *sb)
static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
enum wb_reason reason, bool skip_if_busy)
{
- DEFINE_WB_COMPLETION_ONSTACK(done);
+ struct backing_dev_info *bdi = sb->s_bdi;
+ DEFINE_WB_COMPLETION(done, bdi);
struct wb_writeback_work work = {
.sb = sb,
.sync_mode = WB_SYNC_NONE,
@@ -2371,14 +2452,13 @@ static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
.nr_pages = nr,
.reason = reason,
};
- struct backing_dev_info *bdi = sb->s_bdi;
if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
- wb_wait_for_completion(bdi, &done);
+ wb_wait_for_completion(&done);
}
/**
@@ -2440,7 +2520,8 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
*/
void sync_inodes_sb(struct super_block *sb)
{
- DEFINE_WB_COMPLETION_ONSTACK(done);
+ struct backing_dev_info *bdi = sb->s_bdi;
+ DEFINE_WB_COMPLETION(done, bdi);
struct wb_writeback_work work = {
.sb = sb,
.sync_mode = WB_SYNC_ALL,
@@ -2450,7 +2531,6 @@ void sync_inodes_sb(struct super_block *sb)
.reason = WB_REASON_SYNC,
.for_sync = 1,
};
- struct backing_dev_info *bdi = sb->s_bdi;
/*
* Can't skip on !bdi_has_dirty() because we should wait for !dirty
@@ -2464,7 +2544,7 @@ void sync_inodes_sb(struct super_block *sb)
/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
bdi_down_write_wb_switch_rwsem(bdi);
bdi_split_work_to_wbs(bdi, &work, false);
- wb_wait_for_completion(bdi, &done);
+ wb_wait_for_completion(&done);
bdi_up_write_wb_switch_rwsem(bdi);
wait_sb_inodes(sb);
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 103643c68e3f..87c2c9687d90 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -279,10 +279,8 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
break;
case FS_CONTEXT_FOR_RECONFIGURE:
- /* We don't pin any namespaces as the superblock's
- * subscriptions cannot be changed at this point.
- */
atomic_inc(&reference->d_sb->s_active);
+ fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
fc->root = dget(reference);
break;
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index cfb48bd088e1..0dadbdbead0f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -75,7 +75,7 @@
#include "internal.h"
-#define IORING_MAX_ENTRIES 4096
+#define IORING_MAX_ENTRIES 32768
#define IORING_MAX_FIXED_FILES 1024
struct io_uring {
@@ -84,27 +84,29 @@ struct io_uring {
};
/*
- * This data is shared with the application through the mmap at offset
- * IORING_OFF_SQ_RING.
+ * This data is shared with the application through the mmap at offsets
+ * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
*
* The offsets to the member fields are published through struct
* io_sqring_offsets when calling io_uring_setup.
*/
-struct io_sq_ring {
+struct io_rings {
/*
* Head and tail offsets into the ring; the offsets need to be
* masked to get valid indices.
*
- * The kernel controls head and the application controls tail.
+ * The kernel controls head of the sq ring and the tail of the cq ring,
+ * and the application controls tail of the sq ring and the head of the
+ * cq ring.
*/
- struct io_uring r;
+ struct io_uring sq, cq;
/*
- * Bitmask to apply to head and tail offsets (constant, equals
+ * Bitmasks to apply to head and tail offsets (constant, equals
* ring_entries - 1)
*/
- u32 ring_mask;
- /* Ring size (constant, power of 2) */
- u32 ring_entries;
+ u32 sq_ring_mask, cq_ring_mask;
+ /* Ring sizes (constant, power of 2) */
+ u32 sq_ring_entries, cq_ring_entries;
/*
* Number of invalid entries dropped by the kernel due to
* invalid index stored in array
@@ -117,7 +119,7 @@ struct io_sq_ring {
* counter includes all submissions that were dropped reaching
* the new SQ head (and possibly more).
*/
- u32 dropped;
+ u32 sq_dropped;
/*
* Runtime flags
*
@@ -127,43 +129,7 @@ struct io_sq_ring {
* The application needs a full memory barrier before checking
* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
*/
- u32 flags;
- /*
- * Ring buffer of indices into array of io_uring_sqe, which is
- * mmapped by the application using the IORING_OFF_SQES offset.
- *
- * This indirection could e.g. be used to assign fixed
- * io_uring_sqe entries to operations and only submit them to
- * the queue when needed.
- *
- * The kernel modifies neither the indices array nor the entries
- * array.
- */
- u32 array[];
-};
-
-/*
- * This data is shared with the application through the mmap at offset
- * IORING_OFF_CQ_RING.
- *
- * The offsets to the member fields are published through struct
- * io_cqring_offsets when calling io_uring_setup.
- */
-struct io_cq_ring {
- /*
- * Head and tail offsets into the ring; the offsets need to be
- * masked to get valid indices.
- *
- * The application controls head and the kernel tail.
- */
- struct io_uring r;
- /*
- * Bitmask to apply to head and tail offsets (constant, equals
- * ring_entries - 1)
- */
- u32 ring_mask;
- /* Ring size (constant, power of 2) */
- u32 ring_entries;
+ u32 sq_flags;
/*
* Number of completion events lost because the queue was full;
* this should be avoided by the application by making sure
@@ -177,7 +143,7 @@ struct io_cq_ring {
* As completion events come in out of order this counter is not
* ordered with any other data.
*/
- u32 overflow;
+ u32 cq_overflow;
/*
* Ring buffer of completion events.
*
@@ -185,7 +151,7 @@ struct io_cq_ring {
* produced, so the application is allowed to modify pending
* entries.
*/
- struct io_uring_cqe cqes[];
+ struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
struct io_mapped_ubuf {
@@ -201,7 +167,7 @@ struct async_list {
struct list_head list;
struct file *file;
- off_t io_end;
+ off_t io_start;
size_t io_len;
};
@@ -215,8 +181,18 @@ struct io_ring_ctx {
bool compat;
bool account_mem;
- /* SQ ring */
- struct io_sq_ring *sq_ring;
+ /*
+ * Ring buffer of indices into array of io_uring_sqe, which is
+ * mmapped by the application using the IORING_OFF_SQES offset.
+ *
+ * This indirection could e.g. be used to assign fixed
+ * io_uring_sqe entries to operations and only submit them to
+ * the queue when needed.
+ *
+ * The kernel modifies neither the indices array nor the entries
+ * array.
+ */
+ u32 *sq_array;
unsigned cached_sq_head;
unsigned sq_entries;
unsigned sq_mask;
@@ -227,15 +203,13 @@ struct io_ring_ctx {
} ____cacheline_aligned_in_smp;
/* IO offload */
- struct workqueue_struct *sqo_wq;
+ struct workqueue_struct *sqo_wq[2];
struct task_struct *sqo_thread; /* if using sq thread polling */
struct mm_struct *sqo_mm;
wait_queue_head_t sqo_wait;
struct completion sqo_thread_started;
struct {
- /* CQ ring */
- struct io_cq_ring *cq_ring;
unsigned cached_cq_tail;
unsigned cq_entries;
unsigned cq_mask;
@@ -244,6 +218,8 @@ struct io_ring_ctx {
struct eventfd_ctx *cq_ev_fd;
} ____cacheline_aligned_in_smp;
+ struct io_rings *rings;
+
/*
* If used, fixed file set. Writers must ensure that ->refs is dead,
* readers must ensure that ->refs is alive as long as the file* is
@@ -288,6 +264,7 @@ struct io_ring_ctx {
struct sqe_submit {
const struct io_uring_sqe *sqe;
unsigned short index;
+ u32 sequence;
bool has_user;
bool needs_lock;
bool needs_fixed_file;
@@ -335,6 +312,7 @@ struct io_kiocb {
#define REQ_F_LINK 64 /* linked sqes */
#define REQ_F_LINK_DONE 128 /* linked sqes done */
#define REQ_F_FAIL_LINK 256 /* fail rest of links */
+#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */
u64 user_data;
u32 result;
u32 sequence;
@@ -366,6 +344,7 @@ struct io_submit_state {
};
static void io_sq_wq_submit_work(struct work_struct *work);
+static void __io_free_req(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
@@ -430,7 +409,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
return false;
- return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped;
+ return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped;
}
static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
@@ -451,11 +430,11 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
- struct io_cq_ring *ring = ctx->cq_ring;
+ struct io_rings *rings = ctx->rings;
- if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
+ if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
/* order cqe stores with ring update */
- smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
+ smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
if (wq_has_sleeper(&ctx->cq_wait)) {
wake_up_interruptible(&ctx->cq_wait);
@@ -464,6 +443,24 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
+static inline void io_queue_async_work(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ int rw;
+
+ switch (req->submit.sqe->opcode) {
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ rw = !(req->rw.ki_flags & IOCB_DIRECT);
+ break;
+ default:
+ rw = 0;
+ break;
+ }
+
+ queue_work(ctx->sqo_wq[rw], &req->work);
+}
+
static void io_commit_cqring(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
@@ -471,14 +468,19 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
__io_commit_cqring(ctx);
while ((req = io_get_deferred_req(ctx)) != NULL) {
+ if (req->flags & REQ_F_SHADOW_DRAIN) {
+ /* Just for drain, free it. */
+ __io_free_req(req);
+ continue;
+ }
req->flags |= REQ_F_IO_DRAINED;
- queue_work(ctx->sqo_wq, &req->work);
+ io_queue_async_work(ctx, req);
}
}
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
- struct io_cq_ring *ring = ctx->cq_ring;
+ struct io_rings *rings = ctx->rings;
unsigned tail;
tail = ctx->cached_cq_tail;
@@ -487,11 +489,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
* control dependency is enough as we're using WRITE_ONCE to
* fill the cq entry
*/
- if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
+ if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
return NULL;
ctx->cached_cq_tail++;
- return &ring->cqes[tail & ctx->cq_mask];
+ return &rings->cqes[tail & ctx->cq_mask];
}
static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
@@ -510,9 +512,9 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, 0);
} else {
- unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
+ unsigned overflow = READ_ONCE(ctx->rings->cq_overflow);
- WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
+ WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1);
}
}
@@ -635,7 +637,7 @@ static void io_req_link_next(struct io_kiocb *req)
nxt->flags |= REQ_F_LINK_DONE;
INIT_WORK(&nxt->work, io_sq_wq_submit_work);
- queue_work(req->ctx->sqo_wq, &nxt->work);
+ io_queue_async_work(req->ctx, nxt);
}
}
@@ -679,11 +681,11 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
-static unsigned io_cqring_events(struct io_cq_ring *ring)
+static unsigned io_cqring_events(struct io_rings *rings)
{
/* See comment at the top of this file */
smp_rmb();
- return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
+ return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
}
/*
@@ -836,7 +838,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
* If we do, we can potentially be spinning for commands that
* already triggered a CQE (eg in error).
*/
- if (io_cqring_events(ctx->cq_ring))
+ if (io_cqring_events(ctx->rings))
break;
/*
@@ -1187,6 +1189,28 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
}
+static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb)
+{
+ if (al->file == kiocb->ki_filp) {
+ off_t start, end;
+
+ /*
+ * Allow merging if we're anywhere in the range of the same
+ * page. Generally this happens for sub-page reads or writes,
+ * and it's beneficial to allow the first worker to bring the
+ * page in and the piggy backed work can then work on the
+ * cached page.
+ */
+ start = al->io_start & PAGE_MASK;
+ end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK;
+ if (kiocb->ki_pos >= start && kiocb->ki_pos <= end)
+ return true;
+ }
+
+ al->file = NULL;
+ return false;
+}
+
/*
* Make a note of the last file/offset/direction we punted to async
* context. We'll use this information to see if we can piggy back a
@@ -1198,9 +1222,8 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
struct async_list *async_list = &req->ctx->pending_async[rw];
struct kiocb *kiocb = &req->rw;
struct file *filp = kiocb->ki_filp;
- off_t io_end = kiocb->ki_pos + len;
- if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
+ if (io_should_merge(async_list, kiocb)) {
unsigned long max_bytes;
/* Use 8x RA size as a decent limiter for both reads/writes */
@@ -1213,17 +1236,16 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
req->flags |= REQ_F_SEQ_PREV;
async_list->io_len += len;
} else {
- io_end = 0;
- async_list->io_len = 0;
+ async_list->file = NULL;
}
}
/* New file? Reset state. */
if (async_list->file != filp) {
- async_list->io_len = 0;
+ async_list->io_start = kiocb->ki_pos;
+ async_list->io_len = len;
async_list->file = filp;
}
- async_list->io_end = io_end;
}
static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
@@ -1535,7 +1557,7 @@ static void io_poll_remove_one(struct io_kiocb *req)
WRITE_ONCE(poll->canceled, true);
if (!list_empty(&poll->wait.entry)) {
list_del_init(&poll->wait.entry);
- queue_work(req->ctx->sqo_wq, &req->work);
+ io_queue_async_work(req->ctx, req);
}
spin_unlock(&poll->head->lock);
@@ -1649,7 +1671,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
io_cqring_ev_posted(ctx);
io_put_req(req);
} else {
- queue_work(ctx->sqo_wq, &req->work);
+ io_queue_async_work(ctx, req);
}
return 1;
@@ -1992,7 +2014,7 @@ out:
*/
static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
{
- bool ret = false;
+ bool ret;
if (!list)
return false;
@@ -2038,10 +2060,14 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
flags = READ_ONCE(s->sqe->flags);
fd = READ_ONCE(s->sqe->fd);
- if (flags & IOSQE_IO_DRAIN) {
+ if (flags & IOSQE_IO_DRAIN)
req->flags |= REQ_F_IO_DRAIN;
- req->sequence = ctx->cached_sq_head - 1;
- }
+ /*
+ * All io need record the previous position, if LINK vs DARIN,
+ * it can be used to mark the position of the first IO in the
+ * link list.
+ */
+ req->sequence = s->sequence;
if (!io_op_needs_file(s->sqe))
return 0;
@@ -2063,21 +2089,12 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
return 0;
}
-static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- struct sqe_submit *s)
+static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ struct sqe_submit *s, bool force_nonblock)
{
int ret;
- ret = io_req_defer(ctx, req, s->sqe);
- if (ret) {
- if (ret != -EIOCBQUEUED) {
- io_free_req(req);
- io_cqring_add_event(ctx, s->sqe->user_data, ret);
- }
- return 0;
- }
-
- ret = __io_submit_sqe(ctx, req, s, true);
+ ret = __io_submit_sqe(ctx, req, s, force_nonblock);
if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
struct io_uring_sqe *sqe_copy;
@@ -2094,7 +2111,7 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (list)
atomic_inc(&list->cnt);
INIT_WORK(&req->work, io_sq_wq_submit_work);
- queue_work(ctx->sqo_wq, &req->work);
+ io_queue_async_work(ctx, req);
}
/*
@@ -2119,10 +2136,70 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
return ret;
}
+static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ struct sqe_submit *s, bool force_nonblock)
+{
+ int ret;
+
+ ret = io_req_defer(ctx, req, s->sqe);
+ if (ret) {
+ if (ret != -EIOCBQUEUED) {
+ io_free_req(req);
+ io_cqring_add_event(ctx, s->sqe->user_data, ret);
+ }
+ return 0;
+ }
+
+ return __io_queue_sqe(ctx, req, s, force_nonblock);
+}
+
+static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ struct sqe_submit *s, struct io_kiocb *shadow,
+ bool force_nonblock)
+{
+ int ret;
+ int need_submit = false;
+
+ if (!shadow)
+ return io_queue_sqe(ctx, req, s, force_nonblock);
+
+ /*
+ * Mark the first IO in link list as DRAIN, let all the following
+ * IOs enter the defer list. all IO needs to be completed before link
+ * list.
+ */
+ req->flags |= REQ_F_IO_DRAIN;
+ ret = io_req_defer(ctx, req, s->sqe);
+ if (ret) {
+ if (ret != -EIOCBQUEUED) {
+ io_free_req(req);
+ io_cqring_add_event(ctx, s->sqe->user_data, ret);
+ return 0;
+ }
+ } else {
+ /*
+ * If ret == 0 means that all IOs in front of link io are
+ * running done. let's queue link head.
+ */
+ need_submit = true;
+ }
+
+ /* Insert shadow req to defer_list, blocking next IOs */
+ spin_lock_irq(&ctx->completion_lock);
+ list_add_tail(&shadow->list, &ctx->defer_list);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ if (need_submit)
+ return __io_queue_sqe(ctx, req, s, force_nonblock);
+
+ return 0;
+}
+
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
- struct io_submit_state *state, struct io_kiocb **link)
+ struct io_submit_state *state, struct io_kiocb **link,
+ bool force_nonblock)
{
struct io_uring_sqe *sqe_copy;
struct io_kiocb *req;
@@ -2175,7 +2252,7 @@ err:
INIT_LIST_HEAD(&req->link_list);
*link = req;
} else {
- io_queue_sqe(ctx, req, s);
+ io_queue_sqe(ctx, req, s, force_nonblock);
}
}
@@ -2205,15 +2282,15 @@ static void io_submit_state_start(struct io_submit_state *state,
static void io_commit_sqring(struct io_ring_ctx *ctx)
{
- struct io_sq_ring *ring = ctx->sq_ring;
+ struct io_rings *rings = ctx->rings;
- if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
+ if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
/*
* Ensure any loads from the SQEs are done at this point,
* since once we write the new head, the application could
* write new data to them.
*/
- smp_store_release(&ring->r.head, ctx->cached_sq_head);
+ smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}
}
@@ -2227,7 +2304,8 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
*/
static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
{
- struct io_sq_ring *ring = ctx->sq_ring;
+ struct io_rings *rings = ctx->rings;
+ u32 *sq_array = ctx->sq_array;
unsigned head;
/*
@@ -2240,20 +2318,21 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
*/
head = ctx->cached_sq_head;
/* make sure SQ entry isn't read before tail */
- if (head == smp_load_acquire(&ring->r.tail))
+ if (head == smp_load_acquire(&rings->sq.tail))
return false;
- head = READ_ONCE(ring->array[head & ctx->sq_mask]);
+ head = READ_ONCE(sq_array[head & ctx->sq_mask]);
if (head < ctx->sq_entries) {
s->index = head;
s->sqe = &ctx->sq_sqes[head];
+ s->sequence = ctx->cached_sq_head;
ctx->cached_sq_head++;
return true;
}
/* drop invalid entries */
ctx->cached_sq_head++;
- ring->dropped++;
+ rings->sq_dropped++;
return false;
}
@@ -2262,6 +2341,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
+ struct io_kiocb *shadow_req = NULL;
bool prev_was_link = false;
int i, submitted = 0;
@@ -2276,11 +2356,21 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
* that's the end of the chain. Submit the previous link.
*/
if (!prev_was_link && link) {
- io_queue_sqe(ctx, link, &link->submit);
+ io_queue_link_head(ctx, link, &link->submit, shadow_req,
+ true);
link = NULL;
}
prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
+ if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) {
+ if (!shadow_req) {
+ shadow_req = io_get_req(ctx, NULL);
+ shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
+ refcount_dec(&shadow_req->refs);
+ }
+ shadow_req->sequence = sqes[i].sequence;
+ }
+
if (unlikely(mm_fault)) {
io_cqring_add_event(ctx, sqes[i].sqe->user_data,
-EFAULT);
@@ -2288,13 +2378,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
sqes[i].has_user = has_user;
sqes[i].needs_lock = true;
sqes[i].needs_fixed_file = true;
- io_submit_sqe(ctx, &sqes[i], statep, &link);
+ io_submit_sqe(ctx, &sqes[i], statep, &link, true);
submitted++;
}
}
if (link)
- io_queue_sqe(ctx, link, &link->submit);
+ io_queue_link_head(ctx, link, &link->submit, shadow_req, true);
if (statep)
io_submit_state_end(&state);
@@ -2366,7 +2456,7 @@ static int io_sq_thread(void *data)
TASK_INTERRUPTIBLE);
/* Tell userspace we may need a wakeup call */
- ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
+ ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
/* make sure to read SQ tail after writing flags */
smp_mb();
@@ -2380,12 +2470,12 @@ static int io_sq_thread(void *data)
schedule();
finish_wait(&ctx->sqo_wait, &wait);
- ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+ ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
continue;
}
finish_wait(&ctx->sqo_wait, &wait);
- ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+ ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
}
i = 0;
@@ -2426,10 +2516,12 @@ static int io_sq_thread(void *data)
return 0;
}
-static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
+static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
+ bool block_for_last)
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
+ struct io_kiocb *shadow_req = NULL;
bool prev_was_link = false;
int i, submit = 0;
@@ -2439,6 +2531,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
}
for (i = 0; i < to_submit; i++) {
+ bool force_nonblock = true;
struct sqe_submit s;
if (!io_get_sqring(ctx, &s))
@@ -2449,21 +2542,43 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
* that's the end of the chain. Submit the previous link.
*/
if (!prev_was_link && link) {
- io_queue_sqe(ctx, link, &link->submit);
+ io_queue_link_head(ctx, link, &link->submit, shadow_req,
+ force_nonblock);
link = NULL;
}
prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
+ if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
+ if (!shadow_req) {
+ shadow_req = io_get_req(ctx, NULL);
+ shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
+ refcount_dec(&shadow_req->refs);
+ }
+ shadow_req->sequence = s.sequence;
+ }
+
s.has_user = true;
s.needs_lock = false;
s.needs_fixed_file = false;
submit++;
- io_submit_sqe(ctx, &s, statep, &link);
+
+ /*
+ * The caller will block for events after submit, submit the
+ * last IO non-blocking. This is either the only IO it's
+ * submitting, or it already submitted the previous ones. This
+ * improves performance by avoiding an async punt that we don't
+ * need to do.
+ */
+ if (block_for_last && submit == to_submit)
+ force_nonblock = false;
+
+ io_submit_sqe(ctx, &s, statep, &link, force_nonblock);
}
io_commit_sqring(ctx);
if (link)
- io_queue_sqe(ctx, link, &link->submit);
+ io_queue_link_head(ctx, link, &link->submit, shadow_req,
+ block_for_last);
if (statep)
io_submit_state_end(statep);
@@ -2477,10 +2592,10 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
const sigset_t __user *sig, size_t sigsz)
{
- struct io_cq_ring *ring = ctx->cq_ring;
+ struct io_rings *rings = ctx->rings;
int ret;
- if (io_cqring_events(ring) >= min_events)
+ if (io_cqring_events(rings) >= min_events)
return 0;
if (sig) {
@@ -2496,12 +2611,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
- ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
+ ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events);
restore_saved_sigmask_unless(ret == -ERESTARTSYS);
if (ret == -ERESTARTSYS)
ret = -EINTR;
- return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
+ return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
@@ -2551,11 +2666,15 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
static void io_finish_async(struct io_ring_ctx *ctx)
{
+ int i;
+
io_sq_thread_stop(ctx);
- if (ctx->sqo_wq) {
- destroy_workqueue(ctx->sqo_wq);
- ctx->sqo_wq = NULL;
+ for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
+ if (ctx->sqo_wq[i]) {
+ destroy_workqueue(ctx->sqo_wq[i]);
+ ctx->sqo_wq[i] = NULL;
+ }
}
}
@@ -2763,16 +2882,31 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
}
/* Do QD, or 2 * CPUS, whatever is smallest */
- ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
+ ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
+ WQ_UNBOUND | WQ_FREEZABLE,
min(ctx->sq_entries - 1, 2 * num_online_cpus()));
- if (!ctx->sqo_wq) {
+ if (!ctx->sqo_wq[0]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ /*
+ * This is for buffered writes, where we want to limit the parallelism
+ * due to file locking in file systems. As "normal" buffered writes
+ * should parellelize on writeout quite nicely, limit us to having 2
+ * pending. This avoids massive contention on the inode when doing
+ * buffered async writes.
+ */
+ ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
+ WQ_UNBOUND | WQ_FREEZABLE, 2);
+ if (!ctx->sqo_wq[1]) {
ret = -ENOMEM;
goto err;
}
return 0;
err:
- io_sq_thread_stop(ctx);
+ io_finish_async(ctx);
mmdrop(ctx->sqo_mm);
ctx->sqo_mm = NULL;
return ret;
@@ -2821,17 +2955,45 @@ static void *io_mem_alloc(size_t size)
return (void *) __get_free_pages(gfp_flags, get_order(size));
}
+static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
+ size_t *sq_offset)
+{
+ struct io_rings *rings;
+ size_t off, sq_array_size;
+
+ off = struct_size(rings, cqes, cq_entries);
+ if (off == SIZE_MAX)
+ return SIZE_MAX;
+
+#ifdef CONFIG_SMP
+ off = ALIGN(off, SMP_CACHE_BYTES);
+ if (off == 0)
+ return SIZE_MAX;
+#endif
+
+ sq_array_size = array_size(sizeof(u32), sq_entries);
+ if (sq_array_size == SIZE_MAX)
+ return SIZE_MAX;
+
+ if (check_add_overflow(off, sq_array_size, &off))
+ return SIZE_MAX;
+
+ if (sq_offset)
+ *sq_offset = off;
+
+ return off;
+}
+
static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
{
- struct io_sq_ring *sq_ring;
- struct io_cq_ring *cq_ring;
- size_t bytes;
+ size_t pages;
- bytes = struct_size(sq_ring, array, sq_entries);
- bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
- bytes += struct_size(cq_ring, cqes, cq_entries);
+ pages = (size_t)1 << get_order(
+ rings_size(sq_entries, cq_entries, NULL));
+ pages += (size_t)1 << get_order(
+ array_size(sizeof(struct io_uring_sqe), sq_entries));
- return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+ return pages;
}
static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
@@ -2845,7 +3007,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
for (j = 0; j < imu->nr_bvecs; j++)
- put_page(imu->bvec[j].bv_page);
+ put_user_page(imu->bvec[j].bv_page);
if (ctx->account_mem)
io_unaccount_mem(ctx->user, imu->nr_bvecs);
@@ -2989,10 +3151,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
* if we did partial map, or found file backed vmas,
* release any pages we did get
*/
- if (pret > 0) {
- for (j = 0; j < pret; j++)
- put_page(pages[j]);
- }
+ if (pret > 0)
+ put_user_pages(pages, pret);
if (ctx->account_mem)
io_unaccount_mem(ctx->user, nr_pages);
kvfree(imu->bvec);
@@ -3078,9 +3238,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
}
#endif
- io_mem_free(ctx->sq_ring);
+ io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
- io_mem_free(ctx->cq_ring);
percpu_ref_exit(&ctx->refs);
if (ctx->account_mem)
@@ -3101,10 +3260,10 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
* io_commit_cqring
*/
smp_rmb();
- if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head !=
- ctx->sq_ring->ring_entries)
+ if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
+ ctx->rings->sq_ring_entries)
mask |= EPOLLOUT | EPOLLWRNORM;
- if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
+ if (READ_ONCE(ctx->rings->sq.head) != ctx->cached_cq_tail)
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
@@ -3149,14 +3308,12 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
switch (offset) {
case IORING_OFF_SQ_RING:
- ptr = ctx->sq_ring;
+ case IORING_OFF_CQ_RING:
+ ptr = ctx->rings;
break;
case IORING_OFF_SQES:
ptr = ctx->sq_sqes;
break;
- case IORING_OFF_CQ_RING:
- ptr = ctx->cq_ring;
- break;
default:
return -EINVAL;
}
@@ -3199,19 +3356,27 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
* Just return the requested submit count, and wake the thread if
* we were asked to.
*/
+ ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sqo_wait);
submitted = to_submit;
- goto out_ctx;
- }
+ } else if (to_submit) {
+ bool block_for_last = false;
- ret = 0;
- if (to_submit) {
to_submit = min(to_submit, ctx->sq_entries);
+ /*
+ * Allow last submission to block in a series, IFF the caller
+ * asked to wait for events and we don't currently have
+ * enough. This potentially avoids an async punt.
+ */
+ if (to_submit == min_complete &&
+ io_cqring_events(ctx->rings) < min_complete)
+ block_for_last = true;
+
mutex_lock(&ctx->uring_lock);
- submitted = io_ring_submit(ctx, to_submit);
+ submitted = io_ring_submit(ctx, to_submit, block_for_last);
mutex_unlock(&ctx->uring_lock);
}
if (flags & IORING_ENTER_GETEVENTS) {
@@ -3226,7 +3391,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
}
}
-out_ctx:
io_ring_drop_ctx_refs(ctx, 1);
out_fput:
fdput(f);
@@ -3243,19 +3407,27 @@ static const struct file_operations io_uring_fops = {
static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
- struct io_sq_ring *sq_ring;
- struct io_cq_ring *cq_ring;
- size_t size;
+ struct io_rings *rings;
+ size_t size, sq_array_offset;
- sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
- if (!sq_ring)
+ size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+
+ rings = io_mem_alloc(size);
+ if (!rings)
return -ENOMEM;
- ctx->sq_ring = sq_ring;
- sq_ring->ring_mask = p->sq_entries - 1;
- sq_ring->ring_entries = p->sq_entries;
- ctx->sq_mask = sq_ring->ring_mask;
- ctx->sq_entries = sq_ring->ring_entries;
+ ctx->rings = rings;
+ ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
+ rings->sq_ring_mask = p->sq_entries - 1;
+ rings->cq_ring_mask = p->cq_entries - 1;
+ rings->sq_ring_entries = p->sq_entries;
+ rings->cq_ring_entries = p->cq_entries;
+ ctx->sq_mask = rings->sq_ring_mask;
+ ctx->cq_mask = rings->cq_ring_mask;
+ ctx->sq_entries = rings->sq_ring_entries;
+ ctx->cq_entries = rings->cq_ring_entries;
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (size == SIZE_MAX)
@@ -3265,15 +3437,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
if (!ctx->sq_sqes)
return -ENOMEM;
- cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
- if (!cq_ring)
- return -ENOMEM;
-
- ctx->cq_ring = cq_ring;
- cq_ring->ring_mask = p->cq_entries - 1;
- cq_ring->ring_entries = p->cq_entries;
- ctx->cq_mask = cq_ring->ring_mask;
- ctx->cq_entries = cq_ring->ring_entries;
return 0;
}
@@ -3377,21 +3540,23 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
goto err;
memset(&p->sq_off, 0, sizeof(p->sq_off));
- p->sq_off.head = offsetof(struct io_sq_ring, r.head);
- p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
- p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
- p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
- p->sq_off.flags = offsetof(struct io_sq_ring, flags);
- p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
- p->sq_off.array = offsetof(struct io_sq_ring, array);
+ p->sq_off.head = offsetof(struct io_rings, sq.head);
+ p->sq_off.tail = offsetof(struct io_rings, sq.tail);
+ p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
+ p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
+ p->sq_off.flags = offsetof(struct io_rings, sq_flags);
+ p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
+ p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
memset(&p->cq_off, 0, sizeof(p->cq_off));
- p->cq_off.head = offsetof(struct io_cq_ring, r.head);
- p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
- p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
- p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
- p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
- p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
+ p->cq_off.head = offsetof(struct io_rings, cq.head);
+ p->cq_off.tail = offsetof(struct io_rings, cq.tail);
+ p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
+ p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
+ p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
+ p->cq_off.cqes = offsetof(struct io_rings, cqes);
+
+ p->features = IORING_FEAT_SINGLE_MMAP;
return ret;
err:
io_ring_ctx_wait_and_kill(ctx);
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 85a9093769a9..35768a63fb1d 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -10,7 +10,7 @@
*
* The following files are helpful:
*
- * Documentation/filesystems/nfs/Exporting
+ * Documentation/filesystems/nfs/exporting.rst
* fs/exportfs/expfs.c.
*/
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
index 22a273bd4648..05cb0e8e4382 100644
--- a/fs/jfs/Kconfig
+++ b/fs/jfs/Kconfig
@@ -5,7 +5,7 @@ config JFS_FS
select CRC32
help
This is a port of IBM's Journaled Filesystem . More information is
- available in the file <file:Documentation/filesystems/jfs.txt>.
+ available in the file <file:Documentation/admin-guide/jfs.rst>.
If you do not intend to use the JFS filesystem, say N.
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index a387534c9577..6ebae6bbe6a5 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -137,6 +137,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
if (kn_from == kn_to)
return strlcpy(buf, "/", buflen);
+ if (!buf)
+ return -EINVAL;
+
common = kernfs_common_ancestor(kn_from, kn_to);
if (WARN_ON(!common))
return -EINVAL;
@@ -144,8 +147,7 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
depth_to = kernfs_depth(common, kn_to);
depth_from = kernfs_depth(common, kn_from);
- if (buf)
- buf[0] = '\0';
+ buf[0] = '\0';
for (i = 0; i < depth_from; i++)
len += strlcpy(buf + len, parent_str,
@@ -430,7 +432,6 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
*/
void kernfs_put_active(struct kernfs_node *kn)
{
- struct kernfs_root *root = kernfs_root(kn);
int v;
if (unlikely(!kn))
@@ -442,7 +443,7 @@ void kernfs_put_active(struct kernfs_node *kn)
if (likely(v != KN_DEACTIVATED_BIAS))
return;
- wake_up_all(&root->deactivate_waitq);
+ wake_up_all(&kernfs_root(kn)->deactivate_waitq);
}
/**
diff --git a/fs/locks.c b/fs/locks.c
index 686eae21daf6..a364ebc5cec3 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1592,7 +1592,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx) {
WARN_ON_ONCE(1);
- return error;
+ goto free_lock;
}
percpu_down_read(&file_rwsem);
@@ -1672,6 +1672,7 @@ out:
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
+free_lock:
locks_free_lock(new_fl);
return error;
}
@@ -2784,10 +2785,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ "
: (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
} else {
- seq_printf(f, "%s ",
- (lease_breaking(fl))
- ? (fl->fl_type == F_UNLCK) ? "UNLCK" : "READ "
- : (fl->fl_type == F_WRLCK) ? "WRITE" : "READ ");
+ int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
+
+ seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
+ (type == F_RDLCK) ? "READ" : "UNLCK");
}
if (inode) {
/* userspace relies on this representation of dev_t */
diff --git a/fs/namei.c b/fs/namei.c
index 209c51a5226c..671c3c1a3425 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -596,14 +596,12 @@ static void terminate_walk(struct nameidata *nd)
path_put(&nd->path);
for (i = 0; i < nd->depth; i++)
path_put(&nd->stack[i].link);
- if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+ if (nd->flags & LOOKUP_ROOT_GRABBED) {
path_put(&nd->root);
- nd->root.mnt = NULL;
+ nd->flags &= ~LOOKUP_ROOT_GRABBED;
}
} else {
nd->flags &= ~LOOKUP_RCU;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
rcu_read_unlock();
}
nd->depth = 0;
@@ -641,6 +639,14 @@ static bool legitimize_links(struct nameidata *nd)
return true;
}
+static bool legitimize_root(struct nameidata *nd)
+{
+ if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT))
+ return true;
+ nd->flags |= LOOKUP_ROOT_GRABBED;
+ return legitimize_path(nd, &nd->root, nd->root_seq);
+}
+
/*
* Path walking has 2 modes, rcu-walk and ref-walk (see
* Documentation/filesystems/path-lookup.txt). In situations when we can't
@@ -671,23 +677,18 @@ static int unlazy_walk(struct nameidata *nd)
nd->flags &= ~LOOKUP_RCU;
if (unlikely(!legitimize_links(nd)))
- goto out2;
- if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
goto out1;
- if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
- if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq)))
- goto out;
- }
+ if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
+ goto out;
+ if (unlikely(!legitimize_root(nd)))
+ goto out;
rcu_read_unlock();
BUG_ON(nd->inode != parent->d_inode);
return 0;
-out2:
+out1:
nd->path.mnt = NULL;
nd->path.dentry = NULL;
-out1:
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
out:
rcu_read_unlock();
return -ECHILD;
@@ -727,23 +728,14 @@ static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned se
*/
if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
goto out;
- if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) {
- rcu_read_unlock();
- dput(dentry);
- goto drop_root_mnt;
- }
+ if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
+ goto out_dput;
/*
* Sequence counts matched. Now make sure that the root is
* still valid and get it if required.
*/
- if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
- if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
- rcu_read_unlock();
- dput(dentry);
- return -ECHILD;
- }
- }
-
+ if (unlikely(!legitimize_root(nd)))
+ goto out_dput;
rcu_read_unlock();
return 0;
@@ -753,9 +745,10 @@ out1:
nd->path.dentry = NULL;
out:
rcu_read_unlock();
-drop_root_mnt:
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
+ return -ECHILD;
+out_dput:
+ rcu_read_unlock();
+ dput(dentry);
return -ECHILD;
}
@@ -819,6 +812,7 @@ static void set_root(struct nameidata *nd)
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_root(fs, &nd->root);
+ nd->flags |= LOOKUP_ROOT_GRABBED;
}
}
@@ -1735,8 +1729,6 @@ static int pick_link(struct nameidata *nd, struct path *link,
nd->flags &= ~LOOKUP_RCU;
nd->path.mnt = NULL;
nd->path.dentry = NULL;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
rcu_read_unlock();
} else if (likely(unlazy_walk(nd)) == 0)
error = nd_alloc_stack(nd);
@@ -2350,7 +2342,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags,
retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
if (likely(!retval))
- audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
+ audit_inode(name, path->dentry, 0);
restore_nameidata();
putname(name);
return retval;
@@ -2391,7 +2383,7 @@ static struct filename *filename_parentat(int dfd, struct filename *name,
if (likely(!retval)) {
*last = nd.last;
*type = nd.last_type;
- audit_inode(name, parent->dentry, LOOKUP_PARENT);
+ audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
} else {
putname(name);
name = ERR_PTR(retval);
@@ -2718,7 +2710,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path,
if (unlikely(error == -ESTALE))
error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
if (likely(!error))
- audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL);
+ audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL);
restore_nameidata();
putname(name);
return error;
@@ -3299,7 +3291,7 @@ static int do_last(struct nameidata *nd,
if (error)
return error;
- audit_inode(nd->name, dir, LOOKUP_PARENT);
+ audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
/* trailing slashes? */
if (unlikely(nd->last.name[nd->last.len]))
return -EISDIR;
diff --git a/fs/namespace.c b/fs/namespace.c
index d28d30b13043..227f7b343034 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1643,13 +1643,18 @@ static inline bool may_mount(void)
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
+#ifdef CONFIG_MANDATORY_FILE_LOCKING
static inline bool may_mandlock(void)
{
-#ifndef CONFIG_MANDATORY_FILE_LOCKING
- return false;
-#endif
return capable(CAP_SYS_ADMIN);
}
+#else
+static inline bool may_mandlock(void)
+{
+ pr_warn("VFS: \"mand\" mount option not supported");
+ return false;
+}
+#endif
/*
* Now umount can handle mount points as well as block devices.
@@ -1675,8 +1680,6 @@ int ksys_umount(char __user *name, int flags)
if (!(flags & UMOUNT_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
- lookup_flags |= LOOKUP_NO_EVAL;
-
retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
if (retval)
goto out;
@@ -3046,7 +3049,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
return -EINVAL;
/* ... and get the mountpoint */
- retval = user_path(dir_name, &path);
+ retval = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
if (retval)
return retval;
@@ -3593,11 +3596,13 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
if (!may_mount())
return -EPERM;
- error = user_path_dir(new_root, &new);
+ error = user_path_at(AT_FDCWD, new_root,
+ LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
if (error)
goto out0;
- error = user_path_dir(put_old, &old);
+ error = user_path_at(AT_FDCWD, put_old,
+ LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
if (error)
goto out1;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c764cfe456e5..2a03bfeec10a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1403,11 +1403,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
return 0;
- /* No fileid? Just exit */
- if (!(fattr->valid & NFS_ATTR_FATTR_FILEID))
- return 0;
+ if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
+ /* Only a mounted-on-fileid? Just exit */
+ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+ return 0;
/* Has the inode gone and changed behind our back? */
- if (nfsi->fileid != fattr->fileid) {
+ } else if (nfsi->fileid != fattr->fileid) {
/* Is this perhaps the mounted-on fileid? */
if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) &&
nfsi->fileid == fattr->mounted_on_fileid)
@@ -1807,11 +1808,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_display_fhandle_hash(NFS_FH(inode)),
atomic_read(&inode->i_count), fattr->valid);
- /* No fileid? Just exit */
- if (!(fattr->valid & NFS_ATTR_FATTR_FILEID))
- return 0;
+ if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
+ /* Only a mounted-on-fileid? Just exit */
+ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+ return 0;
/* Has the inode gone and changed behind our back? */
- if (nfsi->fileid != fattr->fileid) {
+ } else if (nfsi->fileid != fattr->fileid) {
/* Is this perhaps the mounted-on fileid? */
if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) &&
nfsi->fileid == fattr->mounted_on_fileid)
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 976d4089e267..361cc10d6f95 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -207,7 +207,6 @@ TRACE_DEFINE_ENUM(LOOKUP_PARENT);
TRACE_DEFINE_ENUM(LOOKUP_REVAL);
TRACE_DEFINE_ENUM(LOOKUP_RCU);
TRACE_DEFINE_ENUM(LOOKUP_NO_REVAL);
-TRACE_DEFINE_ENUM(LOOKUP_NO_EVAL);
TRACE_DEFINE_ENUM(LOOKUP_OPEN);
TRACE_DEFINE_ENUM(LOOKUP_CREATE);
TRACE_DEFINE_ENUM(LOOKUP_EXCL);
@@ -226,7 +225,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN);
{ LOOKUP_REVAL, "REVAL" }, \
{ LOOKUP_RCU, "RCU" }, \
{ LOOKUP_NO_REVAL, "NO_REVAL" }, \
- { LOOKUP_NO_EVAL, "NO_EVAL" }, \
{ LOOKUP_OPEN, "OPEN" }, \
{ LOOKUP_CREATE, "CREATE" }, \
{ LOOKUP_EXCL, "EXCL" }, \
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3cf4f6aa48d6..2c215171c0eb 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1385,8 +1385,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
static int nfsd_fs_get_tree(struct fs_context *fc)
{
- fc->s_fs_info = get_net(fc->net_ns);
- return vfs_get_super(fc, vfs_get_keyed_super, nfsd_fill_super);
+ return get_tree_keyed(fc, nfsd_fill_super, get_net(fc->net_ns));
}
static void nfsd_fs_free_fc(struct fs_context *fc)
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 960f9a3c012d..a5612abc0936 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -555,7 +555,7 @@ static int orangefs_fsync(struct file *file,
* Change the file pointer position for an instance of an open file.
*
* \note If .llseek is overriden, we must acquire lock as described in
- * Documentation/filesystems/Locking.
+ * Documentation/filesystems/locking.rst.
*
* Future upgrade could support SEEK_DATA and SEEK_HOLE but would
* require much changes to the FS
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 572dd29fbd54..34a6c99fa29b 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -246,7 +246,7 @@ struct orangefs_read_options {
extern struct orangefs_stats orangefs_stats;
/*
- * NOTE: See Documentation/filesystems/porting for information
+ * NOTE: See Documentation/filesystems/porting.rst for information
* on implementing FOO_I and properly accessing fs private data
*/
static inline struct orangefs_inode_s *ORANGEFS_I(struct inode *inode)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 33f72d1b92cc..0b7c8dffc9ae 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -157,8 +157,7 @@ static int proc_get_tree(struct fs_context *fc)
{
struct proc_fs_context *ctx = fc->fs_private;
- fc->s_fs_info = ctx->pid_ns;
- return vfs_get_super(fc, vfs_get_keyed_super, proc_fill_super);
+ return get_tree_keyed(fc, proc_fill_super, ctx->pid_ns);
}
static void proc_fs_context_free(struct fs_context *fc)
diff --git a/fs/super.c b/fs/super.c
index 5960578a4076..9459ba75a32e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -32,6 +32,7 @@
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
#include <linux/cleancache.h>
+#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
@@ -290,6 +291,7 @@ static void __put_super(struct super_block *s)
WARN_ON(s->s_inode_lru.node);
WARN_ON(!list_empty(&s->s_mounts));
security_sb_free(s);
+ fscrypt_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
call_rcu(&s->rcu, destroy_super_rcu);
@@ -1211,7 +1213,18 @@ int get_tree_single(struct fs_context *fc,
}
EXPORT_SYMBOL(get_tree_single);
+int get_tree_keyed(struct fs_context *fc,
+ int (*fill_super)(struct super_block *sb,
+ struct fs_context *fc),
+ void *key)
+{
+ fc->s_fs_info = key;
+ return vfs_get_super(fc, vfs_get_keyed_super, fill_super);
+}
+EXPORT_SYMBOL(get_tree_keyed);
+
#ifdef CONFIG_BLOCK
+
static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
@@ -1221,6 +1234,99 @@ static int set_bdev_super(struct super_block *s, void *data)
return 0;
}
+static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
+{
+ return set_bdev_super(s, fc->sget_key);
+}
+
+static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
+{
+ return s->s_bdev == fc->sget_key;
+}
+
+/**
+ * get_tree_bdev - Get a superblock based on a single block device
+ * @fc: The filesystem context holding the parameters
+ * @fill_super: Helper to initialise a new superblock
+ */
+int get_tree_bdev(struct fs_context *fc,
+ int (*fill_super)(struct super_block *,
+ struct fs_context *))
+{
+ struct block_device *bdev;
+ struct super_block *s;
+ fmode_t mode = FMODE_READ | FMODE_EXCL;
+ int error = 0;
+
+ if (!(fc->sb_flags & SB_RDONLY))
+ mode |= FMODE_WRITE;
+
+ if (!fc->source)
+ return invalf(fc, "No source specified");
+
+ bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type);
+ if (IS_ERR(bdev)) {
+ errorf(fc, "%s: Can't open blockdev", fc->source);
+ return PTR_ERR(bdev);
+ }
+
+ /* Once the superblock is inserted into the list by sget_fc(), s_umount
+ * will protect the lockfs code from trying to start a snapshot while
+ * we are mounting
+ */
+ mutex_lock(&bdev->bd_fsfreeze_mutex);
+ if (bdev->bd_fsfreeze_count > 0) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
+ return -EBUSY;
+ }
+
+ fc->sb_flags |= SB_NOSEC;
+ fc->sget_key = bdev;
+ s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ if (s->s_root) {
+ /* Don't summarily change the RO/RW state. */
+ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
+ warnf(fc, "%pg: Can't mount, would change RO state", bdev);
+ deactivate_locked_super(s);
+ blkdev_put(bdev, mode);
+ return -EBUSY;
+ }
+
+ /*
+ * s_umount nests inside bd_mutex during
+ * __invalidate_device(). blkdev_put() acquires
+ * bd_mutex and can't be called under s_umount. Drop
+ * s_umount temporarily. This is safe as we're
+ * holding an active reference.
+ */
+ up_write(&s->s_umount);
+ blkdev_put(bdev, mode);
+ down_write(&s->s_umount);
+ } else {
+ s->s_mode = mode;
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+ sb_set_blocksize(s, block_size(bdev));
+ error = fill_super(s, fc);
+ if (error) {
+ deactivate_locked_super(s);
+ return error;
+ }
+
+ s->s_flags |= SB_ACTIVE;
+ bdev->bd_super = s;
+ }
+
+ BUG_ON(fc->root);
+ fc->root = dget(s->s_root);
+ return 0;
+}
+EXPORT_SYMBOL(get_tree_bdev);
+
static int test_bdev_super(struct super_block *s, void *data)
{
return (void *)s->s_bdev == data;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 6a6fc8aa1de7..48305ba41e3c 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -471,7 +471,11 @@ static int do_timerfd_settime(int ufd, int flags,
break;
}
spin_unlock_irq(&ctx->wqh.lock);
- cpu_relax();
+
+ if (isalarm(ctx))
+ hrtimer_cancel_wait_running(&ctx->t.alarm.timer);
+ else
+ hrtimer_cancel_wait_running(&ctx->t.tmr);
}
/*
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 034ad14710d1..5dc5abca11c7 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -185,6 +185,21 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_GET_ENCRYPTION_POLICY:
return fscrypt_ioctl_get_policy(file, (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg);
+
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ return fscrypt_ioctl_add_key(file, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ return fscrypt_ioctl_remove_key(file, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ return fscrypt_ioctl_remove_key_all_users(file,
+ (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ return fscrypt_ioctl_get_key_status(file, (void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -202,6 +217,11 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
break;
case FS_IOC_SET_ENCRYPTION_POLICY:
case FS_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 8c1d571334bc..5e1e8ec0589e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -318,6 +318,16 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
return err;
}
+static int ubifs_drop_inode(struct inode *inode)
+{
+ int drop = generic_drop_inode(inode);
+
+ if (!drop)
+ drop = fscrypt_drop_inode(inode);
+
+ return drop;
+}
+
static void ubifs_evict_inode(struct inode *inode)
{
int err;
@@ -1994,6 +2004,7 @@ const struct super_operations ubifs_super_operations = {
.free_inode = ubifs_free_inode,
.put_super = ubifs_put_super,
.write_inode = ubifs_write_inode,
+ .drop_inode = ubifs_drop_inode,
.evict_inode = ubifs_evict_inode,
.statfs = ubifs_statfs,
.dirty_inode = ubifs_dirty_inode,
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index fcb41516ea59..6d30adb6b890 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -9,7 +9,7 @@ config UFS_FS
this file system as well. Saying Y here will allow you to read from
these partitions; if you also want to write to them, say Y to the
experimental "UFS file system write support", below. Please read the
- file <file:Documentation/filesystems/ufs.txt> for more information.
+ file <file:Documentation/admin-guide/ufs.rst> for more information.
The recently released UFS2 variant (used in FreeBSD 5.x) is
READ-ONLY supported.
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
new file mode 100644
index 000000000000..88fb25119899
--- /dev/null
+++ b/fs/verity/Kconfig
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config FS_VERITY
+ bool "FS Verity (read-only file-based authenticity protection)"
+ select CRYPTO
+ # SHA-256 is selected as it's intended to be the default hash algorithm.
+ # To avoid bloat, other wanted algorithms must be selected explicitly.
+ select CRYPTO_SHA256
+ help
+ This option enables fs-verity. fs-verity is the dm-verity
+ mechanism implemented at the file level. On supported
+ filesystems (currently EXT4 and F2FS), userspace can use an
+ ioctl to enable verity for a file, which causes the filesystem
+ to build a Merkle tree for the file. The filesystem will then
+ transparently verify any data read from the file against the
+ Merkle tree. The file is also made read-only.
+
+ This serves as an integrity check, but the availability of the
+ Merkle tree root hash also allows efficiently supporting
+ various use cases where normally the whole file would need to
+ be hashed at once, such as: (a) auditing (logging the file's
+ hash), or (b) authenticity verification (comparing the hash
+ against a known good value, e.g. from a digital signature).
+
+ fs-verity is especially useful on large files where not all
+ the contents may actually be needed. Also, fs-verity verifies
+ data each time it is paged back in, which provides better
+ protection against malicious disks vs. an ahead-of-time hash.
+
+ If unsure, say N.
+
+config FS_VERITY_DEBUG
+ bool "FS Verity debugging"
+ depends on FS_VERITY
+ help
+ Enable debugging messages related to fs-verity by default.
+
+ Say N unless you are an fs-verity developer.
+
+config FS_VERITY_BUILTIN_SIGNATURES
+ bool "FS Verity builtin signature support"
+ depends on FS_VERITY
+ select SYSTEM_DATA_VERIFICATION
+ help
+ Support verifying signatures of verity files against the X.509
+ certificates that have been loaded into the ".fs-verity"
+ kernel keyring.
+
+ This is meant as a relatively simple mechanism that can be
+ used to provide an authenticity guarantee for verity files, as
+ an alternative to IMA appraisal. Userspace programs still
+ need to check that the verity bit is set in order to get an
+ authenticity guarantee.
+
+ If unsure, say N.
diff --git a/fs/verity/Makefile b/fs/verity/Makefile
new file mode 100644
index 000000000000..570e9136334d
--- /dev/null
+++ b/fs/verity/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_FS_VERITY) += enable.o \
+ hash_algs.o \
+ init.o \
+ measure.o \
+ open.o \
+ verify.o
+
+obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
new file mode 100644
index 000000000000..eabc6ac19906
--- /dev/null
+++ b/fs/verity/enable.c
@@ -0,0 +1,377 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/enable.c: ioctl to enable verity on a file
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <crypto/hash.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+
+static int build_merkle_tree_level(struct inode *inode, unsigned int level,
+ u64 num_blocks_to_hash,
+ const struct merkle_tree_params *params,
+ u8 *pending_hashes,
+ struct ahash_request *req)
+{
+ const struct fsverity_operations *vops = inode->i_sb->s_vop;
+ unsigned int pending_size = 0;
+ u64 dst_block_num;
+ u64 i;
+ int err;
+
+ if (WARN_ON(params->block_size != PAGE_SIZE)) /* checked earlier too */
+ return -EINVAL;
+
+ if (level < params->num_levels) {
+ dst_block_num = params->level_start[level];
+ } else {
+ if (WARN_ON(num_blocks_to_hash != 1))
+ return -EINVAL;
+ dst_block_num = 0; /* unused */
+ }
+
+ for (i = 0; i < num_blocks_to_hash; i++) {
+ struct page *src_page;
+
+ if ((pgoff_t)i % 10000 == 0 || i + 1 == num_blocks_to_hash)
+ pr_debug("Hashing block %llu of %llu for level %u\n",
+ i + 1, num_blocks_to_hash, level);
+
+ if (level == 0) {
+ /* Leaf: hashing a data block */
+ src_page = read_mapping_page(inode->i_mapping, i, NULL);
+ if (IS_ERR(src_page)) {
+ err = PTR_ERR(src_page);
+ fsverity_err(inode,
+ "Error %d reading data page %llu",
+ err, i);
+ return err;
+ }
+ } else {
+ /* Non-leaf: hashing hash block from level below */
+ src_page = vops->read_merkle_tree_page(inode,
+ params->level_start[level - 1] + i);
+ if (IS_ERR(src_page)) {
+ err = PTR_ERR(src_page);
+ fsverity_err(inode,
+ "Error %d reading Merkle tree page %llu",
+ err, params->level_start[level - 1] + i);
+ return err;
+ }
+ }
+
+ err = fsverity_hash_page(params, inode, req, src_page,
+ &pending_hashes[pending_size]);
+ put_page(src_page);
+ if (err)
+ return err;
+ pending_size += params->digest_size;
+
+ if (level == params->num_levels) /* Root hash? */
+ return 0;
+
+ if (pending_size + params->digest_size > params->block_size ||
+ i + 1 == num_blocks_to_hash) {
+ /* Flush the pending hash block */
+ memset(&pending_hashes[pending_size], 0,
+ params->block_size - pending_size);
+ err = vops->write_merkle_tree_block(inode,
+ pending_hashes,
+ dst_block_num,
+ params->log_blocksize);
+ if (err) {
+ fsverity_err(inode,
+ "Error %d writing Merkle tree block %llu",
+ err, dst_block_num);
+ return err;
+ }
+ dst_block_num++;
+ pending_size = 0;
+ }
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ cond_resched();
+ }
+ return 0;
+}
+
+/*
+ * Build the Merkle tree for the given inode using the given parameters, and
+ * return the root hash in @root_hash.
+ *
+ * The tree is written to a filesystem-specific location as determined by the
+ * ->write_merkle_tree_block() method. However, the blocks that comprise the
+ * tree are the same for all filesystems.
+ */
+static int build_merkle_tree(struct inode *inode,
+ const struct merkle_tree_params *params,
+ u8 *root_hash)
+{
+ u8 *pending_hashes;
+ struct ahash_request *req;
+ u64 blocks;
+ unsigned int level;
+ int err = -ENOMEM;
+
+ if (inode->i_size == 0) {
+ /* Empty file is a special case; root hash is all 0's */
+ memset(root_hash, 0, params->digest_size);
+ return 0;
+ }
+
+ pending_hashes = kmalloc(params->block_size, GFP_KERNEL);
+ req = ahash_request_alloc(params->hash_alg->tfm, GFP_KERNEL);
+ if (!pending_hashes || !req)
+ goto out;
+
+ /*
+ * Build each level of the Merkle tree, starting at the leaf level
+ * (level 0) and ascending to the root node (level 'num_levels - 1').
+ * Then at the end (level 'num_levels'), calculate the root hash.
+ */
+ blocks = (inode->i_size + params->block_size - 1) >>
+ params->log_blocksize;
+ for (level = 0; level <= params->num_levels; level++) {
+ err = build_merkle_tree_level(inode, level, blocks, params,
+ pending_hashes, req);
+ if (err)
+ goto out;
+ blocks = (blocks + params->hashes_per_block - 1) >>
+ params->log_arity;
+ }
+ memcpy(root_hash, pending_hashes, params->digest_size);
+ err = 0;
+out:
+ kfree(pending_hashes);
+ ahash_request_free(req);
+ return err;
+}
+
+static int enable_verity(struct file *filp,
+ const struct fsverity_enable_arg *arg)
+{
+ struct inode *inode = file_inode(filp);
+ const struct fsverity_operations *vops = inode->i_sb->s_vop;
+ struct merkle_tree_params params = { };
+ struct fsverity_descriptor *desc;
+ size_t desc_size = sizeof(*desc) + arg->sig_size;
+ struct fsverity_info *vi;
+ int err;
+
+ /* Start initializing the fsverity_descriptor */
+ desc = kzalloc(desc_size, GFP_KERNEL);
+ if (!desc)
+ return -ENOMEM;
+ desc->version = 1;
+ desc->hash_algorithm = arg->hash_algorithm;
+ desc->log_blocksize = ilog2(arg->block_size);
+
+ /* Get the salt if the user provided one */
+ if (arg->salt_size &&
+ copy_from_user(desc->salt,
+ (const u8 __user *)(uintptr_t)arg->salt_ptr,
+ arg->salt_size)) {
+ err = -EFAULT;
+ goto out;
+ }
+ desc->salt_size = arg->salt_size;
+
+ /* Get the signature if the user provided one */
+ if (arg->sig_size &&
+ copy_from_user(desc->signature,
+ (const u8 __user *)(uintptr_t)arg->sig_ptr,
+ arg->sig_size)) {
+ err = -EFAULT;
+ goto out;
+ }
+ desc->sig_size = cpu_to_le32(arg->sig_size);
+
+ desc->data_size = cpu_to_le64(inode->i_size);
+
+ /* Prepare the Merkle tree parameters */
+ err = fsverity_init_merkle_tree_params(&params, inode,
+ arg->hash_algorithm,
+ desc->log_blocksize,
+ desc->salt, desc->salt_size);
+ if (err)
+ goto out;
+
+ /*
+ * Start enabling verity on this file, serialized by the inode lock.
+ * Fail if verity is already enabled or is already being enabled.
+ */
+ inode_lock(inode);
+ if (IS_VERITY(inode))
+ err = -EEXIST;
+ else
+ err = vops->begin_enable_verity(filp);
+ inode_unlock(inode);
+ if (err)
+ goto out;
+
+ /*
+ * Build the Merkle tree. Don't hold the inode lock during this, since
+ * on huge files this may take a very long time and we don't want to
+ * force unrelated syscalls like chown() to block forever. We don't
+ * need the inode lock here because deny_write_access() already prevents
+ * the file from being written to or truncated, and we still serialize
+ * ->begin_enable_verity() and ->end_enable_verity() using the inode
+ * lock and only allow one process to be here at a time on a given file.
+ */
+ pr_debug("Building Merkle tree...\n");
+ BUILD_BUG_ON(sizeof(desc->root_hash) < FS_VERITY_MAX_DIGEST_SIZE);
+ err = build_merkle_tree(inode, &params, desc->root_hash);
+ if (err) {
+ fsverity_err(inode, "Error %d building Merkle tree", err);
+ goto rollback;
+ }
+ pr_debug("Done building Merkle tree. Root hash is %s:%*phN\n",
+ params.hash_alg->name, params.digest_size, desc->root_hash);
+
+ /*
+ * Create the fsverity_info. Don't bother trying to save work by
+ * reusing the merkle_tree_params from above. Instead, just create the
+ * fsverity_info from the fsverity_descriptor as if it were just loaded
+ * from disk. This is simpler, and it serves as an extra check that the
+ * metadata we're writing is valid before actually enabling verity.
+ */
+ vi = fsverity_create_info(inode, desc, desc_size);
+ if (IS_ERR(vi)) {
+ err = PTR_ERR(vi);
+ goto rollback;
+ }
+
+ if (arg->sig_size)
+ pr_debug("Storing a %u-byte PKCS#7 signature alongside the file\n",
+ arg->sig_size);
+
+ /*
+ * Tell the filesystem to finish enabling verity on the file.
+ * Serialized with ->begin_enable_verity() by the inode lock.
+ */
+ inode_lock(inode);
+ err = vops->end_enable_verity(filp, desc, desc_size, params.tree_size);
+ inode_unlock(inode);
+ if (err) {
+ fsverity_err(inode, "%ps() failed with err %d",
+ vops->end_enable_verity, err);
+ fsverity_free_info(vi);
+ } else if (WARN_ON(!IS_VERITY(inode))) {
+ err = -EINVAL;
+ fsverity_free_info(vi);
+ } else {
+ /* Successfully enabled verity */
+
+ /*
+ * Readers can start using ->i_verity_info immediately, so it
+ * can't be rolled back once set. So don't set it until just
+ * after the filesystem has successfully enabled verity.
+ */
+ fsverity_set_info(inode, vi);
+ }
+out:
+ kfree(params.hashstate);
+ kfree(desc);
+ return err;
+
+rollback:
+ inode_lock(inode);
+ (void)vops->end_enable_verity(filp, NULL, 0, params.tree_size);
+ inode_unlock(inode);
+ goto out;
+}
+
+/**
+ * fsverity_ioctl_enable() - enable verity on a file
+ *
+ * Enable fs-verity on a file. See the "FS_IOC_ENABLE_VERITY" section of
+ * Documentation/filesystems/fsverity.rst for the documentation.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
+{
+ struct inode *inode = file_inode(filp);
+ struct fsverity_enable_arg arg;
+ int err;
+
+ if (copy_from_user(&arg, uarg, sizeof(arg)))
+ return -EFAULT;
+
+ if (arg.version != 1)
+ return -EINVAL;
+
+ if (arg.__reserved1 ||
+ memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2)))
+ return -EINVAL;
+
+ if (arg.block_size != PAGE_SIZE)
+ return -EINVAL;
+
+ if (arg.salt_size > FIELD_SIZEOF(struct fsverity_descriptor, salt))
+ return -EMSGSIZE;
+
+ if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE)
+ return -EMSGSIZE;
+
+ /*
+ * Require a regular file with write access. But the actual fd must
+ * still be readonly so that we can lock out all writers. This is
+ * needed to guarantee that no writable fds exist to the file once it
+ * has verity enabled, and to stabilize the data being hashed.
+ */
+
+ err = inode_permission(inode, MAY_WRITE);
+ if (err)
+ return err;
+
+ if (IS_APPEND(inode))
+ return -EPERM;
+
+ if (S_ISDIR(inode->i_mode))
+ return -EISDIR;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ err = mnt_want_write_file(filp);
+ if (err) /* -EROFS */
+ return err;
+
+ err = deny_write_access(filp);
+ if (err) /* -ETXTBSY */
+ goto out_drop_write;
+
+ err = enable_verity(filp, &arg);
+ if (err)
+ goto out_allow_write_access;
+
+ /*
+ * Some pages of the file may have been evicted from pagecache after
+ * being used in the Merkle tree construction, then read into pagecache
+ * again by another process reading from the file concurrently. Since
+ * these pages didn't undergo verification against the file measurement
+ * which fs-verity now claims to be enforcing, we have to wipe the
+ * pagecache to ensure that all future reads are verified.
+ */
+ filemap_write_and_wait(inode->i_mapping);
+ invalidate_inode_pages2(inode->i_mapping);
+
+ /*
+ * allow_write_access() is needed to pair with deny_write_access().
+ * Regardless, the filesystem won't allow writing to verity files.
+ */
+out_allow_write_access:
+ allow_write_access(filp);
+out_drop_write:
+ mnt_drop_write_file(filp);
+ return err;
+}
+EXPORT_SYMBOL_GPL(fsverity_ioctl_enable);
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
new file mode 100644
index 000000000000..e74c79b64d88
--- /dev/null
+++ b/fs/verity/fsverity_private.h
@@ -0,0 +1,185 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * fs-verity: read-only file-based authenticity protection
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#ifndef _FSVERITY_PRIVATE_H
+#define _FSVERITY_PRIVATE_H
+
+#ifdef CONFIG_FS_VERITY_DEBUG
+#define DEBUG
+#endif
+
+#define pr_fmt(fmt) "fs-verity: " fmt
+
+#include <crypto/sha.h>
+#include <linux/fsverity.h>
+
+struct ahash_request;
+
+/*
+ * Implementation limit: maximum depth of the Merkle tree. For now 8 is plenty;
+ * it's enough for over U64_MAX bytes of data using SHA-256 and 4K blocks.
+ */
+#define FS_VERITY_MAX_LEVELS 8
+
+/*
+ * Largest digest size among all hash algorithms supported by fs-verity.
+ * Currently assumed to be <= size of fsverity_descriptor::root_hash.
+ */
+#define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE
+
+/* A hash algorithm supported by fs-verity */
+struct fsverity_hash_alg {
+ struct crypto_ahash *tfm; /* hash tfm, allocated on demand */
+ const char *name; /* crypto API name, e.g. sha256 */
+ unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */
+ unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */
+};
+
+/* Merkle tree parameters: hash algorithm, initial hash state, and topology */
+struct merkle_tree_params {
+ const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */
+ const u8 *hashstate; /* initial hash state or NULL */
+ unsigned int digest_size; /* same as hash_alg->digest_size */
+ unsigned int block_size; /* size of data and tree blocks */
+ unsigned int hashes_per_block; /* number of hashes per tree block */
+ unsigned int log_blocksize; /* log2(block_size) */
+ unsigned int log_arity; /* log2(hashes_per_block) */
+ unsigned int num_levels; /* number of levels in Merkle tree */
+ u64 tree_size; /* Merkle tree size in bytes */
+
+ /*
+ * Starting block index for each tree level, ordered from leaf level (0)
+ * to root level ('num_levels - 1')
+ */
+ u64 level_start[FS_VERITY_MAX_LEVELS];
+};
+
+/**
+ * fsverity_info - cached verity metadata for an inode
+ *
+ * When a verity file is first opened, an instance of this struct is allocated
+ * and stored in ->i_verity_info; it remains until the inode is evicted. It
+ * caches information about the Merkle tree that's needed to efficiently verify
+ * data read from the file. It also caches the file measurement. The Merkle
+ * tree pages themselves are not cached here, but the filesystem may cache them.
+ */
+struct fsverity_info {
+ struct merkle_tree_params tree_params;
+ u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE];
+ u8 measurement[FS_VERITY_MAX_DIGEST_SIZE];
+ const struct inode *inode;
+};
+
+/*
+ * Merkle tree properties. The file measurement is the hash of this structure
+ * excluding the signature and with the sig_size field set to 0.
+ */
+struct fsverity_descriptor {
+ __u8 version; /* must be 1 */
+ __u8 hash_algorithm; /* Merkle tree hash algorithm */
+ __u8 log_blocksize; /* log2 of size of data and tree blocks */
+ __u8 salt_size; /* size of salt in bytes; 0 if none */
+ __le32 sig_size; /* size of signature in bytes; 0 if none */
+ __le64 data_size; /* size of file the Merkle tree is built over */
+ __u8 root_hash[64]; /* Merkle tree root hash */
+ __u8 salt[32]; /* salt prepended to each hashed block */
+ __u8 __reserved[144]; /* must be 0's */
+ __u8 signature[]; /* optional PKCS#7 signature */
+};
+
+/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
+#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384
+
+#define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \
+ sizeof(struct fsverity_descriptor))
+
+/*
+ * Format in which verity file measurements are signed. This is the same as
+ * 'struct fsverity_digest', except here some magic bytes are prepended to
+ * provide some context about what is being signed in case the same key is used
+ * for non-fsverity purposes, and here the fields have fixed endianness.
+ */
+struct fsverity_signed_digest {
+ char magic[8]; /* must be "FSVerity" */
+ __le16 digest_algorithm;
+ __le16 digest_size;
+ __u8 digest[];
+};
+
+/* hash_algs.c */
+
+extern struct fsverity_hash_alg fsverity_hash_algs[];
+
+const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
+ unsigned int num);
+const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+ const u8 *salt, size_t salt_size);
+int fsverity_hash_page(const struct merkle_tree_params *params,
+ const struct inode *inode,
+ struct ahash_request *req, struct page *page, u8 *out);
+int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+ const void *data, size_t size, u8 *out);
+void __init fsverity_check_hash_algs(void);
+
+/* init.c */
+
+extern void __printf(3, 4) __cold
+fsverity_msg(const struct inode *inode, const char *level,
+ const char *fmt, ...);
+
+#define fsverity_warn(inode, fmt, ...) \
+ fsverity_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__)
+#define fsverity_err(inode, fmt, ...) \
+ fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__)
+
+/* open.c */
+
+int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
+ const struct inode *inode,
+ unsigned int hash_algorithm,
+ unsigned int log_blocksize,
+ const u8 *salt, size_t salt_size);
+
+struct fsverity_info *fsverity_create_info(const struct inode *inode,
+ void *desc, size_t desc_size);
+
+void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
+
+void fsverity_free_info(struct fsverity_info *vi);
+
+int __init fsverity_init_info_cache(void);
+void __init fsverity_exit_info_cache(void);
+
+/* signature.c */
+
+#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
+int fsverity_verify_signature(const struct fsverity_info *vi,
+ const struct fsverity_descriptor *desc,
+ size_t desc_size);
+
+int __init fsverity_init_signature(void);
+#else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
+static inline int
+fsverity_verify_signature(const struct fsverity_info *vi,
+ const struct fsverity_descriptor *desc,
+ size_t desc_size)
+{
+ return 0;
+}
+
+static inline int fsverity_init_signature(void)
+{
+ return 0;
+}
+#endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
+
+/* verify.c */
+
+int __init fsverity_init_workqueue(void);
+void __init fsverity_exit_workqueue(void);
+
+#endif /* _FSVERITY_PRIVATE_H */
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
new file mode 100644
index 000000000000..31e6d7d2389a
--- /dev/null
+++ b/fs/verity/hash_algs.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/hash_algs.c: fs-verity hash algorithms
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <crypto/hash.h>
+#include <linux/scatterlist.h>
+
+/* The hash algorithms supported by fs-verity */
+struct fsverity_hash_alg fsverity_hash_algs[] = {
+ [FS_VERITY_HASH_ALG_SHA256] = {
+ .name = "sha256",
+ .digest_size = SHA256_DIGEST_SIZE,
+ .block_size = SHA256_BLOCK_SIZE,
+ },
+ [FS_VERITY_HASH_ALG_SHA512] = {
+ .name = "sha512",
+ .digest_size = SHA512_DIGEST_SIZE,
+ .block_size = SHA512_BLOCK_SIZE,
+ },
+};
+
+/**
+ * fsverity_get_hash_alg() - validate and prepare a hash algorithm
+ * @inode: optional inode for logging purposes
+ * @num: the hash algorithm number
+ *
+ * Get the struct fsverity_hash_alg for the given hash algorithm number, and
+ * ensure it has a hash transform ready to go. The hash transforms are
+ * allocated on-demand so that we don't waste resources unnecessarily, and
+ * because the crypto modules may be initialized later than fs/verity/.
+ *
+ * Return: pointer to the hash alg on success, else an ERR_PTR()
+ */
+const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
+ unsigned int num)
+{
+ struct fsverity_hash_alg *alg;
+ struct crypto_ahash *tfm;
+ int err;
+
+ if (num >= ARRAY_SIZE(fsverity_hash_algs) ||
+ !fsverity_hash_algs[num].name) {
+ fsverity_warn(inode, "Unknown hash algorithm number: %u", num);
+ return ERR_PTR(-EINVAL);
+ }
+ alg = &fsverity_hash_algs[num];
+
+ /* pairs with cmpxchg() below */
+ tfm = READ_ONCE(alg->tfm);
+ if (likely(tfm != NULL))
+ return alg;
+ /*
+ * Using the shash API would make things a bit simpler, but the ahash
+ * API is preferable as it allows the use of crypto accelerators.
+ */
+ tfm = crypto_alloc_ahash(alg->name, 0, 0);
+ if (IS_ERR(tfm)) {
+ if (PTR_ERR(tfm) == -ENOENT) {
+ fsverity_warn(inode,
+ "Missing crypto API support for hash algorithm \"%s\"",
+ alg->name);
+ return ERR_PTR(-ENOPKG);
+ }
+ fsverity_err(inode,
+ "Error allocating hash algorithm \"%s\": %ld",
+ alg->name, PTR_ERR(tfm));
+ return ERR_CAST(tfm);
+ }
+
+ err = -EINVAL;
+ if (WARN_ON(alg->digest_size != crypto_ahash_digestsize(tfm)))
+ goto err_free_tfm;
+ if (WARN_ON(alg->block_size != crypto_ahash_blocksize(tfm)))
+ goto err_free_tfm;
+
+ pr_info("%s using implementation \"%s\"\n",
+ alg->name, crypto_ahash_driver_name(tfm));
+
+ /* pairs with READ_ONCE() above */
+ if (cmpxchg(&alg->tfm, NULL, tfm) != NULL)
+ crypto_free_ahash(tfm);
+
+ return alg;
+
+err_free_tfm:
+ crypto_free_ahash(tfm);
+ return ERR_PTR(err);
+}
+
+/**
+ * fsverity_prepare_hash_state() - precompute the initial hash state
+ * @alg: hash algorithm
+ * @salt: a salt which is to be prepended to all data to be hashed
+ * @salt_size: salt size in bytes, possibly 0
+ *
+ * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed
+ * initial hash state on success or an ERR_PTR() on failure.
+ */
+const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+ const u8 *salt, size_t salt_size)
+{
+ u8 *hashstate = NULL;
+ struct ahash_request *req = NULL;
+ u8 *padded_salt = NULL;
+ size_t padded_salt_size;
+ struct scatterlist sg;
+ DECLARE_CRYPTO_WAIT(wait);
+ int err;
+
+ if (salt_size == 0)
+ return NULL;
+
+ hashstate = kmalloc(crypto_ahash_statesize(alg->tfm), GFP_KERNEL);
+ if (!hashstate)
+ return ERR_PTR(-ENOMEM);
+
+ req = ahash_request_alloc(alg->tfm, GFP_KERNEL);
+ if (!req) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ /*
+ * Zero-pad the salt to the next multiple of the input size of the hash
+ * algorithm's compression function, e.g. 64 bytes for SHA-256 or 128
+ * bytes for SHA-512. This ensures that the hash algorithm won't have
+ * any bytes buffered internally after processing the salt, thus making
+ * salted hashing just as fast as unsalted hashing.
+ */
+ padded_salt_size = round_up(salt_size, alg->block_size);
+ padded_salt = kzalloc(padded_salt_size, GFP_KERNEL);
+ if (!padded_salt) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+ memcpy(padded_salt, salt, salt_size);
+
+ sg_init_one(&sg, padded_salt, padded_salt_size);
+ ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+ ahash_request_set_crypt(req, &sg, NULL, padded_salt_size);
+
+ err = crypto_wait_req(crypto_ahash_init(req), &wait);
+ if (err)
+ goto err_free;
+
+ err = crypto_wait_req(crypto_ahash_update(req), &wait);
+ if (err)
+ goto err_free;
+
+ err = crypto_ahash_export(req, hashstate);
+ if (err)
+ goto err_free;
+out:
+ ahash_request_free(req);
+ kfree(padded_salt);
+ return hashstate;
+
+err_free:
+ kfree(hashstate);
+ hashstate = ERR_PTR(err);
+ goto out;
+}
+
+/**
+ * fsverity_hash_page() - hash a single data or hash page
+ * @params: the Merkle tree's parameters
+ * @inode: inode for which the hashing is being done
+ * @req: preallocated hash request
+ * @page: the page to hash
+ * @out: output digest, size 'params->digest_size' bytes
+ *
+ * Hash a single data or hash block, assuming block_size == PAGE_SIZE.
+ * The hash is salted if a salt is specified in the Merkle tree parameters.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_hash_page(const struct merkle_tree_params *params,
+ const struct inode *inode,
+ struct ahash_request *req, struct page *page, u8 *out)
+{
+ struct scatterlist sg;
+ DECLARE_CRYPTO_WAIT(wait);
+ int err;
+
+ if (WARN_ON(params->block_size != PAGE_SIZE))
+ return -EINVAL;
+
+ sg_init_table(&sg, 1);
+ sg_set_page(&sg, page, PAGE_SIZE, 0);
+ ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+ ahash_request_set_crypt(req, &sg, out, PAGE_SIZE);
+
+ if (params->hashstate) {
+ err = crypto_ahash_import(req, params->hashstate);
+ if (err) {
+ fsverity_err(inode,
+ "Error %d importing hash state", err);
+ return err;
+ }
+ err = crypto_ahash_finup(req);
+ } else {
+ err = crypto_ahash_digest(req);
+ }
+
+ err = crypto_wait_req(err, &wait);
+ if (err)
+ fsverity_err(inode, "Error %d computing page hash", err);
+ return err;
+}
+
+/**
+ * fsverity_hash_buffer() - hash some data
+ * @alg: the hash algorithm to use
+ * @data: the data to hash
+ * @size: size of data to hash, in bytes
+ * @out: output digest, size 'alg->digest_size' bytes
+ *
+ * Hash some data which is located in physically contiguous memory (i.e. memory
+ * allocated by kmalloc(), not by vmalloc()). No salt is used.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+ const void *data, size_t size, u8 *out)
+{
+ struct ahash_request *req;
+ struct scatterlist sg;
+ DECLARE_CRYPTO_WAIT(wait);
+ int err;
+
+ req = ahash_request_alloc(alg->tfm, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ sg_init_one(&sg, data, size);
+ ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+ ahash_request_set_crypt(req, &sg, out, size);
+
+ err = crypto_wait_req(crypto_ahash_digest(req), &wait);
+
+ ahash_request_free(req);
+ return err;
+}
+
+void __init fsverity_check_hash_algs(void)
+{
+ size_t i;
+
+ /*
+ * Sanity check the hash algorithms (could be a build-time check, but
+ * they're in an array)
+ */
+ for (i = 0; i < ARRAY_SIZE(fsverity_hash_algs); i++) {
+ const struct fsverity_hash_alg *alg = &fsverity_hash_algs[i];
+
+ if (!alg->name)
+ continue;
+
+ BUG_ON(alg->digest_size > FS_VERITY_MAX_DIGEST_SIZE);
+
+ /*
+ * For efficiency, the implementation currently assumes the
+ * digest and block sizes are powers of 2. This limitation can
+ * be lifted if the code is updated to handle other values.
+ */
+ BUG_ON(!is_power_of_2(alg->digest_size));
+ BUG_ON(!is_power_of_2(alg->block_size));
+ }
+}
diff --git a/fs/verity/init.c b/fs/verity/init.c
new file mode 100644
index 000000000000..94c104e00861
--- /dev/null
+++ b/fs/verity/init.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/init.c: fs-verity module initialization and logging
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/ratelimit.h>
+
+void fsverity_msg(const struct inode *inode, const char *level,
+ const char *fmt, ...)
+{
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ struct va_format vaf;
+ va_list args;
+
+ if (!__ratelimit(&rs))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (inode)
+ printk("%sfs-verity (%s, inode %lu): %pV\n",
+ level, inode->i_sb->s_id, inode->i_ino, &vaf);
+ else
+ printk("%sfs-verity: %pV\n", level, &vaf);
+ va_end(args);
+}
+
+static int __init fsverity_init(void)
+{
+ int err;
+
+ fsverity_check_hash_algs();
+
+ err = fsverity_init_info_cache();
+ if (err)
+ return err;
+
+ err = fsverity_init_workqueue();
+ if (err)
+ goto err_exit_info_cache;
+
+ err = fsverity_init_signature();
+ if (err)
+ goto err_exit_workqueue;
+
+ pr_debug("Initialized fs-verity\n");
+ return 0;
+
+err_exit_workqueue:
+ fsverity_exit_workqueue();
+err_exit_info_cache:
+ fsverity_exit_info_cache();
+ return err;
+}
+late_initcall(fsverity_init)
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
new file mode 100644
index 000000000000..05049b68c745
--- /dev/null
+++ b/fs/verity/measure.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/measure.c: ioctl to get a verity file's measurement
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/uaccess.h>
+
+/**
+ * fsverity_ioctl_measure() - get a verity file's measurement
+ *
+ * Retrieve the file measurement that the kernel is enforcing for reads from a
+ * verity file. See the "FS_IOC_MEASURE_VERITY" section of
+ * Documentation/filesystems/fsverity.rst for the documentation.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_ioctl_measure(struct file *filp, void __user *_uarg)
+{
+ const struct inode *inode = file_inode(filp);
+ struct fsverity_digest __user *uarg = _uarg;
+ const struct fsverity_info *vi;
+ const struct fsverity_hash_alg *hash_alg;
+ struct fsverity_digest arg;
+
+ vi = fsverity_get_info(inode);
+ if (!vi)
+ return -ENODATA; /* not a verity file */
+ hash_alg = vi->tree_params.hash_alg;
+
+ /*
+ * The user specifies the digest_size their buffer has space for; we can
+ * return the digest if it fits in the available space. We write back
+ * the actual size, which may be shorter than the user-specified size.
+ */
+
+ if (get_user(arg.digest_size, &uarg->digest_size))
+ return -EFAULT;
+ if (arg.digest_size < hash_alg->digest_size)
+ return -EOVERFLOW;
+
+ memset(&arg, 0, sizeof(arg));
+ arg.digest_algorithm = hash_alg - fsverity_hash_algs;
+ arg.digest_size = hash_alg->digest_size;
+
+ if (copy_to_user(uarg, &arg, sizeof(arg)))
+ return -EFAULT;
+
+ if (copy_to_user(uarg->digest, vi->measurement, hash_alg->digest_size))
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fsverity_ioctl_measure);
diff --git a/fs/verity/open.c b/fs/verity/open.c
new file mode 100644
index 000000000000..63d1004b688c
--- /dev/null
+++ b/fs/verity/open.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/open.c: opening fs-verity files
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/slab.h>
+
+static struct kmem_cache *fsverity_info_cachep;
+
+/**
+ * fsverity_init_merkle_tree_params() - initialize Merkle tree parameters
+ * @params: the parameters struct to initialize
+ * @inode: the inode for which the Merkle tree is being built
+ * @hash_algorithm: number of hash algorithm to use
+ * @log_blocksize: log base 2 of block size to use
+ * @salt: pointer to salt (optional)
+ * @salt_size: size of salt, possibly 0
+ *
+ * Validate the hash algorithm and block size, then compute the tree topology
+ * (num levels, num blocks in each level, etc.) and initialize @params.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
+ const struct inode *inode,
+ unsigned int hash_algorithm,
+ unsigned int log_blocksize,
+ const u8 *salt, size_t salt_size)
+{
+ const struct fsverity_hash_alg *hash_alg;
+ int err;
+ u64 blocks;
+ u64 offset;
+ int level;
+
+ memset(params, 0, sizeof(*params));
+
+ hash_alg = fsverity_get_hash_alg(inode, hash_algorithm);
+ if (IS_ERR(hash_alg))
+ return PTR_ERR(hash_alg);
+ params->hash_alg = hash_alg;
+ params->digest_size = hash_alg->digest_size;
+
+ params->hashstate = fsverity_prepare_hash_state(hash_alg, salt,
+ salt_size);
+ if (IS_ERR(params->hashstate)) {
+ err = PTR_ERR(params->hashstate);
+ params->hashstate = NULL;
+ fsverity_err(inode, "Error %d preparing hash state", err);
+ goto out_err;
+ }
+
+ if (log_blocksize != PAGE_SHIFT) {
+ fsverity_warn(inode, "Unsupported log_blocksize: %u",
+ log_blocksize);
+ err = -EINVAL;
+ goto out_err;
+ }
+ params->log_blocksize = log_blocksize;
+ params->block_size = 1 << log_blocksize;
+
+ if (WARN_ON(!is_power_of_2(params->digest_size))) {
+ err = -EINVAL;
+ goto out_err;
+ }
+ if (params->block_size < 2 * params->digest_size) {
+ fsverity_warn(inode,
+ "Merkle tree block size (%u) too small for hash algorithm \"%s\"",
+ params->block_size, hash_alg->name);
+ err = -EINVAL;
+ goto out_err;
+ }
+ params->log_arity = params->log_blocksize - ilog2(params->digest_size);
+ params->hashes_per_block = 1 << params->log_arity;
+
+ pr_debug("Merkle tree uses %s with %u-byte blocks (%u hashes/block), salt=%*phN\n",
+ hash_alg->name, params->block_size, params->hashes_per_block,
+ (int)salt_size, salt);
+
+ /*
+ * Compute the number of levels in the Merkle tree and create a map from
+ * level to the starting block of that level. Level 'num_levels - 1' is
+ * the root and is stored first. Level 0 is the level directly "above"
+ * the data blocks and is stored last.
+ */
+
+ /* Compute number of levels and the number of blocks in each level */
+ blocks = (inode->i_size + params->block_size - 1) >> log_blocksize;
+ pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks);
+ while (blocks > 1) {
+ if (params->num_levels >= FS_VERITY_MAX_LEVELS) {
+ fsverity_err(inode, "Too many levels in Merkle tree");
+ err = -EINVAL;
+ goto out_err;
+ }
+ blocks = (blocks + params->hashes_per_block - 1) >>
+ params->log_arity;
+ /* temporarily using level_start[] to store blocks in level */
+ params->level_start[params->num_levels++] = blocks;
+ }
+
+ /* Compute the starting block of each level */
+ offset = 0;
+ for (level = (int)params->num_levels - 1; level >= 0; level--) {
+ blocks = params->level_start[level];
+ params->level_start[level] = offset;
+ pr_debug("Level %d is %llu blocks starting at index %llu\n",
+ level, blocks, offset);
+ offset += blocks;
+ }
+
+ params->tree_size = offset << log_blocksize;
+ return 0;
+
+out_err:
+ kfree(params->hashstate);
+ memset(params, 0, sizeof(*params));
+ return err;
+}
+
+/*
+ * Compute the file measurement by hashing the fsverity_descriptor excluding the
+ * signature and with the sig_size field set to 0.
+ */
+static int compute_file_measurement(const struct fsverity_hash_alg *hash_alg,
+ struct fsverity_descriptor *desc,
+ u8 *measurement)
+{
+ __le32 sig_size = desc->sig_size;
+ int err;
+
+ desc->sig_size = 0;
+ err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement);
+ desc->sig_size = sig_size;
+
+ return err;
+}
+
+/*
+ * Validate the given fsverity_descriptor and create a new fsverity_info from
+ * it. The signature (if present) is also checked.
+ */
+struct fsverity_info *fsverity_create_info(const struct inode *inode,
+ void *_desc, size_t desc_size)
+{
+ struct fsverity_descriptor *desc = _desc;
+ struct fsverity_info *vi;
+ int err;
+
+ if (desc_size < sizeof(*desc)) {
+ fsverity_err(inode, "Unrecognized descriptor size: %zu bytes",
+ desc_size);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (desc->version != 1) {
+ fsverity_err(inode, "Unrecognized descriptor version: %u",
+ desc->version);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
+ fsverity_err(inode, "Reserved bits set in descriptor");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (desc->salt_size > sizeof(desc->salt)) {
+ fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (le64_to_cpu(desc->data_size) != inode->i_size) {
+ fsverity_err(inode,
+ "Wrong data_size: %llu (desc) != %lld (inode)",
+ le64_to_cpu(desc->data_size), inode->i_size);
+ return ERR_PTR(-EINVAL);
+ }
+
+ vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL);
+ if (!vi)
+ return ERR_PTR(-ENOMEM);
+ vi->inode = inode;
+
+ err = fsverity_init_merkle_tree_params(&vi->tree_params, inode,
+ desc->hash_algorithm,
+ desc->log_blocksize,
+ desc->salt, desc->salt_size);
+ if (err) {
+ fsverity_err(inode,
+ "Error %d initializing Merkle tree parameters",
+ err);
+ goto out;
+ }
+
+ memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size);
+
+ err = compute_file_measurement(vi->tree_params.hash_alg, desc,
+ vi->measurement);
+ if (err) {
+ fsverity_err(inode, "Error %d computing file measurement", err);
+ goto out;
+ }
+ pr_debug("Computed file measurement: %s:%*phN\n",
+ vi->tree_params.hash_alg->name,
+ vi->tree_params.digest_size, vi->measurement);
+
+ err = fsverity_verify_signature(vi, desc, desc_size);
+out:
+ if (err) {
+ fsverity_free_info(vi);
+ vi = ERR_PTR(err);
+ }
+ return vi;
+}
+
+void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
+{
+ /*
+ * Multiple processes may race to set ->i_verity_info, so use cmpxchg.
+ * This pairs with the READ_ONCE() in fsverity_get_info().
+ */
+ if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL)
+ fsverity_free_info(vi);
+}
+
+void fsverity_free_info(struct fsverity_info *vi)
+{
+ if (!vi)
+ return;
+ kfree(vi->tree_params.hashstate);
+ kmem_cache_free(fsverity_info_cachep, vi);
+}
+
+/* Ensure the inode has an ->i_verity_info */
+static int ensure_verity_info(struct inode *inode)
+{
+ struct fsverity_info *vi = fsverity_get_info(inode);
+ struct fsverity_descriptor *desc;
+ int res;
+
+ if (vi)
+ return 0;
+
+ res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0);
+ if (res < 0) {
+ fsverity_err(inode,
+ "Error %d getting verity descriptor size", res);
+ return res;
+ }
+ if (res > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+ fsverity_err(inode, "Verity descriptor is too large (%d bytes)",
+ res);
+ return -EMSGSIZE;
+ }
+ desc = kmalloc(res, GFP_KERNEL);
+ if (!desc)
+ return -ENOMEM;
+ res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res);
+ if (res < 0) {
+ fsverity_err(inode, "Error %d reading verity descriptor", res);
+ goto out_free_desc;
+ }
+
+ vi = fsverity_create_info(inode, desc, res);
+ if (IS_ERR(vi)) {
+ res = PTR_ERR(vi);
+ goto out_free_desc;
+ }
+
+ fsverity_set_info(inode, vi);
+ res = 0;
+out_free_desc:
+ kfree(desc);
+ return res;
+}
+
+/**
+ * fsverity_file_open() - prepare to open a verity file
+ * @inode: the inode being opened
+ * @filp: the struct file being set up
+ *
+ * When opening a verity file, deny the open if it is for writing. Otherwise,
+ * set up the inode's ->i_verity_info if not already done.
+ *
+ * When combined with fscrypt, this must be called after fscrypt_file_open().
+ * Otherwise, we won't have the key set up to decrypt the verity metadata.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_file_open(struct inode *inode, struct file *filp)
+{
+ if (!IS_VERITY(inode))
+ return 0;
+
+ if (filp->f_mode & FMODE_WRITE) {
+ pr_debug("Denying opening verity file (ino %lu) for write\n",
+ inode->i_ino);
+ return -EPERM;
+ }
+
+ return ensure_verity_info(inode);
+}
+EXPORT_SYMBOL_GPL(fsverity_file_open);
+
+/**
+ * fsverity_prepare_setattr() - prepare to change a verity inode's attributes
+ * @dentry: dentry through which the inode is being changed
+ * @attr: attributes to change
+ *
+ * Verity files are immutable, so deny truncates. This isn't covered by the
+ * open-time check because sys_truncate() takes a path, not a file descriptor.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ if (IS_VERITY(d_inode(dentry)) && (attr->ia_valid & ATTR_SIZE)) {
+ pr_debug("Denying truncate of verity file (ino %lu)\n",
+ d_inode(dentry)->i_ino);
+ return -EPERM;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fsverity_prepare_setattr);
+
+/**
+ * fsverity_cleanup_inode() - free the inode's verity info, if present
+ *
+ * Filesystems must call this on inode eviction to free ->i_verity_info.
+ */
+void fsverity_cleanup_inode(struct inode *inode)
+{
+ fsverity_free_info(inode->i_verity_info);
+ inode->i_verity_info = NULL;
+}
+EXPORT_SYMBOL_GPL(fsverity_cleanup_inode);
+
+int __init fsverity_init_info_cache(void)
+{
+ fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info,
+ SLAB_RECLAIM_ACCOUNT,
+ measurement);
+ if (!fsverity_info_cachep)
+ return -ENOMEM;
+ return 0;
+}
+
+void __init fsverity_exit_info_cache(void)
+{
+ kmem_cache_destroy(fsverity_info_cachep);
+ fsverity_info_cachep = NULL;
+}
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
new file mode 100644
index 000000000000..c8b255232de5
--- /dev/null
+++ b/fs/verity/signature.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/signature.c: verification of builtin signatures
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/cred.h>
+#include <linux/key.h>
+#include <linux/slab.h>
+#include <linux/verification.h>
+
+/*
+ * /proc/sys/fs/verity/require_signatures
+ * If 1, all verity files must have a valid builtin signature.
+ */
+static int fsverity_require_signatures;
+
+/*
+ * Keyring that contains the trusted X.509 certificates.
+ *
+ * Only root (kuid=0) can modify this. Also, root may use
+ * keyctl_restrict_keyring() to prevent any more additions.
+ */
+static struct key *fsverity_keyring;
+
+/**
+ * fsverity_verify_signature() - check a verity file's signature
+ *
+ * If the file's fs-verity descriptor includes a signature of the file
+ * measurement, verify it against the certificates in the fs-verity keyring.
+ *
+ * Return: 0 on success (signature valid or not required); -errno on failure
+ */
+int fsverity_verify_signature(const struct fsverity_info *vi,
+ const struct fsverity_descriptor *desc,
+ size_t desc_size)
+{
+ const struct inode *inode = vi->inode;
+ const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg;
+ const u32 sig_size = le32_to_cpu(desc->sig_size);
+ struct fsverity_signed_digest *d;
+ int err;
+
+ if (sig_size == 0) {
+ if (fsverity_require_signatures) {
+ fsverity_err(inode,
+ "require_signatures=1, rejecting unsigned file!");
+ return -EPERM;
+ }
+ return 0;
+ }
+
+ if (sig_size > desc_size - sizeof(*desc)) {
+ fsverity_err(inode, "Signature overflows verity descriptor");
+ return -EBADMSG;
+ }
+
+ d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+ memcpy(d->magic, "FSVerity", 8);
+ d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs);
+ d->digest_size = cpu_to_le16(hash_alg->digest_size);
+ memcpy(d->digest, vi->measurement, hash_alg->digest_size);
+
+ err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size,
+ desc->signature, sig_size,
+ fsverity_keyring,
+ VERIFYING_UNSPECIFIED_SIGNATURE,
+ NULL, NULL);
+ kfree(d);
+
+ if (err) {
+ if (err == -ENOKEY)
+ fsverity_err(inode,
+ "File's signing cert isn't in the fs-verity keyring");
+ else if (err == -EKEYREJECTED)
+ fsverity_err(inode, "Incorrect file signature");
+ else if (err == -EBADMSG)
+ fsverity_err(inode, "Malformed file signature");
+ else
+ fsverity_err(inode, "Error %d verifying file signature",
+ err);
+ return err;
+ }
+
+ pr_debug("Valid signature for file measurement %s:%*phN\n",
+ hash_alg->name, hash_alg->digest_size, vi->measurement);
+ return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *fsverity_sysctl_header;
+
+static const struct ctl_path fsverity_sysctl_path[] = {
+ { .procname = "fs", },
+ { .procname = "verity", },
+ { }
+};
+
+static struct ctl_table fsverity_sysctl_table[] = {
+ {
+ .procname = "require_signatures",
+ .data = &fsverity_require_signatures,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ { }
+};
+
+static int __init fsverity_sysctl_init(void)
+{
+ fsverity_sysctl_header = register_sysctl_paths(fsverity_sysctl_path,
+ fsverity_sysctl_table);
+ if (!fsverity_sysctl_header) {
+ pr_err("sysctl registration failed!\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+#else /* !CONFIG_SYSCTL */
+static inline int __init fsverity_sysctl_init(void)
+{
+ return 0;
+}
+#endif /* !CONFIG_SYSCTL */
+
+int __init fsverity_init_signature(void)
+{
+ struct key *ring;
+ int err;
+
+ ring = keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0),
+ current_cred(), KEY_POS_SEARCH |
+ KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE |
+ KEY_USR_SEARCH | KEY_USR_SETATTR,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+ if (IS_ERR(ring))
+ return PTR_ERR(ring);
+
+ err = fsverity_sysctl_init();
+ if (err)
+ goto err_put_ring;
+
+ fsverity_keyring = ring;
+ return 0;
+
+err_put_ring:
+ key_put(ring);
+ return err;
+}
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
new file mode 100644
index 000000000000..3e8f2de44667
--- /dev/null
+++ b/fs/verity/verify.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages()
+ *
+ * Copyright 2019 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <crypto/hash.h>
+#include <linux/bio.h>
+#include <linux/ratelimit.h>
+
+static struct workqueue_struct *fsverity_read_workqueue;
+
+/**
+ * hash_at_level() - compute the location of the block's hash at the given level
+ *
+ * @params: (in) the Merkle tree parameters
+ * @dindex: (in) the index of the data block being verified
+ * @level: (in) the level of hash we want (0 is leaf level)
+ * @hindex: (out) the index of the hash block containing the wanted hash
+ * @hoffset: (out) the byte offset to the wanted hash within the hash block
+ */
+static void hash_at_level(const struct merkle_tree_params *params,
+ pgoff_t dindex, unsigned int level, pgoff_t *hindex,
+ unsigned int *hoffset)
+{
+ pgoff_t position;
+
+ /* Offset of the hash within the level's region, in hashes */
+ position = dindex >> (level * params->log_arity);
+
+ /* Index of the hash block in the tree overall */
+ *hindex = params->level_start[level] + (position >> params->log_arity);
+
+ /* Offset of the wanted hash (in bytes) within the hash block */
+ *hoffset = (position & ((1 << params->log_arity) - 1)) <<
+ (params->log_blocksize - params->log_arity);
+}
+
+/* Extract a hash from a hash page */
+static void extract_hash(struct page *hpage, unsigned int hoffset,
+ unsigned int hsize, u8 *out)
+{
+ void *virt = kmap_atomic(hpage);
+
+ memcpy(out, virt + hoffset, hsize);
+ kunmap_atomic(virt);
+}
+
+static inline int cmp_hashes(const struct fsverity_info *vi,
+ const u8 *want_hash, const u8 *real_hash,
+ pgoff_t index, int level)
+{
+ const unsigned int hsize = vi->tree_params.digest_size;
+
+ if (memcmp(want_hash, real_hash, hsize) == 0)
+ return 0;
+
+ fsverity_err(vi->inode,
+ "FILE CORRUPTED! index=%lu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
+ index, level,
+ vi->tree_params.hash_alg->name, hsize, want_hash,
+ vi->tree_params.hash_alg->name, hsize, real_hash);
+ return -EBADMSG;
+}
+
+/*
+ * Verify a single data page against the file's Merkle tree.
+ *
+ * In principle, we need to verify the entire path to the root node. However,
+ * for efficiency the filesystem may cache the hash pages. Therefore we need
+ * only ascend the tree until an already-verified page is seen, as indicated by
+ * the PageChecked bit being set; then verify the path to that page.
+ *
+ * This code currently only supports the case where the verity block size is
+ * equal to PAGE_SIZE. Doing otherwise would be possible but tricky, since we
+ * wouldn't be able to use the PageChecked bit.
+ *
+ * Note that multiple processes may race to verify a hash page and mark it
+ * Checked, but it doesn't matter; the result will be the same either way.
+ *
+ * Return: true if the page is valid, else false.
+ */
+static bool verify_page(struct inode *inode, const struct fsverity_info *vi,
+ struct ahash_request *req, struct page *data_page)
+{
+ const struct merkle_tree_params *params = &vi->tree_params;
+ const unsigned int hsize = params->digest_size;
+ const pgoff_t index = data_page->index;
+ int level;
+ u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE];
+ const u8 *want_hash;
+ u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
+ struct page *hpages[FS_VERITY_MAX_LEVELS];
+ unsigned int hoffsets[FS_VERITY_MAX_LEVELS];
+ int err;
+
+ if (WARN_ON_ONCE(!PageLocked(data_page) || PageUptodate(data_page)))
+ return false;
+
+ pr_debug_ratelimited("Verifying data page %lu...\n", index);
+
+ /*
+ * Starting at the leaf level, ascend the tree saving hash pages along
+ * the way until we find a verified hash page, indicated by PageChecked;
+ * or until we reach the root.
+ */
+ for (level = 0; level < params->num_levels; level++) {
+ pgoff_t hindex;
+ unsigned int hoffset;
+ struct page *hpage;
+
+ hash_at_level(params, index, level, &hindex, &hoffset);
+
+ pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n",
+ level, hindex, hoffset);
+
+ hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
+ hindex);
+ if (IS_ERR(hpage)) {
+ err = PTR_ERR(hpage);
+ fsverity_err(inode,
+ "Error %d reading Merkle tree page %lu",
+ err, hindex);
+ goto out;
+ }
+
+ if (PageChecked(hpage)) {
+ extract_hash(hpage, hoffset, hsize, _want_hash);
+ want_hash = _want_hash;
+ put_page(hpage);
+ pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n",
+ params->hash_alg->name,
+ hsize, want_hash);
+ goto descend;
+ }
+ pr_debug_ratelimited("Hash page not yet checked\n");
+ hpages[level] = hpage;
+ hoffsets[level] = hoffset;
+ }
+
+ want_hash = vi->root_hash;
+ pr_debug("Want root hash: %s:%*phN\n",
+ params->hash_alg->name, hsize, want_hash);
+descend:
+ /* Descend the tree verifying hash pages */
+ for (; level > 0; level--) {
+ struct page *hpage = hpages[level - 1];
+ unsigned int hoffset = hoffsets[level - 1];
+
+ err = fsverity_hash_page(params, inode, req, hpage, real_hash);
+ if (err)
+ goto out;
+ err = cmp_hashes(vi, want_hash, real_hash, index, level - 1);
+ if (err)
+ goto out;
+ SetPageChecked(hpage);
+ extract_hash(hpage, hoffset, hsize, _want_hash);
+ want_hash = _want_hash;
+ put_page(hpage);
+ pr_debug("Verified hash page at level %d, now want %s:%*phN\n",
+ level - 1, params->hash_alg->name, hsize, want_hash);
+ }
+
+ /* Finally, verify the data page */
+ err = fsverity_hash_page(params, inode, req, data_page, real_hash);
+ if (err)
+ goto out;
+ err = cmp_hashes(vi, want_hash, real_hash, index, -1);
+out:
+ for (; level > 0; level--)
+ put_page(hpages[level - 1]);
+
+ return err == 0;
+}
+
+/**
+ * fsverity_verify_page() - verify a data page
+ *
+ * Verify a page that has just been read from a verity file. The page must be a
+ * pagecache page that is still locked and not yet uptodate.
+ *
+ * Return: true if the page is valid, else false.
+ */
+bool fsverity_verify_page(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ const struct fsverity_info *vi = inode->i_verity_info;
+ struct ahash_request *req;
+ bool valid;
+
+ req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS);
+ if (unlikely(!req))
+ return false;
+
+ valid = verify_page(inode, vi, req, page);
+
+ ahash_request_free(req);
+
+ return valid;
+}
+EXPORT_SYMBOL_GPL(fsverity_verify_page);
+
+#ifdef CONFIG_BLOCK
+/**
+ * fsverity_verify_bio() - verify a 'read' bio that has just completed
+ *
+ * Verify a set of pages that have just been read from a verity file. The pages
+ * must be pagecache pages that are still locked and not yet uptodate. Pages
+ * that fail verification are set to the Error state. Verification is skipped
+ * for pages already in the Error state, e.g. due to fscrypt decryption failure.
+ *
+ * This is a helper function for use by the ->readpages() method of filesystems
+ * that issue bios to read data directly into the page cache. Filesystems that
+ * populate the page cache without issuing bios (e.g. non block-based
+ * filesystems) must instead call fsverity_verify_page() directly on each page.
+ * All filesystems must also call fsverity_verify_page() on holes.
+ */
+void fsverity_verify_bio(struct bio *bio)
+{
+ struct inode *inode = bio_first_page_all(bio)->mapping->host;
+ const struct fsverity_info *vi = inode->i_verity_info;
+ struct ahash_request *req;
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
+
+ req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS);
+ if (unlikely(!req)) {
+ bio_for_each_segment_all(bv, bio, iter_all)
+ SetPageError(bv->bv_page);
+ return;
+ }
+
+ bio_for_each_segment_all(bv, bio, iter_all) {
+ struct page *page = bv->bv_page;
+
+ if (!PageError(page) && !verify_page(inode, vi, req, page))
+ SetPageError(page);
+ }
+
+ ahash_request_free(req);
+}
+EXPORT_SYMBOL_GPL(fsverity_verify_bio);
+#endif /* CONFIG_BLOCK */
+
+/**
+ * fsverity_enqueue_verify_work() - enqueue work on the fs-verity workqueue
+ *
+ * Enqueue verification work for asynchronous processing.
+ */
+void fsverity_enqueue_verify_work(struct work_struct *work)
+{
+ queue_work(fsverity_read_workqueue, work);
+}
+EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work);
+
+int __init fsverity_init_workqueue(void)
+{
+ /*
+ * Use an unbound workqueue to allow bios to be verified in parallel
+ * even when they happen to complete on the same CPU. This sacrifices
+ * locality, but it's worthwhile since hashing is CPU-intensive.
+ *
+ * Also use a high-priority workqueue to prioritize verification work,
+ * which blocks reads from completing, over regular application tasks.
+ */
+ fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue",
+ WQ_UNBOUND | WQ_HIGHPRI,
+ num_online_cpus());
+ if (!fsverity_read_workqueue)
+ return -ENOMEM;
+ return 0;
+}
+
+void __init fsverity_exit_workqueue(void)
+{
+ destroy_workqueue(fsverity_read_workqueue);
+ fsverity_read_workqueue = NULL;
+}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6f7848cd5527..affa557c2337 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -67,7 +67,7 @@ xfs_find_handle(
return -EBADF;
inode = file_inode(f.file);
} else {
- error = user_lpath((const char __user *)hreq->path, &path);
+ error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
if (error)
return error;
inode = d_inode(path.dentry);